# Contents
# 1: Data Overview
# 2: Visualizations
# 3: Text exploration
# 4: TF (Term Frequency)
# 5: Sentiment Analysis
# 6: Conclusion
#Importing libraries
library('ggplot2')
library('ggthemes')
library('scales')
library('grid')
library('gridExtra')
library('corrplot')
library('ggraph')
library('igraph')
library('dplyr')
library('readr')
library('tibble')
library('tidyr')
library('stringr')
library('forcats')
library('tidytext')
library('SnowballC')
library('wordcloud')
library('stringr')
library(devtools)
library(ggpubr)
library(h2o)
# ---- Data overview ----
# Load the variant tables; both paths are relative to the working directory.
# NOTE(review): one path carries a ".csv" extension and the other does not --
# confirm that both match the files on disk.
train <- read_csv(file = "training_variants")
test <- read_csv(file = "test_variants.csv")

# First rows of the training table.
head(train)

# Total number of missing cells in each table.
sum(is.na(train))
sum(is.na(test))

# Column types, followed by per-column summaries, of the training table.
glimpse(train)
summary(train)
# Original note: the training set holds 3321 different IDs covering 264
# different genes and 2996 different variations.

# Rows per Gene, most frequent first.
train %>%
  count(Gene, name = "ct") %>%
  arrange(desc(ct))

# Number of distinct genes observed within each class.
gene_per_class <- aggregate(
  x = train$Gene,
  by = list(train$Class),
  FUN = n_distinct
)
colnames(gene_per_class) <- c("Class", "Genes")
gene_per_class

# Rows per Variation, most frequent first.
train %>%
  count(Variation, name = "ct") %>%
  arrange(desc(ct))

# Number of distinct variations observed within each class.
var_per_class <- aggregate(
  x = train$Variation,
  by = list(train$Class),
  FUN = n_distinct
)
colnames(var_per_class) <- c("Class", "Variation")
var_per_class

# Share of each class in the training data.
# Original note: class 7 has a high presence in the data.
prop.table(table(train$Class))
# ---- Frequency of the most common Gene / Variation values ----
# Plot dimensions, set through the repr.plot.* options.
options(repr.plot.width = 6, repr.plot.height = 4)

# Genes occurring more than 59 times, most frequent first.
# `gene` is reused further down to filter the training data.
gene <- train %>%
  count(Gene, name = "cntg") %>%
  arrange(desc(cntg)) %>%
  filter(cntg > 59)

# Dot plot of those genes, ordered by frequency and drawn horizontally.
ggplot(
  data = gene,
  mapping = aes(x = reorder(Gene, -cntg, FUN = min), y = cntg)
) +
  geom_point(size = 3, color = "darkblue") +
  coord_flip() +
  labs(x = "Gene", y = "Frequency", title = "Frequency Distribution of Genes")

# Variations occurring more than twice, most frequent first.
# `variation` is reused further down to filter the training data.
variation <- train %>%
  count(Variation, name = "cntv") %>%
  arrange(desc(cntv)) %>%
  filter(cntv > 2)

# Same dot plot for the variations.
ggplot(
  data = variation,
  mapping = aes(x = reorder(Variation, -cntv, FUN = min), y = cntv)
) +
  geom_point(size = 3, color = "darkred") +
  coord_flip() +
  labs(
    x = "Variation",
    y = "Frequency",
    title = "Frequency Distribution of Variations"
  )
# ---- Train vs test: most frequent Variations and Genes ----
# Tag every row with the set it comes from, then stack the two tables.
# bind_rows() is used because the goal is to append rows, not to match them:
# the previous full_join() without `by` joined on every shared column
# (including `set`, which never matches) and needed suppressWarnings() to stay
# quiet. Columns present in only one of the tables are filled with NA.
train1 <- train %>% mutate(set = factor("train"))
test1 <- test %>% mutate(set = factor("test"))
joindf <- bind_rows(train1, test1)

# Variations seen more than twice within a set, coloured by set.
# coord_cartesian() zooms without dropping data: x is limited to the first
# positions of the reordered axis, y to frequencies between 0 and 100.
joindf %>%
  count(Variation, set, name = "ct") %>%
  filter(ct > 2) %>%
  ggplot(aes(reorder(Variation, -ct, FUN = median), ct, colour = set)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(0, 100), xlim = c(0, 5)) +
  labs(x = "Variation", y = "Frequency") +
  ggtitle("Top Variations for Train & Test")

# Genes seen more than twice within a set, coloured by set
# (zoomed to the first positions of the x axis and to frequencies 0-200).
joindf %>%
  count(Gene, set, name = "ct") %>%
  filter(ct > 2) %>%
  ggplot(aes(reorder(Gene, -ct, FUN = median), ct, colour = set)) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(0, 200), xlim = c(0, 7)) +
  labs(x = "Gene", y = "Frequency") +
  ggtitle("Most frequent Genes for Train & Test")
# ---- Class distribution ----
# Eight-colour palette; the plots in this script pass hex codes directly
# rather than indexing into it.
cbbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Bar chart of the number of training rows per class.
ggplot(data = train, mapping = aes(x = Class)) +
  geom_bar(fill = "#E69F00") +
  labs(title = "Class distribution")
#We notice that most instances are observed in class 7, while the least frequent instances are in
#classes 8 & 9.
# ---- Relationships between variables ----
# Per-class bar charts (log-scaled counts) of the most frequent genes, i.e.
# the genes retained in `gene`.
ggplot(
  data = filter(train, Gene %in% str_c(gene$Gene)),
  mapping = aes(x = Gene)
) +
  geom_bar(fill = "#009E73") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7)) +
  labs(title = "Gene distribution per class")
# Observations recorded for this plot:
# - Classes 8 & 9 contain none of the most frequent genes.
# - Some genes dominate one class while being almost absent from others.
#   In particular, EGFR is mostly present in classes 2 & 7.
# - PTEN dominates in class 4. TP53 is predominantly present in classes 1 & 4.

# Same layout for the most frequent variations (those retained in `variation`).
ggplot(
  data = filter(train, Variation %in% str_c(variation$Variation)),
  mapping = aes(x = Variation)
) +
  geom_bar(fill = "#CC79A7") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7)) +
  labs(title = "Variation distribution per class")
# Observations recorded for this plot:
# - Truncating mutations are mainly represented in class 1.
# - Fusions dominate in class 2.
# - Class 8 does not contain any of the most frequent variations.
# ---- Classes broken down by Gene ----
# One panel per frequent gene, showing log-scaled row counts per class.
ggplot(
  data = filter(train, Gene %in% str_c(gene$Gene)),
  mapping = aes(x = Class)
) +
  geom_bar(fill = "#000000") +
  facet_wrap(~ Gene) +
  scale_y_log10() +
  labs(title = "Classes per Gene")
# Observations recorded for this plot:
# - ALK is represented in classes 2, 3, 5, 7.
# - BRAF is represented in classes 2, 4, 5, 6, 7.
# - BRCA1 is uniformly distributed among classes 1, 3, 4, 5, 6.
# - BRCA2 is present in classes 2, 4, 5, 7.
# - ERBB2 dominates in classes 2, 4, 5, 6, 7.
# - KIT is shared between 2 classes: 2 & 7.
# - PDGFRA is observed in classes 2, 3, 5, 7.
# - PTEN dominates in class 4 but is also present in 1, 3, 5.
# - TP53 is predominantly present in 1 and 4 but is also observed in 2, 3, 5, 6.

# ---- Classes broken down by Variation ----
# One panel per frequent variation, showing log-scaled row counts per class.
ggplot(
  data = filter(train, Variation %in% str_c(variation$Variation)),
  mapping = aes(x = Class)
) +
  geom_bar(fill = "#CC79A7") +
  facet_wrap(~ Variation) +
  scale_y_log10() +
  labs(title = "Classes per Variation")
# This illustration demonstrates the presence of the most frequent variations
# in the classes.
# ---- Reading in the text files ----
# Each file is read as a character vector with one element per line.
train.txt <- readLines("training_text")
test.txt <- readLines("test_text")

# Split every record into exactly two columns on the first "||" separator
# and store the result as a tibble with columns ID and text (both character).
#
# The first line is skipped (presumably a header -- confirm against the files).
# Negative indexing is used instead of 2:length(x) because the latter counts
# backwards (c(2, 1)) when the file has fewer than two lines, which would
# yield an NA row and re-include line 1.
#
# tibble() replaces the deprecated data_frame().
train.txt <- str_split_fixed(train.txt[-1], "\\|\\|", 2)
train.txt <- tibble(ID = train.txt[, 1], text = train.txt[, 2])

test.txt <- str_split_fixed(test.txt[-1], "\\|\\|", 2)
test.txt <- tibble(ID = test.txt[, 1], text = test.txt[, 2])
#Overview of text data
# Summary of the training texts, including their length in characters.
summary(mutate(train.txt, txt.len = str_length(text)))

# Training records whose text is at most 50 characters long.
train.txt %>%
  transmute(ID, text, txt.len = str_length(text)) %>%
  filter(txt.len <= 50)

# Summary of the test texts, including their length in characters.
summary(mutate(test.txt, txt.len = str_length(text)))

# Test records whose text is at most 100 characters long.
test.txt %>%
  transmute(ID, text, txt.len = str_length(text)) %>%
  filter(txt.len <= 100)
# ---- Occurrences of "pathogenic" and "benign" ----
# Count how many times each word appears in every training text.
train.txt <- train.txt %>%
  mutate(
    pathogenic = str_count(text, "pathogenic"),
    benign = str_count(text, "benign")
  )

# ID -> Class lookup taken from the variants table.
temp <- train %>%
  select(ID, Class)

# ID was parsed from text as character; convert it so it can be joined with
# the ID column of `temp`.
train.txt$ID <- as.numeric(train.txt$ID)

# Per-class distribution of the "pathogenic" counts (log-scaled y axis,
# x axis zoomed to 0-100).
# NOTE: `df` holds the ggplot object (not a data frame) and masks stats::df
# for the rest of the script.
df <- left_join(train.txt, temp, by = "ID") %>%
  ggplot(aes(pathogenic)) +
  geom_bar(width = 3, fill = "#009E73") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  coord_cartesian(xlim = c(0, 100)) +
  ggtitle("Distribution of pathogenic in classes")

# Display the plot. The script previously evaluated a stray `c` here, which
# printed the base function c() and never rendered the figure.
df
# Observations recorded for this plot:
# - "Pathogenic" isn't traced in classes 8 and 9. However, classes 1 and 4
#   have the highest frequency of the word "pathogenic".
# - Other classes show the presence of "pathogenic" in different amounts.
# ---- Ratio of mean "pathogenic" to mean "benign" occurrences, and vice versa ----
# Top panel: mean(pathogenic) / mean(benign) per class, classes ordered by
# decreasing ratio.
sp <- full_join(train.txt, temp, by = "ID") %>%
  group_by(Class) %>%
  summarise(ratio1 = mean(pathogenic) / mean(benign)) %>%
  ggplot(mapping = aes(x = reorder(Class, -ratio1, FUN = max), y = ratio1)) +
  geom_point(colour = "darkblue", size = 5) +
  labs(x = "Class", title = "'Pathogenic'/'Benign' Ratio")

# Bottom panel: the inverse ratio, mean(benign) / mean(pathogenic).
bp <- full_join(train.txt, temp, by = "ID") %>%
  group_by(Class) %>%
  summarise(ratio2 = mean(benign) / mean(pathogenic)) %>%
  ggplot(mapping = aes(x = reorder(Class, -ratio2, FUN = max), y = ratio2)) +
  geom_point(colour = "red", size = 5) +
  labs(x = "Class", title = "'Benign'/'Pathogenic' Ratio")

# Stack both panels in a single column and display the result.
figure <- ggarrange(
  sp,
  bp + font("x.text", size = 10),
  ncol = 1,
  nrow = 2
)
figure
# Observation recorded for this figure: the word 'pathogenic' has the highest
# count in classes 6, 5, 3, 4, 1 and the lowest in classes 2, 9, 7, 8; the
# opposite holds for the word "benign".
# ---- Tokenisation ----
# Break each text into individual tokens with tidytext's unnest_tokens(),
# giving a tidy table with one row per (ID, word).
data1 <- unnest_tokens(select(train.txt, ID, text), word, text)

# tidytext ships a dictionary of stop words (e.g. "and", "next") that we do
# not want in the data; numbers and symbols are removed as well.
data("stop_words")

# 1. drop the stop words,
# 2. reduce every remaining word to its stem,
# 3. keep only the stems that contain at least one lowercase letter.
data2 <- anti_join(data1, stop_words, by = "word")
data2$word <- wordStem(data2$word)
data2 <- filter(data2, str_detect(word, "[a-z]"))
# ---- Term frequency ----
# Horizontal bar chart of the stems occurring more than 50,000 times,
# ordered by their count.
ggplot(
  data = data2 %>%
    count(word) %>%
    filter(n > 5e4) %>%
    mutate(word = reorder(word, n)),
  mapping = aes(x = word, y = n)
) +
  geom_col(fill = "#E69F00") +
  coord_flip() +
  labs(title = "Most frequent word stems")

# Word cloud of the term frequencies. The seed is fixed right before the call
# so that the layout is reproducible.
set.seed(1234)
options(repr.plot.width = 8, repr.plot.height = 6)
with(
  count(data2, word),
  wordcloud(
    word,
    freq = n,
    min.freq = 1,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)
# ---- tf-idf per class ----
# Count every stem within each class, treating a class as one "document".
frequency <- count(full_join(data2, train, by = "ID"), Class, word)
options(repr.plot.width = 6, repr.plot.height = 4)

# bind_tf_idf() adds the tf, idf and tf_idf columns to the per-class counts.
tf_idf <- bind_tf_idf(frequency, term = word, document = Class, n = n)

# Terms with the highest tf-idf across all classes.
# Original observation: class 8 has the highest values for dnmt3b7 and k42a,
# followed by terms such as 2hg and u2af35 (the original note attributes these
# to class 8 as well -- possibly another class was meant; TODO confirm).
# NOTE(review): the value axis is labelled "count" although it shows tf_idf.
tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(n = 15, wt = tf_idf) %>%
  ggplot(mapping = aes(x = word, y = tf_idf, fill = Class)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "count")

# Highest tf-idf terms within each class, one panel per class.
tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(Class) %>%
  top_n(n = 7, wt = tf_idf) %>%
  ggplot(mapping = aes(x = word, y = tf_idf, fill = Class)) +
  geom_col() +
  facet_wrap(~ Class, ncol = 3, scales = "free") +
  coord_flip() +
  labs(y = "count")
# ---- Word proportions: class 7 versus classes 2 and 6 ----
# For every class, compute the share of each stem among all stems of that
# class. The table is then reshaped so that class 7 keeps its own column
# (`7`) while classes 2 and 6 are stacked into the Class / proportion pair.
# ungroup() drops the per-class grouping before reshaping, since it is only
# needed for the proportion computation.
frequency <- full_join(data2, train, by = "ID") %>%
  count(Class, word) %>%
  group_by(Class) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  spread(Class, proportion) %>%
  gather(Class, proportion, `2`, `6`)

# Scatter plot of the class 7 proportion (y) against the class 2 / class 6
# proportion (x), one panel per compared class, both axes on a log scale.
# Points near the dashed diagonal have similar proportions in both classes.
# Original observations:
# - 'mutat' and 'activ' occur most frequently in all 3 classes.
# - Comparing class 7 with class 2, 'alk' and 'acid' occur frequently in both.
#
# Columns are referenced by bare name inside aes(): `frequency$proportion`
# bypasses ggplot2's data masking, triggers a "use of `$` is discouraged"
# warning and can misalign values once the data is split into facets.
image <- ggplot(
  frequency,
  aes(x = proportion, y = `7`, color = abs(`7` - proportion))
) +
  geom_abline(color = "blue", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.1, height = 0.1) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(facets = ~ Class, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Class 7", x = "Class 2 & Class 6")

# Stems missing from one of the compared classes have NA proportions and are
# dropped with a warning while drawing; those warnings are silenced here.
suppressWarnings(print(image))
# ---- Sentiment analysis ----
# Display the AFINN and NRC sentiment lexicons.
get_sentiments("afinn")
get_sentiments("nrc")

# NRC entries tagged with the "sadness" sentiment.
# (Previously stored in a variable named `joy`, which contradicted its
# contents.)
sad_words <- get_sentiments("nrc") %>%
  filter(sentiment == "sadness")

# Sadness words found in the class 8 texts, most frequent first.
# The join key is spelled out so the match on `word` is explicit and no
# "Joining, by = ..." message is emitted.
# NOTE(review): the words in data2 have been stemmed, so only lexicon entries
# that are identical to a stem can match -- confirm this is intended.
# Original observation: 'cancer' has the highest word count.
full_join(data2, train, by = "ID") %>%
  filter(Class == "8") %>%
  inner_join(sad_words, by = "word") %>%
  count(word, sort = TRUE)
# ---- Conclusion ----
# I have presented snippets of my analysis of clinical data. I attempted to identify patterns, relationships and assertions about genetic mutations based on clinical data. This report included the following measures:
# Tf-idf statistic - where I measured the importance of each word in the document. The following set of genes has the highest count: BRCA1, TP53, EGFR, PTEN, BRCA2. Truncating Mutations are the most frequent variation of a gene. Classes 7 and 4 are represented the most in the dataset. However, class 1 has the highest gene count. Variation of genes is predominantly present in this set of classes: 7, 2, 1, 6, 4. The word 'pathogenic' prevails in classes 6, 5, 4, 3, 1, where the following words dominate: brca2, vuss, pht, etc.
# Relationships between words
# Sentiment Analysis - where I studied the emotional intent of words to infer whether a section of text is positive or negative. I also looked at the emotional states such as "angry," "sad," and "happy" of the data.
# Statistical text analysis and an appropriately chosen model will enable us to classify clinical evidence faster, more accurately and efficiently, saving the time of medical professionals and reducing patients' waiting time.