Analysis of Genetic Mutations

*Fragments of analysis

Text Mining

Analyzing text data to find anomalies, patterns and correlations to predict pathogenic gene mutations

Data file

INDEX:

1: Data Overview
2: Visualizations
3: Text exploration
4: TF (Term Frequency)
5: Sentiment Analysis

6: Conclusion

#Importing libraries
library('ggplot2')
library('ggthemes') 
library('scales') 
library('grid') 
library('gridExtra') 
library('corrplot') 
library('ggraph')
library('igraph') 
library('dplyr') 
library('readr') 
library('tibble')
library('tidyr') 
library('stringr')
library('forcats')
library('tidytext') 
library('SnowballC')
library('wordcloud')
library('stringr')
library(devtools)
library(ggpubr)
library(h2o)

Data Overview¶

# Read the variant tables for both splits and preview the training rows
train <- read_csv("training_variants")
test <- read_csv("test_variants.csv")
head(train)

Parsed with column specification:
cols(
  ID = col_double(),
  Gene = col_character(),
  Variation = col_character(),
  Class = col_double()
)
Parsed with column specification:
cols(
  ID = col_double(),
  Gene = col_character(),
  Variation = col_character(),
  Class = col_double()
)

# Total number of missing cells in each variant table
train %>% is.na() %>% sum()
test %>% is.na() %>% sum()

# Column types, then per-column summary statistics, of the training variants
glimpse(train)
summary(train)

Observations: 3,321
Variables: 4
$ ID        <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
$ Gene      <chr> "FAM58A", "CBL", "CBL", "CBL", "CBL", "CBL", "CBL", "CBL"...
$ Variation <chr> "Truncating Mutations", "W802*", "Q249E", "N454D", "L399V...
$ Class     <dbl> 1, 2, 2, 3, 4, 4, 5, 1, 4, 4, 4, 4, 4, 4, 5, 4, 1, 4, 5, ...

       ID           Gene            Variation             Class      
 Min.   :   0   Length:3321        Length:3321        Min.   :1.000  
 1st Qu.: 830   Class :character   Class :character   1st Qu.:2.000  
 Median :1660   Mode  :character   Mode  :character   Median :4.000  
 Mean   :1660                                         Mean   :4.366  
 3rd Qu.:2490                                         3rd Qu.:7.000  
 Max.   :3320                                         Max.   :9.000

# The training set has 3321 IDs covering 264 distinct genes and 2996
# distinct variations.

# Number of training rows per gene, most frequent first
train %>%
  group_by(Gene) %>%
  summarise(ct = n()) %>%
  arrange(desc(ct))

# Number of distinct genes observed in each class
gene_per_class <- aggregate(train$Gene, by = list(train$Class), FUN = n_distinct)
colnames(gene_per_class) <- c("Class", "Genes")
gene_per_class

# Number of training rows per variation, most frequent first
train %>%
  group_by(Variation) %>%
  summarise(ct = n()) %>%
  arrange(desc(ct))

# Number of distinct variations observed in each class
var_per_class <- aggregate(train$Variation, by = list(train$Class), FUN = n_distinct)
colnames(var_per_class) <- c("Class", "Variation")
var_per_class

# Share of each class in the training set.
# Class 7 is the largest (about 29% of rows).
prop.table(table(train$Class))

          1           2           3           4           5           6 
0.171032821 0.136103583 0.026799157 0.206564288 0.072869618 0.082806384 
          7           8           9 
0.286961759 0.005721168 0.011141223

Visualizations¶

# Genes with at least 60 training rows, ordered from most to least frequent
options(repr.plot.width = 6, repr.plot.height = 4)
gene <- train %>%
  group_by(Gene) %>%
  summarise(cntg = n()) %>%
  filter(cntg > 59) %>%
  arrange(desc(cntg))
ggplot(gene, aes(reorder(Gene, -cntg, FUN = min), cntg)) +
  geom_point(size = 3, color = "darkblue") +
  coord_flip() +
  labs(title = "Frequency Distribution of Genes", x = "Gene", y = "Frequency")

# Variations that occur at least 3 times, ordered from most to least frequent
variation <- train %>%
  group_by(Variation) %>%
  summarise(cntv = n()) %>%
  filter(cntv > 2) %>%
  arrange(desc(cntv))

ggplot(variation, aes(reorder(Variation, -cntv, FUN = min), cntv)) +
  geom_point(size = 3, color = "darkred") +
  coord_flip() +
  labs(title = "Frequency Distribution of Variations",
       x = "Variation", y = "Frequency")

# Tag every row with the split it came from, then stack both splits.
# The join uses all shared columns (including `set`), so no rows are merged.
train1 <- mutate(train, set = factor("train"))
test1 <- mutate(test, set = factor("test"))
joindf <- suppressWarnings(full_join(train1, test1))

# Variations seen more than twice within a split, train vs test
joindf %>%
  group_by(Variation, set) %>%
  summarise(ct = n()) %>%
  filter(ct > 2) %>%
  ggplot(aes(reorder(Variation, -ct, FUN = median), ct, colour = set)) +
  geom_point(size = 3) +
  # Zoom in on the leading variations without dropping any data
  coord_cartesian(ylim = c(0, 100), xlim = c(0, 5)) +
  labs(title = "Top Variations for Train & Test",
       x = "Variation", y = "Frequency")

Joining, by = c("ID", "Gene", "Variation", "Class", "set")

# Genes seen more than twice within a split, train vs test
joindf %>%
  group_by(Gene, set) %>%
  summarise(ct = n()) %>%
  filter(ct > 2) %>%
  ggplot(aes(reorder(Gene, -ct, FUN = median), ct, colour = set)) +
  geom_point(size = 3) +
  # Zoom in on the leading genes without dropping any data
  coord_cartesian(ylim = c(0, 200), xlim = c(0, 7)) +
  labs(title = "Most frequent Genes for Train & Test",
       x = "Gene", y = "Frequency")

# Class distribution
# Palette of hex colours; the fill colours used in this notebook's plots are
# taken from it (presumably a colour-blind-friendly palette -- TODO confirm).
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
train %>%
  ggplot(aes(Class)) +
  geom_bar(fill = "#E69F00") +
  ggtitle("Class distribution")
# Most instances fall in class 7, while the least frequent classes are
# 8 and 9.

# Relationships between variables:
# how often each of the most frequent genes appears within every class
train %>%
  filter(Gene %in% gene$Gene) %>%
  ggplot(aes(Gene)) +
  geom_bar(fill = "#009E73") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7)) +
  labs(title = "Gene distribution per class")
# Observations from the plot:
# - Classes 8 and 9 contain none of the most frequent genes.
# - Some genes dominate one class while being almost absent in others;
#   in particular, EGFR is mostly present in classes 2 and 7.
# - PTEN dominates in class 4. TP53 is predominantly present in classes 1 and 4.

# How often each of the most frequent variations appears within every class
train %>%
  filter(Variation %in% variation$Variation) %>%
  ggplot(aes(Variation)) +
  geom_bar(fill = "#CC79A7") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7)) +
  labs(title = "Variation distribution per class")
# Observations from the plot:
# - Truncating mutations are mainly represented in class 1.
# - Fusions dominate in class 2.
# - Class 8 does not contain any of the most frequent variations.

# Class counts for each of the most frequent genes, one panel per gene
train %>%
  filter(Gene %in% gene$Gene) %>%
  ggplot(aes(Class)) +
  geom_bar(fill = "#000000") +
  facet_wrap(~ Gene) +
  scale_y_log10() +
  labs(title = "Classes per Gene")
# Observations from the plot:
# - ALK is represented in classes 2, 3, 5, 7
# - BRAF is represented in classes 2, 4, 5, 6, 7
# - BRCA1 is uniformly distributed among classes 1, 3, 4, 5, 6
# - BRCA2 is present in classes 2, 4, 5, 7
# - ERBB2 dominates in classes 2, 4, 5, 6, 7
# - KIT is shared between two classes: 2 and 7
# - PDGFRA is observed in classes 2, 3, 5, 7
# - PTEN dominates in class 4 but is also present in classes 1, 3, 5
# - TP53 is predominantly present in classes 1 and 4 but is also observed
#   in classes 2, 3, 5, 6

# Class counts for each of the most frequent variations, one panel per variation
train %>%
  filter(Variation %in% variation$Variation) %>%
  ggplot(aes(Class)) +
  geom_bar(fill = "#CC79A7") +
  facet_wrap(~ Variation) +
  scale_y_log10() +
  labs(title = "Classes per Variation")
# This plot shows in which classes the most frequent variations are present.

Text exploration¶

# Read the raw text files, one record per line
train.txt <- readLines("training_text")
test.txt <- readLines("test_text")

# Split every record at the first "||" into an ID and a text column.
# The first line is skipped (presumably a header row -- TODO confirm).
# Negative indexing is used so a one-line file yields zero rows instead of
# the c(NA, first line) that 2:length(x) would produce.
train.txt <- str_split_fixed(train.txt[-1], "\\|\\|", 2)
train.txt <- tibble(ID = train.txt[, 1], text = train.txt[, 2])

test.txt <- str_split_fixed(test.txt[-1], "\\|\\|", 2)
test.txt <- tibble(ID = test.txt[, 1], text = test.txt[, 2])

Overview of text data

# Summary of the training texts, including their length in characters
summary(mutate(train.txt, txt.len = str_length(text)))

      ID                text              txt.len      
 Length:3321        Length:3321        Min.   :     5  
 Class :character   Class :character   1st Qu.: 31452  
 Mode  :character   Mode  :character   Median : 46294  
                                       Mean   : 63867  
                                       3rd Qu.: 80349  
                                       Max.   :525742

# Training records whose text is at most 50 characters long
train.txt %>%
  transmute(ID, text, txt.len = str_length(text)) %>%
  filter(txt.len <= 50)

# Summary of the test texts, including their length in characters
summary(mutate(test.txt, txt.len = str_length(text)))

      ID                text              txt.len      
 Length:367         Length:367         Min.   :     5  
 Class :character   Class :character   1st Qu.: 30994  
 Mode  :character   Mode  :character   Median : 49007  
                                       Mean   : 67335  
                                       3rd Qu.: 82514  
                                       Max.   :489305

# Test records whose text is at most 100 characters long
test.txt %>%
  transmute(ID, text, txt.len = str_length(text)) %>%
  filter(txt.len <= 100)

# Frequency of occurrence of "pathogenic" and "benign" in each training text.
# NOTE: str_count matches substrings, so e.g. "nonpathogenic" also counts
# towards "pathogenic".
train.txt <- train.txt %>%
  mutate(pathogenic = str_count(text, "pathogenic"),
         benign = str_count(text, "benign"))
temp <- train %>%
  select(ID, Class)
# ID was parsed from the text file as character; make it numeric so it can be
# joined against the numeric ID in `train`
train.txt$ID <- as.numeric(train.txt$ID)

# Distribution of the per-text "pathogenic" count, one panel per class
df <- left_join(train.txt, temp, by = "ID") %>%
  ggplot(aes(pathogenic)) +
  geom_bar(width = 3, fill = "#009E73") +
  scale_y_log10() +
  facet_wrap(~ Class) +
  coord_cartesian(xlim = c(0, 100)) +
  ggtitle("Distribution of pathogenic in classes")

# Print the stored plot object so the figure is actually rendered
df

# "Pathogenic" is not found in classes 8 and 9, whereas classes 1 and 4 have
# the highest frequency of the word "pathogenic".
# The other classes show "pathogenic" in varying amounts.

# Plot the ratio of the mean occurrence of "pathogenic" to the mean occurrence
# of "benign" per class, and the inverse ratio

# Mean "pathogenic" count divided by mean "benign" count
sp <- full_join(train.txt, temp, by = "ID") %>%
  group_by(Class) %>%
  summarise(ratio1 = mean(pathogenic) / mean(benign)) %>%
  ggplot(aes(reorder(Class, -ratio1, FUN = max), ratio1)) +
  geom_point(colour = "darkblue", size = 5) +
  labs(title = "'Pathogenic'/'Benign' Ratio", x = "Class")

# Mean "benign" count divided by mean "pathogenic" count
bp <- full_join(train.txt, temp, by = "ID") %>%
  group_by(Class) %>%
  summarise(ratio2 = mean(benign) / mean(pathogenic)) %>%
  ggplot(aes(reorder(Class, -ratio2, FUN = max), ratio2)) +
  geom_point(colour = "red", size = 5) +
  labs(title = "'Benign'/'Pathogenic' Ratio", x = "Class")

# Stack both plots vertically in one figure
figure <- ggarrange(
  sp,
  bp + font("x.text", size = 10),
  ncol = 1,
  nrow = 2
)
figure
# The word "pathogenic" has the highest relative count in classes 6, 5, 3, 4, 1
# and the lowest in classes 2, 9, 7, 8; the reverse holds for the word "benign".

TF (Term Frequency)¶

# Tokenization: break each text into individual words and store them in a tidy
# one-token-per-row structure using tidytext's unnest_tokens().
data1 <- unnest_tokens(select(train.txt, ID, text), word, text)
# tidytext ships a dictionary of stop words (such as "and" or "next") that we
# do not want in the data.
data("stop_words")
# Drop stop words, reduce every word to its stem, and keep only tokens that
# contain at least one lowercase letter (this removes pure numbers/symbols)
data2 <- data1 %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = wordStem(word)) %>%
  filter(str_detect(word, "[a-z]"))

# Word stems that occur more than 50,000 times across all training texts
data2 %>%
  count(word) %>%
  filter(n > 50000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "#E69F00") +
  coord_flip() +
  labs(title = "Most frequent word stems")

# Word cloud of term frequencies (up to 200 words); the seed is fixed so the
# layout is reproducible
set.seed(1234)
options(repr.plot.width = 8, repr.plot.height = 6)
with(
  count(data2, word),
  wordcloud(word, freq = n, min.freq = 1,
            max.words = 200, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
)

# Count how often every word stem occurs within each class
frequency <- full_join(data2, train, by = "ID") %>%
  count(Class, word)
options(repr.plot.width = 6, repr.plot.height = 4)
# bind_tf_idf() from tidytext adds tf, idf and tf_idf columns; here each Class
# is passed as the "document", so tf_idf measures how characteristic a term is
# for a class
tf_idf <- frequency %>%
  bind_tf_idf(word, Class, n)
# The next graph shows the 15 terms with the highest tf-idf across all classes.
# Class 8 has the highest values, for dnmt3b7 and k42a, followed by further
# frequent terms such as 2hg, u2af35 etc.
# NOTE(review): the original note attributed both groups of terms to class 8;
# the second group presumably belongs to another class -- TODO confirm.
tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(15, tf_idf) %>%
  ggplot(aes(word, tf_idf, fill = Class)) +
  geom_col() +
  # The bars show tf-idf weights, not raw counts
  labs(x = NULL, y = "tf-idf") +
  coord_flip()

# The following visualization shows the 7 highest tf-idf terms for each Class
tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(Class) %>%
  top_n(7, tf_idf) %>%
  ggplot(aes(word, tf_idf, fill = Class)) +
  # The bars show tf-idf weights, not raw counts
  labs(y = "tf-idf") +
  facet_wrap(~ Class, ncol = 3, scales = "free") +
  geom_col() +
  coord_flip()

# Per-class word proportions, reshaped so that class 7 stays as its own column
# while classes 2 and 6 are stacked into Class/proportion pairs for comparison
frequency <- full_join(data2, train, by = "ID")%>%
  count(Class, word) %>%
  group_by(Class) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(Class, proportion) %>% 
  gather(Class, proportion, `2`,`6`)

# Visualizing class 7 word frequency versus classes 2 and 6.
# The words 'mutat' and 'activ' occur most frequently in all 3 classes.
# Comparing class 7 with class 2, the words 'alk' and 'acid' occur frequently
# in both classes.
# Columns are referenced by bare name inside aes() so that they are taken from
# the layer's data (and therefore stay aligned with the facets) rather than
# from the global `frequency` object.
image <- ggplot(frequency, aes(x = proportion, y = `7`, color = abs(`7` - proportion))) +
  geom_abline(color = "blue", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.1, height = 0.1) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(facets = ~ Class, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Class 7", x = "Class 2 & Class 6")
# Words missing from one of the compared classes have NA proportions and are
# dropped by ggplot2 with a warning, which is silenced here
suppressWarnings(print(image))

Sentiment Analysis¶

# Print the AFINN and NRC sentiment lexicons for reference
get_sentiments("afinn")

get_sentiments("nrc")

# NRC lexicon entries tagged with the "sadness" sentiment
sad_words <- get_sentiments("nrc") %>%
  filter(sentiment == "sadness")
# `joy` was the original (misleading) name of the sadness word list; it is
# kept as an alias so any code that still refers to it keeps working
joy <- sad_words
# Count the sadness words that occur in the texts of class 8; 'cancer' has the
# highest word count.
# NOTE: data2 holds word stems while the lexicon holds full words, so only
# stems that coincide with a lexicon entry are matched.
full_join(data2, train, by = "ID") %>%
  filter(Class == 8) %>%
  inner_join(sad_words) %>%
  count(word, sort = TRUE)

Joining, by = "word"

Conclusion¶

I have presented snippets of my analysis of clinical data. I attempted to identify patterns, relationships and assertions about genetic mutations based on clinical data. This report included the following measures:

Tf-idf statistic - where I measured the importance of each word in the document. The following set of genes has the highest count: BRCA1, TP53, EGFR, PTEN, BRCA2. Truncating Mutations are the most frequent variation of a gene. Classes 7 and 4 are represented the most in the dataset. However, class 1 has the highest gene count. Variation of genes is predominantly present in this set of classes: 7, 2, 1, 6, 4. The word 'pathogenic' prevails in classes 6, 5, 4, 3, 1, where the following words dominate: brca2, vuss, pht, etc.
Relationships between words
Sentiment Analysis - where I studied the emotional intent of words to infer whether a section of text is positive or negative. I also looked at the emotional states such as "angry," "sad," and "happy" of the data.

Statistical text analysis and an appropriately chosen model will enable us to classify clinical evidence faster, more accurately and more efficiently, saving the time of medical professionals and reducing patients' waiting time.

Gene	ct
BRCA1	264
TP53	163
EGFR	141
PTEN	126
BRCA2	125
KIT	99
BRAF	93
ALK	69
ERBB2	69
PDGFRA	60
PIK3CA	56
CDKN2A	52
FGFR2	50
FLT3	49
TSC2	47
MTOR	45
KRAS	44
MAP2K1	43
VHL	41
RET	40
FGFR3	39
MLH1	35
JAK2	33
MET	33
SMAD4	33
NOTCH1	31
AKT1	28
ABL1	26
PTPN11	26
ROS1	26
...	...
LATS2	1
MDM2	1
MDM4	1
MEN1	1
MYOD1	1
NCOR1	1
PAK1	1
PAX8	1
PIK3R3	1
PMS1	1
PPM1D	1
RAD51B	1
RAD51D	1
RAD54L	1
RARA	1
RICTOR	1
RNF43	1
RRAS2	1
RYBP	1
SDHB	1
SDHC	1
SHOC2	1
SHQ1	1
SRSF2	1
STAG2	1
TCF3	1
TCF7L2	1
VEGFA	1
WHSC1	1
WHSC1L1	1

Variation	ct
Truncating Mutations	93
Deletion	74
Amplification	71
Fusions	34
Overexpression	6
G12V	4
E17K	3
Q61H	3
Q61L	3
Q61R	3
T58I	3
A146T	2
A146V	2
C618R	2
E330K	2
E542K	2
ETV6-NTRK3 Fusion	2
EWSR1-ETV1 Fusion	2
F28L	2
F384L	2
G12A	2
G12C	2
G12D	2
G12S	2
G13C	2
G13D	2
G13V	2
G35R	2
G67R	2
I31M	2
...	...
Y646H	1
Y646N	1
Y646S	1
Y647C	1
Y652H	1
Y65C	1
Y68D	1
Y68H	1
Y69H	1
Y772_A775dup	1
Y791F	1
Y801H	1
Y803N	1
Y806C	1
Y823D	1
Y835F	1
Y842C	1
Y846C	1
Y849C	1
Y849S	1
Y87C	1
Y87N	1
Y901C	1
Y931C	1
Y98H	1
Y98N	1
YAP1-FAM118B Fusion	1
YAP1-MAMLD1 Fusion	1
ZC3H7B-BCOR Fusion	1
ZNF198-FGFR1 Fusion	1

Class	Variation
1	423
2	399
3	89
4	669
5	242
6	265
7	889
8	19
9	37

ID	text	txt.len
1109	null	5
1277	null	5
1407	null	5
1639	null	5
2755	null	5

word	score
abandon	-2
abandoned	-2
abandons	-2
abducted	-2
abduction	-2
abductions	-2
abhor	-3
abhorred	-3
abhorrent	-3
abhors	-3
abilities	2
ability	2
aboard	1
absentee	-1
absentees	-1
absolve	2
absolved	2
absolves	2
absolving	2
absorbed	1
abuse	-3
abused	-3
abuses	-3
abusive	-3
accept	1
accepted	1
accepting	1
accepts	1
accident	-2
accidental	-2
...	...
worry	-3
worrying	-3
worse	-3
worsen	-3
worsened	-3
worsening	-3
worsens	-3
worshiped	3
worst	-3
worth	2
worthless	-2
worthy	2
wow	4
wowow	4
wowww	4
wrathful	-3
wreck	-2
wrong	-2
wronged	-2
wtf	-4
yeah	1
yearning	1
yeees	2
yes	1
youthful	2
yucky	-2
yummy	3
zealot	-2
zealots	-2
zealous	2

ID	Gene	Variation	Class
0	FAM58A	Truncating Mutations	1
1	CBL	W802*	2
2	CBL	Q249E	2
3	CBL	N454D	3
4	CBL	L399V	4
5	CBL	V391I	4

word	sentiment
abacus	trust
abandon	fear
abandon	negative
abandon	sadness
abandoned	anger
abandoned	fear
abandoned	negative
abandoned	sadness
abandonment	anger
abandonment	fear
abandonment	negative
abandonment	sadness
abandonment	surprise
abba	positive
abbot	trust
abduction	fear
abduction	negative
abduction	sadness
abduction	surprise
aberrant	negative
aberration	disgust
aberration	negative
abhor	anger
abhor	disgust
abhor	fear
abhor	negative
abhorrent	anger
abhorrent	disgust
abhorrent	fear
abhorrent	negative
...	...
yellows	negative
yelp	anger
yelp	fear
yelp	negative
yelp	surprise
young	anticipation
young	joy
young	positive
young	surprise
younger	positive
youth	anger
youth	anticipation
youth	fear
youth	joy
youth	positive
youth	surprise
zany	surprise
zeal	anticipation
zeal	joy
zeal	positive
zeal	surprise
zeal	trust
zealous	joy
zealous	positive
zealous	trust
zest	anticipation
zest	joy
zest	positive
zest	trust
zip	negative

word	n
cancer	690
inhibit	185
loss	143
tumour	122
leukemia	95
sarcoma	84
treat	81
lower	64
repress	59
death	58
suppress	58
blue	36
carcinoma	35
restrict	33
weight	25
error	23
chronic	15
lethal	14
cross	12
black	11
bottom	10
depart	8
lost	8
lose	7
bad	6
die	6
inter	6
diminish	5
dismal	5
late	5
kill	4
retard	4
absent	3
dark	3
default	3
morbid	3
pain	3
remiss	3
fatal	2
margin	2
strip	2
bacteria	1
broken	1
cage	1
cataract	1
fall	1
fat	1
gore	1
hut	1
ill	1
orphan	1
overload	1
revolution	1
shell	1
sick	1
withdraw	1

Class	Genes
1	142
2	96
3	26
4	92
5	48
6	56
7	108
8	12
9	10