Prediction of Stroke

Analyzing clinical data for drivers of stroke

Data file

INDEX:

1: Data processing
2: Descriptive statistics
3: Visualizations
4: Regression analysis
5: Evaluation of the model

Importing libraries

library(plyr)
library(dplyr)
library(ggplot2)
library(DT)
library(psych)
library(purrr)
library(pscl)
library(ROCR)

Overview of data

train.data<-read.csv("train_2v.csv", header = TRUE)
head(train.data)

# We have 3 data types in the dataset: categorical, numeric, IDs.
# 12 variables and 43400 observations.
str(train.data)

'data.frame':	43400 obs. of  12 variables:
 $ id               : int  30669 30468 16523 56543 46136 32257 52800 41413 15266 28674 ...
 $ gender           : Factor w/ 3 levels "Female","Male",..: 2 2 1 1 2 1 1 1 1 1 ...
 $ age              : num  3 58 8 70 14 47 52 75 32 74 ...
 $ hypertension     : int  0 1 0 0 0 0 0 0 0 1 ...
 $ heart_disease    : int  0 0 0 0 0 0 0 1 0 0 ...
 $ ever_married     : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
 $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 1 4 4 4 3 4 4 5 4 5 ...
 $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 1 2 2 1 1 2 2 1 1 2 ...
 $ avg_glucose_level: num  95.1 88 110.9 69 161.3 ...
 $ bmi              : num  18 39.2 17.6 35.9 19.1 50.1 17.7 27 32.3 54.6 ...
 $ smoking_status   : Factor w/ 4 levels "","formerly smoked",..: 1 3 1 2 1 1 2 3 4 3 ...
 $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...

Data processing¶

a) Quantifying missing values

# BMI has 3.4% of missing values. 
sort(apply(train.data, 2, function(x){sum(is.na(x))/length(x)}*100), decreasing = TRUE)

#Replacing NAs with mean values in bmi 
train.data$bmi<-ifelse(is.na(train.data$bmi), mean(train.data$bmi, na.rm = TRUE), train.data$bmi)

#Replacing blanks with "Unknown" for smoking_status
train.data$smoking_status[train.data$smoking_status==""]<-NA
train.data$smoking_status<-as.factor(ifelse(is.na(train.data$smoking_status), "Unknown", paste(train.data$smoking_status)))

b) eliminating useless attributes (unique values - IDs)

train.data<-subset(train.data, select=-c(id))

#removing "others" from gender 
train.data<-filter(train.data, gender!="Other")

c) converting categorical variables to dummy variables

train.data$male=ifelse(train.data$gender=="Male",1,0)
train.data$gender=NULL
train.data$married=ifelse(train.data$ever_married=="Yes",1,0)
train.data$ever_married=NULL
train.data$rural_res=ifelse(train.data$Residence_type=="Rural",1,0)
train.data$Residence_type=NULL

Descriptive statistics¶

# This descriptive statistics demonstrates characteristics of each variable for both groups: stroke and non-stroke patients.
# From the statistical analysis, we notice that mean values for all variables besided rural_res are 
# significantly higher for stroke patients than non-stroke patients, therefore we can preliminarily conclude 
# that individuals who are older with hypertension and/or heart disease with higher level of glucose and bmi are in a
#  risk group for stroke. 
train.data %>% split(.$stroke) %>% map(describe)

T-test

# The T-test shows the difference between mean values of stroke and non-stroke patients for all variables
lapply(train.data[,c('age','male' , 'hypertension' , 'heart_disease' ,'married', 
                     'rural_res' , 'avg_glucose_level' , 
                     'bmi' )], function(i) t.test(i ~ train.data$stroke))

$age

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -58.237, df = 879.78, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -27.28984 -25.51040
sample estimates:
mean in group 0 mean in group 1 
       41.74333        68.14345 


$male

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -2.33, df = 810.26, p-value = 0.02005
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.07704575 -0.00658825
sample estimates:
mean in group 0 mean in group 1 
       0.407736        0.449553 


$hypertension

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -10.526, df = 794.49, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1955405 -0.1340731
sample estimates:
mean in group 0 mean in group 1 
     0.09062104      0.25542784 


$heart_disease

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -12.128, df = 788.95, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2112372 -0.1523849
sample estimates:
mean in group 0 mean in group 1 
     0.04424259      0.22605364 


$married

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -23.354, df = 855.81, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2804599 -0.2369741
sample estimates:
mean in group 0 mean in group 1 
      0.6391119       0.8978289 


$rural_res

	Welch Two Sample t-test

data:  i by train.data$stroke
t = 0.46721, df = 810.98, p-value = 0.6405
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.02698224  0.04383918
sample estimates:
mean in group 0 mean in group 1 
      0.4988499       0.4904215 


$avg_glucose_level

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -11.917, df = 796.69, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -29.76701 -21.34722
sample estimates:
mean in group 0 mean in group 1 
       104.0256        129.5827 


$bmi

	Welch Two Sample t-test

data:  i by train.data$stroke
t = -5.0415, df = 833.57, p-value = 5.667e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.4673920 -0.6449841
sample estimates:
mean in group 0 mean in group 1 
       28.58609        29.64228

Visualizations¶

# Count of stroke and non-stroke patients by age. This illustration shows that peak of the stoke patients are
# in the age range 78-82. The stroke is not oversed in the young group patients 0-32 years old. The average age for a 
# stroke is about 68 years old.
options(repr.plot.width=6, repr.plot.height=4)
ggplot(train.data,aes(x=age,group=stroke,fill=stroke))+
  geom_histogram(position="identity",binwidth=0.5)+theme_minimal()

# This graph shows distribution of healthy and unhealthy patients by gender. 
ggplot(train.data,aes(x=male,group=stroke,fill=stroke))+
  geom_histogram(position="identity",binwidth=0.5)+theme_minimal()

# The next table quntifies it and we can see that distibution is not significantly different for males and females. 
m<-train.data%>%
  group_by(male, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%

print(as.data.frame(m))

# A tibble: 4 x 4
# Groups:   male [2]
   male stroke count percent
  <dbl> <fct>  <int>   <dbl>
1     0 0      25234   98.3 
2     0 1        431    1.68
3     1 0      17372   98.0 
4     1 1        352    1.99

# The next plot illustrates that married people are more likely (by about 2%) to get a stroke than the ones 
# who are not married.
ggplot(train.data,aes(x=married,group=stroke,fill=stroke))+
  geom_histogram(position="identity",binwidth=0.5)+theme_minimal()

# The next table give it a more precise look. Single individuals have about 5 times less chance to get a stroke than
# married people.
n<-train.data%>%
  group_by(married, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(n))

# A tibble: 4 x 4
# Groups:   married [2]
  married stroke count percent
    <dbl> <fct>  <int>   <dbl>
1       0 0      15376   99.5 
2       0 1         80    0.52
3       1 0      27230   97.5 
4       1 1        703    2.52

# Hipertension per stroke and non-stroke patients
ggplot(train.data,aes(x=hypertension,group=stroke,fill=stroke))+
  geom_histogram(position="identity",binwidth=0.5)+theme_minimal()

# 1.5% out of people without hypertension typically get a stroke, while about 5% out of patients with hypertension get 
# a stroke. # The latter indicates that people with hypertention have a higher risk to get a stroke than people who dont
# have hypertension.
k<-train.data%>%
  group_by(hypertension, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(k))

# A tibble: 4 x 4
# Groups:   hypertension [2]
  hypertension stroke count percent
         <int> <fct>  <int>   <dbl>
1            0 0      38745   98.5 
2            0 1        583    1.48
3            1 0       3861   95.1 
4            1 1        200    4.92

# People with high bmi have more chances to get a stroke.
bmi<-train.data %>%
  group_by(stroke)%>%
  summarise(bmi=mean(bmi))
ggplot(bmi,aes(x = stroke,y=bmi)) +
  geom_bar(stat="identity",position=position_identity(), fill="lightblue")+theme_minimal()

# The same refers to the glucose level. Stroke patients typically have a higher glucose level than healthy people.
avg_clucose_level<-train.data %>%
  group_by(stroke)%>%
  summarise(avg_clucose_level=mean(bmi))
ggplot(avg_clucose_level,aes(x = stroke,y=avg_clucose_level)) +
  geom_bar(stat="identity",position=position_identity(), fill="lightblue")+theme_minimal()

# Glucose level by age
ggplot(train.data, aes(x = age, y = avg_glucose_level)) + 
  geom_point() +
  facet_wrap(~ stroke)+
  geom_smooth(method = 'lm', color='lightblue')

# This boxplot demonstrates correlation between stroke and heart_desease. People with a heart disease
# have higher chances to get a stroke.
ggplot(train.data, aes(as.factor(heart_disease), age, fill = as.factor(heart_disease)))+
  geom_boxplot()+
  labs(title = "Heart Disease by stoke/non-stroke patients",
    x = "heart_disease")+
  scale_fill_discrete("heart_disease")+
  facet_wrap(~ stroke)+
  theme(plot.title = element_text(hjust = .5))

# The next illustration shows the most interesting observations about the stroke. For example, people who never worked
# have zero chance to get a stroke. However, self-employed have the highest risk factor 3.7%. It might be 
# explained that they experience more stress, and more responsibilities.
l<-train.data%>%
  group_by(work_type, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(l))

# A tibble: 9 x 4
# Groups:   work_type [5]
  work_type     stroke count percent
  <fct>         <fct>  <int>   <dbl>
1 children      0       6152  100.0 
2 children      1          2    0.03
3 Govt_job      0       5349   98.4 
4 Govt_job      1         89    1.64
5 Never_worked  0        177  100   
6 Private       0      24386   98.2 
7 Private       1        441    1.78
8 Self-employed 0       6542   96.3 
9 Self-employed 1        251    3.69

# Glucose level per jobe type. never_worked group has the lowest glucose lelvel, high stress work type like self-employed
# relatively has the highest glucose value. 
# The same interpretation can be applied to bmi.
ggplot(train.data, aes(reorder(work_type, avg_glucose_level), avg_glucose_level, fill = work_type))+
  geom_boxplot()+
  labs(    title = "Glucose level by Work Type",
    x = "work_type")+
  theme(plot.title = element_text(hjust = .5))

ggplot(train.data, aes(reorder(work_type, bmi), bmi, fill = work_type))+
  geom_boxplot()+
  labs(    title = "BMI by Work Type",
           x = "bmi")+
  theme(plot.title = element_text(hjust = .5))

# Chances of stroke are almost equally distributed among rural vs urban residents which means residence type 
# merely affects stroke probability.
o<-train.data%>%
  group_by(rural_res, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(o))

# A tibble: 4 x 4
# Groups:   rural_res [2]
  rural_res stroke count percent
      <dbl> <fct>  <int>   <dbl>
1         0 0      21352   98.2 
2         0 1        399    1.83
3         1 0      21254   98.2 
4         1 1        384    1.77

# The difference between people who formerly smoked and never smoked and got a stroke is around 25%. 
p<-train.data%>%
  group_by(smoking_status, stroke = as.factor(stroke))%>%
  summarise(count = n())%>%
  mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(p))

# A tibble: 8 x 4
# Groups:   smoking_status [4]
  smoking_status  stroke count percent
  <fct>           <fct>  <int>   <dbl>
1 formerly smoked 0       7266   97.0 
2 formerly smoked 1        221    2.95
3 never smoked    0      15767   98.2 
4 never smoked    1        284    1.77
5 smokes          0       6428   98.0 
6 smokes          1        133    2.03
7 Unknown         0      13145   98.9 
8 Unknown         1        145    1.09

ggplot(train.data,aes(x=smoking_status,group=stroke,fill=stroke))+
  geom_bar(position="dodge")+theme_minimal()

Regression analysis¶

# Based on high p-values(>0.5), we identify insignificant variables like married.
summary(glm(stroke ~ age+married+male+hypertension+bmi+avg_glucose_level+
              heart_disease, data=train.data, family="binomial"))

Call:
glm(formula = stroke ~ age + married + male + hypertension + 
    bmi + avg_glucose_level + heart_disease, family = "binomial", 
    data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8220  -0.1955  -0.1043  -0.0525   4.0453  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -8.2721737  0.2845278 -29.073  < 2e-16 ***
age                0.0700328  0.0029060  24.099  < 2e-16 ***
married           -0.0641752  0.1239570  -0.518  0.60465    
male               0.0900964  0.0751554   1.199  0.23061    
hypertension       0.3137743  0.0873849   3.591  0.00033 ***
bmi               -0.0093171  0.0060877  -1.530  0.12590    
avg_glucose_level  0.0037533  0.0006566   5.716 1.09e-08 ***
heart_disease      0.6266817  0.0941966   6.653 2.87e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7839.0  on 43388  degrees of freedom
Residual deviance: 6451.7  on 43381  degrees of freedom
AIC: 6467.7

Number of Fisher Scoring iterations: 8

# We will repeat the model excluding insignificant variables.
# AIC slightly reduced
model<-glm(stroke ~ age+hypertension+bmi+male+avg_glucose_level+
              heart_disease, data=train.data, family=binomial)
summary(model)

Call:
glm(formula = stroke ~ age + hypertension + bmi + male + avg_glucose_level + 
    heart_disease, family = binomial, data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8230  -0.1960  -0.1045  -0.0516   4.0553  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -8.3078422  0.2772934 -29.960  < 2e-16 ***
age                0.0697791  0.0028751  24.270  < 2e-16 ***
hypertension       0.3135123  0.0873690   3.588 0.000333 ***
bmi               -0.0094598  0.0060860  -1.554 0.120098    
male               0.0893218  0.0751373   1.189 0.234526    
avg_glucose_level  0.0037496  0.0006565   5.712 1.12e-08 ***
heart_disease      0.6275293  0.0941765   6.663 2.68e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7839  on 43388  degrees of freedom
Residual deviance: 6452  on 43382  degrees of freedom
AIC: 6466

Number of Fisher Scoring iterations: 8

exp(coef(model))

Interpreting results of algorithms¶

# The regression informs us that for every one unit increase in Age, 
# the odds of getting a stroke increases by a factor of 1.07.
# every one unit increase in hypertension, 
# the odds of getting a stroke increases by a factor of 1.37.
# every one unit increase in glucose level, 
# the odds of getting a stroke increases by a factor of 1.00.
# every one unit increase in heart disease, 
# the odds of getting a stroke increases by a factor of 1.87.

Evaluation of the model¶

# Pseudo R^2 to measure predictive power of the model
# McFadden value is close to 0.18 which indicates that it is a good model, however it has a room for improvement. 
pR2(model)

# Compute AUC for predicting stroke with the model
# ROC curve is pretty close to the upper left corner, which indicates the accuracy of the model. 
# Values above 0.80 indicate that the model does a good job in discriminating
# between the two categories which comprise our target variable.
prob <- predict(model, train.data, type="response")
pred <- prediction(prob, train.data$stroke)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

Our analysis identified the factors which influence probability of getting a stroke. They are high glucose level, older age, obeisity, high blood pressure, heart desease, high stress level (self-employed).

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
age	1	42606	41.74332911	22.3862265	43.00	41.9455495	26.68680	0.08	82.00	81.92	-0.086933019	-0.9935530	0.1084539843
hypertension	2	42606	0.09062104	0.2870728	0.00	0.0000000	0.00000	0.00	1.00	1.00	2.852024515	6.1341878	0.0013907744
heart_disease	3	42606	0.04424259	0.2056360	0.00	0.0000000	0.00000	0.00	1.00	1.00	4.432556783	17.6479738	0.0009962397
work_type*	4	42606	3.46512228	1.2935857	4.00	3.5813824	0.00000	1.00	5.00	4.00	-0.884896293	-0.5680437	0.0062670019
avg_glucose_level	5	42606	104.02555485	42.6122105	91.47	95.8772065	23.94399	55.00	291.05	236.05	1.700284299	2.3166207	0.2064422967
bmi	6	42606	28.58609211	7.6673450	28.00	28.0614135	6.81996	10.10	97.60	87.50	0.915753345	2.1515728	0.0371457919
smoking_status*	7	42606	2.59738065	1.0948393	2.00	2.6217215	1.48260	1.00	4.00	3.00	0.066447752	-1.3605695	0.0053041403
stroke	8	42606	0.00000000	0.0000000	0.00	0.0000000	0.00000	0.00	0.00	0.00	NaN	NaN	0.0000000000
male	9	42606	0.40773600	0.4914194	0.00	0.3846741	0.00000	0.00	1.00	1.00	0.375491223	-1.8590500	0.0023807671
married	10	42606	0.63911186	0.4802638	1.00	0.6738837	0.00000	0.00	1.00	1.00	-0.579300798	-1.6644496	0.0023267218
rural_res	11	42606	0.49884993	0.5000045	0.00	0.4985625	0.00000	0.00	1.00	1.00	0.004600141	-2.0000258	0.0024223594

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
age	1	783	68.1434483	12.3165365	71.00000	69.6810207	11.86080	1.32	82.00	80.68	-1.11719691	1.3255164	0.44015711
hypertension	2	783	0.2554278	0.4363802	0.00000	0.1945774	0.00000	0.00	1.00	1.00	1.11948167	-0.7477108	0.01559496
heart_disease	3	783	0.2260536	0.4185417	0.00000	0.1578947	0.00000	0.00	1.00	1.00	1.30737999	-0.2911245	0.01495746
work_type*	4	783	4.0855683	0.8898884	4.00000	4.2344498	0.00000	1.00	5.00	4.00	-1.22232699	1.1557968	0.03180202
avg_glucose_level	5	783	129.5826692	59.7340728	104.47000	124.7507974	46.99842	55.01	271.74	216.73	0.62601436	-1.1313453	2.13472161
bmi	6	783	29.6422802	5.7692936	28.60504	29.1591792	4.14381	14.30	56.60	42.30	1.02686457	2.1128410	0.20617773
smoking_status*	7	783	2.2579821	1.0619480	2.00000	2.1977671	1.48260	1.00	4.00	3.00	0.40099783	-1.0615294	0.03795092
stroke	8	783	1.0000000	0.0000000	1.00000	1.0000000	0.00000	1.00	1.00	0.00	NaN	NaN	0.00000000
male	9	783	0.4495530	0.4977666	0.00000	0.4370016	0.00000	0.00	1.00	1.00	0.20243454	-1.9615205	0.01778873
married	10	783	0.8978289	0.3030668	1.00000	0.9968102	0.00000	0.00	1.00	1.00	-2.62200115	4.8811288	0.01083072
rural_res	11	783	0.4904215	0.5002278	0.00000	0.4880383	0.00000	0.00	1.00	1.00	0.03824782	-2.0010879	0.01787668

id	gender	age	hypertension	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status
30669	Male	3	0	No	children	Rural	95.12	18.0
30468	Male	58	1	Yes	Private	Urban	87.96	39.2	never smoked
16523	Female	8	0	No	Private	Urban	110.89	17.6
56543	Female	70	0	Yes	Private	Rural	69.04	35.9	formerly smoked
46136	Male	14	0	No	Never_worked	Rural	161.28	19.1
32257	Female	47	0	Yes	Private	Urban	210.95	50.1