1: Data processing
2: Descriptive statistics
3: Visualizations
4: Regression analysis
5: Evaluation of the model
Importing libraries
library(plyr)
library(dplyr)
library(ggplot2)
library(DT)
library(psych)
library(purrr)
library(pscl)
library(ROCR)
Overview of data
train.data<-read.csv("train_2v.csv", header = TRUE)
head(train.data)
# We have 3 data types in the dataset: categorical, numeric, IDs.
# 12 variables and 43400 observations.
str(train.data)
a) Quantifying missing values
# BMI has 3.4% of missing values.
sort(apply(train.data, 2, function(x){sum(is.na(x))/length(x)}*100), decreasing = TRUE)
#Replacing NAs with mean values in bmi
train.data$bmi<-ifelse(is.na(train.data$bmi), mean(train.data$bmi, na.rm = TRUE), train.data$bmi)
#Replacing blanks with "Unknown" for smoking_status
train.data$smoking_status[train.data$smoking_status==""]<-NA
train.data$smoking_status<-as.factor(ifelse(is.na(train.data$smoking_status), "Unknown", paste(train.data$smoking_status)))
b) eliminating useless attributes (unique values - IDs)
train.data<-subset(train.data, select=-c(id))
#removing "others" from gender
train.data<-filter(train.data, gender!="Other")
c) converting categorical variables to dummy variables
train.data$male=ifelse(train.data$gender=="Male",1,0)
train.data$gender=NULL
train.data$married=ifelse(train.data$ever_married=="Yes",1,0)
train.data$ever_married=NULL
train.data$rural_res=ifelse(train.data$Residence_type=="Rural",1,0)
train.data$Residence_type=NULL
# This descriptive statistics demonstrates characteristics of each variable for both groups: stroke and non-stroke patients.
# From the statistical analysis, we notice that mean values for all variables besided rural_res are
# significantly higher for stroke patients than non-stroke patients, therefore we can preliminarily conclude
# that individuals who are older with hypertension and/or heart disease with higher level of glucose and bmi are in a
# risk group for stroke.
train.data %>% split(.$stroke) %>% map(describe)
T-test
# The T-test shows the difference between mean values of stroke and non-stroke patients for all variables
lapply(train.data[,c('age','male' , 'hypertension' , 'heart_disease' ,'married',
'rural_res' , 'avg_glucose_level' ,
'bmi' )], function(i) t.test(i ~ train.data$stroke))
# Count of stroke and non-stroke patients by age. This illustration shows that peak of the stoke patients are
# in the age range 78-82. The stroke is not oversed in the young group patients 0-32 years old. The average age for a
# stroke is about 68 years old.
options(repr.plot.width=6, repr.plot.height=4)
ggplot(train.data,aes(x=age,group=stroke,fill=stroke))+
geom_histogram(position="identity",binwidth=0.5)+theme_minimal()
# This graph shows distribution of healthy and unhealthy patients by gender.
ggplot(train.data,aes(x=male,group=stroke,fill=stroke))+
geom_histogram(position="identity",binwidth=0.5)+theme_minimal()
# The next table quntifies it and we can see that distibution is not significantly different for males and females.
m<-train.data%>%
group_by(male, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(m))
# The next plot illustrates that married people are more likely (by about 2%) to get a stroke than the ones
# who are not married.
ggplot(train.data,aes(x=married,group=stroke,fill=stroke))+
geom_histogram(position="identity",binwidth=0.5)+theme_minimal()
# The next table give it a more precise look. Single individuals have about 5 times less chance to get a stroke than
# married people.
n<-train.data%>%
group_by(married, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(n))
# Hipertension per stroke and non-stroke patients
ggplot(train.data,aes(x=hypertension,group=stroke,fill=stroke))+
geom_histogram(position="identity",binwidth=0.5)+theme_minimal()
# 1.5% out of people without hypertension typically get a stroke, while about 5% out of patients with hypertension get
# a stroke. # The latter indicates that people with hypertention have a higher risk to get a stroke than people who dont
# have hypertension.
k<-train.data%>%
group_by(hypertension, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(k))
# People with high bmi have more chances to get a stroke.
bmi<-train.data %>%
group_by(stroke)%>%
summarise(bmi=mean(bmi))
ggplot(bmi,aes(x = stroke,y=bmi)) +
geom_bar(stat="identity",position=position_identity(), fill="lightblue")+theme_minimal()
# The same refers to the glucose level. Stroke patients typically have a higher glucose level than healthy people.
avg_clucose_level<-train.data %>%
group_by(stroke)%>%
summarise(avg_clucose_level=mean(bmi))
ggplot(avg_clucose_level,aes(x = stroke,y=avg_clucose_level)) +
geom_bar(stat="identity",position=position_identity(), fill="lightblue")+theme_minimal()
# Glucose level by age
ggplot(train.data, aes(x = age, y = avg_glucose_level)) +
geom_point() +
facet_wrap(~ stroke)+
geom_smooth(method = 'lm', color='lightblue')
# This boxplot demonstrates correlation between stroke and heart_desease. People with a heart disease
# have higher chances to get a stroke.
ggplot(train.data, aes(as.factor(heart_disease), age, fill = as.factor(heart_disease)))+
geom_boxplot()+
labs(title = "Heart Disease by stoke/non-stroke patients",
x = "heart_disease")+
scale_fill_discrete("heart_disease")+
facet_wrap(~ stroke)+
theme(plot.title = element_text(hjust = .5))
# The next illustration shows the most interesting observations about the stroke. For example, people who never worked
# have zero chance to get a stroke. However, self-employed have the highest risk factor 3.7%. It might be
# explained that they experience more stress, and more responsibilities.
l<-train.data%>%
group_by(work_type, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(l))
# Glucose level per jobe type. never_worked group has the lowest glucose lelvel, high stress work type like self-employed
# relatively has the highest glucose value.
# The same interpretation can be applied to bmi.
ggplot(train.data, aes(reorder(work_type, avg_glucose_level), avg_glucose_level, fill = work_type))+
geom_boxplot()+
labs( title = "Glucose level by Work Type",
x = "work_type")+
theme(plot.title = element_text(hjust = .5))
ggplot(train.data, aes(reorder(work_type, bmi), bmi, fill = work_type))+
geom_boxplot()+
labs( title = "BMI by Work Type",
x = "bmi")+
theme(plot.title = element_text(hjust = .5))
# Chances of stroke are almost equally distributed among rural vs urban residents which means residence type
# merely affects stroke probability.
o<-train.data%>%
group_by(rural_res, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(o))
# The difference between people who formerly smoked and never smoked and got a stroke is around 25%.
p<-train.data%>%
group_by(smoking_status, stroke = as.factor(stroke))%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count)*100,2))%>%
print(as.data.frame(p))
ggplot(train.data,aes(x=smoking_status,group=stroke,fill=stroke))+
geom_bar(position="dodge")+theme_minimal()
# Based on high p-values(>0.5), we identify insignificant variables like married.
summary(glm(stroke ~ age+married+male+hypertension+bmi+avg_glucose_level+
heart_disease, data=train.data, family="binomial"))
# We will repeat the model excluding insignificant variables.
# AIC slightly reduced
model<-glm(stroke ~ age+hypertension+bmi+male+avg_glucose_level+
heart_disease, data=train.data, family=binomial)
summary(model)
exp(coef(model))
# The regression informs us that for every one unit increase in Age,
# the odds of getting a stroke increases by a factor of 1.07.
# every one unit increase in hypertension,
# the odds of getting a stroke increases by a factor of 1.37.
# every one unit increase in glucose level,
# the odds of getting a stroke increases by a factor of 1.00.
# every one unit increase in heart disease,
# the odds of getting a stroke increases by a factor of 1.87.
# Pseudo R^2 to measure predictive power of the model
# McFadden value is close to 0.18 which indicates that it is a good model, however it has a room for improvement.
pR2(model)
# Compute AUC for predicting stroke with the model
# ROC curve is pretty close to the upper left corner, which indicates the accuracy of the model.
# Values above 0.80 indicate that the model does a good job in discriminating
# between the two categories which comprise our target variable.
prob <- predict(model, train.data, type="response")
pred <- prediction(prob, train.data$stroke)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
Our analysis identified the factors which influence probability of getting a stroke. They are high glucose level, older age, obeisity, high blood pressure, heart desease, high stress level (self-employed).