1: Data processing
2: Descriptive statistics
3: Visualizations
4: Model generation
5: Ensemble prediction with Stacking
6: Conclusion
Importing libraries
library(e1071)
library(tibble)
library(psych)
library(dplyr)
library(caret)
library(caretEnsemble)
library(GGally)
library(ggplot2)
library(monomvn)
library(randomForest)
library(kernlab)
library(elasticnet)
library(car)
library(corrplot)
Overview of data
data <- read.csv("insurance.csv")
glimpse(data)
#There are 1338 observations and 7 variables in the dataset. 3 categorical variables.
colSums(is.na(data))
#No missing values observed
#Converting variables into dummy variables
data$male=ifelse(data$sex=="male",1,0)
data$sex=NULL
data$smoker=ifelse(data$smoker=="yes",1,0)
head(data)
describe(data)
#The distribution of charges is right skewed.
#Distribution of dependent variable "charges". Charges $0-$15000 have the highest count
options(repr.plot.width=6, repr.plot.height=4)
hist(data$charges, col = 'lightblue', xlab="charges",main = 'Distribution of Medical Charges')
#BMI follows a normal distribution with the mean value around 31.
ggplot(data,aes(bmi))+
geom_histogram(aes(y=..density..),col="black",fill="lightblue")+
stat_function(fun=dnorm,color="red",args = list(mean=mean(data$bmi),sd=sd(data$bmi)),lwd=1.5)
#BMI and charges have positive linear relationships, however it is not a strong correlation
scatterplot(data$bmi,data$charges, xlab="bmi", ylab="charges",regLine=TRUE,col=carPalette(),main="BMI and charges")
#Children
#This graph shows proportion of children across all population in the sample. The distribution is skewed to the right.
# Almost half of the people have no chilred and the count decreases as number of children increases.
data.frame(prop.table(table(data$children)))%>%
ggplot(aes(Var1,Freq))+
geom_bar(stat="identity",fill="lightblue",col="black")+
geom_text(aes(y=Freq+.01,label=paste0(Freq*nrow(data)," / ",signif(Freq,3)*100)))+
labs(x="number of children",y="count / %")
ggplot(data, aes(reorder(children, charges), charges, fill = children))+
geom_boxplot()+
labs( title = "Charges by Number of Children",
x = "children")+
theme(plot.title = element_text(hjust = .5))
#Age
#The scatterplot depicts that the age follows uniform distribution, whereas charges are right skewed.
#We also observe positive linear relationships between these two variables. The older a person, the higher the charges are.
scatter.hist(data$age,data$charges, xlab="age",ylab="charges")
#Male
#There is no significant difference in median values of charges for males and females. However, charges for males demonstrate
#higher variability then for women.
ggplot(data, aes(as.factor(male), charges, fill = as.factor(male)))+
geom_boxplot()+
labs(title = "Charges by gender",
x = "male")+
scale_fill_discrete("male")+
theme(plot.title = element_text(hjust = .5))
#Region
#We observe outliers as well as difference in median values of charges per region.
#The highest charges are noticed in the northeast.
scatterplot(data$region,data$charges,xlab="region",ylab="charges")
#From scatterplotMatrix we can see distribution of each variable (uniform, normal, right skewed),
#linear relationships of age&bmi with charges; highest bmi is recorded in the southeast;
#inverse relationships of children and charges.
suppressWarnings(scatterplotMatrix(~ age+ region+bmi+charges+children, data=data, col="black", regline=TRUE, main="Full Picture"))
#Smoker
#The graph depicts almost 80% of smokers and 20% non-smokers. Let's see how it is in relationships with charges.
options(repr.plot.width=4, repr.plot.height=3)
data.frame(prop.table(table(data$smoker)))%>%
ggplot(aes(Var1,Freq))+
geom_bar(stat="identity",fill="lightblue",col="black")+
geom_text(aes(y=Freq+.03,label=paste0(Freq*nrow(data)," / ",signif(Freq,3)*100)))+
labs(x="smoker",y="count / %")
#We observe a significant contrast in charges for smokers vs non-smokers. Obviously, smokers are charged more.
ggplot(data, aes(as.factor(smoker), charges, fill = as.factor(smoker)))+
geom_boxplot()+
labs(title = "Smoker's vs Non-smoker's charges",
x = "smoker")+
scale_fill_discrete("smoker")+
theme(plot.title = element_text(hjust = .5))
#From the following illustration, we can additionally confirm the following:
# 1. there are more obese people among non-smokers than smokers
# 2. more obese people are observed in southeast.
# 3. obese people are charged more than non-obese.
# 4. the charges are higher in northeast than in other regions.
ggpairs(data, columns = c("smoker", "bmi","region", "children", "charges"), ggplot2::aes(colour=region))
# Correlation analysis
#One correlation stands out in this graph - a strong positive correlation between charges and smoker.
# Other variables are not as strongly correlated, the second largest one is age and charges (0.3), which is followed by the correlation
# of bmi and charges 0.2.
newdata<-data
for(i in which(sapply(newdata,is.factor))){
newdata[,i]<-as.numeric(newdata[,i])
}
corrplot::corrplot(cor(newdata),method = "number",type = "upper")
#We use randon forest to see which variables are more significant for the dataset.
#Based on the visualization, smoker, age and bmi are more significant. While choldren,
#region and gender are evidently less significant, which means they do not have a strong effect on the charges.
set.seed(123)
rf<-randomForest(charges~smoker+age+bmi+children+region+male,data,importance = T,ntree = 1000)
varImpPlot(rf)
#We will use linear regression to compare the results with the random forest output.
#Smoker, Regions, BMI, age, children are significant variables due to low p-values (<0.5)
children<-as.character(data$children)
model <-lm(charges ~ smoker+region+bmi+age+male+children+smoker, data)
summary(model)
#Residuals follow normal distribution with a slight skewness to the right
par(mfrow=c(1,1))
hist(model$residuals, col = 'lightblue', xlab="residuals",main = 'Histogram of Residuals')
#To achieve a higher accuracy in prediction, we will combine different model predictions into one ensemble prediction.
#We will use stacking method for it.
#Now we split the data into train and test datasets
train <- createDataPartition(y = data$charges,p = 0.75,list = FALSE)
training <- data[train,]
testing <- data[-train,]
#Next we create a repeated cross validation object
fit <- trainControl(method = 'repeatedcv',number = 10, repeats = 5, summaryFunction=defaultSummary)
#We need to build an object to hold the list of algorithms
algorithmList <- c('lm','pls','rf', 'svmRadial')
#Training of sub models
models <- suppressWarnings(caretList(charges~., data=training, trControl=fit, methodList=algorithmList, preProcess = c("center","scale")))
results <- resamples(models)
summary(results)
#This demonstrates that SVM is the optimal model out of four, it is followed by random forest.
dotplot(results, metric = 'Rsquared')
#Correlation of predictions
# Some of the models show relatively low correlation between each other, which means that they have different
# predictions. Ensemble prediction is more beneficial with the models suggesting different predictions.
modelCor(results)
#Combine predictions using GLM
stack.glm <- caretStack(models, method="glm", metric="RMSE", trControl=fit)
print(stack.glm)
summary(stack.glm)
#Checking performance of model on the test dataset
#Rsquared is 89% which indicates there is a room for improvement of the model
prediction <- predict(stack.glm, testing)
act_pred <- data.frame(obs=testing$charges, pred=prediction)
defaultSummary(act_pred)
#Plotting Actual vs predicted values
axisRange <- extendrange(c(testing$charges, prediction))
plot(testing$charges, prediction, ylim = axisRange, xlim = axisRange, main = 'Actual vs Predicted plot')
abline(0, 1, col = "lightblue", lty = 2)
We have analyzied the data and can conclude the following:
1) medical charges are highly dependant on smoking status. if a person smokes, he has much higher charges than the one who doesnt.
2) obesity also affects medical charges. Obese people get higher bills than non-obese people.
3) age influences the charges as well. Older people get to pay more.