1: Data processing
2: Descriptive statistics
3: Visualizations
4: Machine learning models
Importing libraries
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import train_test_split,cross_val_predict, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from io import StringIO  # sklearn.externals.six has been removed from recent scikit-learn releases
from IPython.display import Image
import pydotplus
import seaborn as sns
import category_encoders as ce
import matplotlib.pyplot as plt
import matplotlib as mpl
from pandas.plotting import scatter_matrix
import xgboost as xgb
from sklearn.metrics import mean_squared_error,confusion_matrix,classification_report,accuracy_score
Reading dataset
dataset=pd.read_csv("C:/Users/Maryna/Desktop/winter HW/dataset.csv")
dataset.head()
a) Data overview: 1,781 records and 21 columns, of which 7 are categorical variables
print(dataset.info())
dataset.drop(["URL", "WHOIS_REGDATE", "WHOIS_UPDATED_DATE"], axis=1, inplace = True)
b) Detection and interpolation of missing values
print(dataset.isnull().sum())
dataset = dataset.interpolate()
print(dataset.isnull().sum())
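Note that interpolate() only fills numeric columns, so any gaps in the categorical columns would survive it; a minimal sketch of handling those as well (assuming a literal 'None' placeholder, matching how missing states and countries are represented later):
# Sketch: fill remaining gaps in object (categorical) columns with a 'None' placeholder
cat_cols = dataset.select_dtypes(include='object').columns
dataset[cat_cols] = dataset[cat_cols].fillna('None')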
c) Feature Engineering on Categorical Data
dataset["SERVER"].value_counts()
#Group SERVER values that make up less than 1% of rows into one "Other" bin to reduce the number of unique values
series = dataset['SERVER'].value_counts()
mask = (series / series.sum() * 100).lt(1)
dataset['SERVER'] = np.where(dataset['SERVER'].isin(series[mask].index), 'Other', dataset['SERVER'])
dataset["SERVER"].value_counts()
dataset["CHARSET"].value_counts()
#Group CHARSET values that make up less than 1% of rows into one "Other" bin to reduce the number of unique values
series = dataset['CHARSET'].value_counts()
mask = (series / series.sum() * 100).lt(1)
dataset['CHARSET'] = np.where(dataset['CHARSET'].isin(series[mask].index), 'Other', dataset['CHARSET'])
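The same 1% binning is applied to both SERVER and CHARSET; a small helper (a sketch with an assumed name, bin_rare_categories) would avoid repeating those three lines per column:
# Sketch: group categories below a percentage threshold into an "Other" bin
def bin_rare_categories(df, column, threshold_pct=1.0):
    counts = df[column].value_counts()
    rare = counts[(counts / counts.sum() * 100) < threshold_pct].index
    df[column] = np.where(df[column].isin(rare), 'Other', df[column])
    return df
# Usage, equivalent to the blocks above: dataset = bin_rare_categories(dataset, 'SERVER')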
dataset["WHOIS_COUNTRY"].value_counts()
#Consolidate duplicate country spellings into one canonical code
def replace(x):
    if x == "[u'GB'; u'UK']" or x == "United Kingdom" or x == "UK":
        return "GB"
    elif x == "Cyprus":
        return "CY"
    elif x == "us":
        return "US"
    elif x == "ru":
        return "RU"
    elif x == "se":
        return "SE"
    else:
        return x
dataset["WHOIS_COUNTRY"] = dataset["WHOIS_COUNTRY"].map(replace)
#Group countries that appear fewer than 10 times into one "Other" bin to reduce the number of unique values
counts = dataset['WHOIS_COUNTRY'].value_counts()
dataset['WHOIS_COUNTRY'] = np.where(dataset['WHOIS_COUNTRY'].isin(counts[counts < 10].index),'Other',dataset['WHOIS_COUNTRY'])
dataset["WHOIS_COUNTRY"].value_counts()
dataset["WHOIS_STATEPRO"].value_counts()
#Consolidate duplicate state/province spellings into one canonical code
state_map = {
    "California": "CA", "CALIFORNIA": "CA",
    "Arizona": "AZ",
    "New York": "NY", "NEW YORK": "NY",
    "Ohio": "OH",
    "Utah": "UT",
    "None": "NA",
    "Texas": "TX",
    "Washington": "WA",
    "va": "VA",
    "Illinois": "IL", "il": "IL",
    "District of Columbia": "MD", "DC": "MD", "Maryland": "MD",
    "New Jersey": "NJ",
    "Maine": "ME", "MAINE": "ME",
    "Quebec": "QC", "QUEBEC": "QC", "qc": "QC", "quebec": "QC",
    "Missouri": "MO",
    "Nevada": "NV",
    "WC1N": "England", "Greater London": "England", "UK": "England",
    "WEST MIDLANDS": "England", "worcs": "England", "Peterborough": "England",
    "London": "England", "HANTS": "England", "MIDDLESEX": "England",
    "Pennsylvania": "PA",
    "Florida": "FL", "FLORIDA": "FL",
}
def replace_ny_ca(x):
    return state_map.get(x, x)
dataset["WHOIS_STATEPRO"] = dataset["WHOIS_STATEPRO"].map(replace_ny_ca)
#Identifying missing State values based on the country column
country_to_region = {
    'GB': 'England', 'SE': 'Sweden', 'LU': 'Luxembourg', 'FR': 'France',
    'IL': 'Israel', 'BE': 'Belgium', 'NO': 'Norway', 'TR': 'Turkey',
    'DE': 'Germany', 'BR': 'Brazil', 'JP': 'Japan', 'AU': 'Australia',
    'PH': 'Philippines', 'CZ': 'CzechRep', 'KR': 'SKorea', 'UA': 'Ukraine',
    'HK': 'Hong Kong', 'CH': 'Switzerland', 'CY': 'Cyprus'
}
for i, v in dataset.iterrows():
    if v['WHOIS_STATEPRO'] == 'None' and v['WHOIS_COUNTRY'] in country_to_region:
        print(country_to_region[v['WHOIS_COUNTRY']])
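The loop above only prints the regions implied by the country code; a minimal sketch of actually writing them back (reusing the country_to_region mapping above, on a copy so the working dataset is untouched; dataset_filled is a hypothetical name):
# Sketch: fill missing states from the country-level mapping; 'None' marks a missing state
dataset_filled = dataset.copy()
missing_state = dataset_filled['WHOIS_STATEPRO'] == 'None'
dataset_filled.loc[missing_state, 'WHOIS_STATEPRO'] = (
    dataset_filled.loc[missing_state, 'WHOIS_COUNTRY']
    .map(country_to_region)
    .fillna('None')
)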
#Group states that appear fewer than 20 times into one "Other" bin to reduce the number of unique values
counts = dataset['WHOIS_STATEPRO'].value_counts()
dataset['WHOIS_STATEPRO'] = np.where(dataset['WHOIS_STATEPRO'].isin(counts[counts < 20].index),'Other',dataset['WHOIS_STATEPRO'])
dataset["WHOIS_STATEPRO"].value_counts()
#Binary encoding of categorical variables
dataset_ce = dataset.copy()
encoder = ce.BinaryEncoder(cols=['WHOIS_STATEPRO','WHOIS_COUNTRY', 'CHARSET', 'SERVER'])
df_binary = encoder.fit_transform(dataset_ce)
df_binary.head()
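To see what the encoder produced, the new binary columns can be counted and inspected (a quick check; the <feature>_<n> naming pattern is the one BinaryEncoder uses, as seen in columns like WHOIS_COUNTRY_1 further below):
# Sanity check: list the binary columns created for the four encoded features
encoded_cols = [c for c in df_binary.columns
                if c.startswith(('SERVER_', 'CHARSET_', 'WHOIS_COUNTRY_', 'WHOIS_STATEPRO_'))]
print(len(encoded_cols), "binary columns created")
print(df_binary[encoded_cols].head())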
dataset.describe()
%matplotlib inline
sns.set(style="darkgrid")
sns.countplot(x="Type", data=dataset)
plt.title('Benign vs Malicious Websites',fontsize=30)
plt.ylabel('Number of Occurrences',fontsize=20)
plt.xlabel('Type',fontsize=20)
plt.figure(figsize=(16, 6))
#Segregating the classes
yes = dataset[dataset.Type == 1]
no = dataset[dataset.Type == 0]
print('YES : %d No: %d'%(len(yes), len(no)))
ratio=round(len(yes)/(len(yes)+len(no))*100)
print("Malitious websites ratio is",ratio,"%")
dataset.drop(dataset.loc[dataset['WHOIS_COUNTRY']=="None"].index, inplace=True)
dataset[dataset['Type']==1].groupby('WHOIS_COUNTRY')['WHOIS_COUNTRY'].count().sort_values(ascending=False).head(5).plot(kind='bar')
sns.set(style="darkgrid")
plt.title('Frequency Distribution of Malicious Websites per Country',fontsize=15)
plt.ylabel('Number of Occurrences', fontsize=20)
plt.xlabel('Country', fontsize=20)
plt.show()
dataset.drop(dataset.loc[dataset['WHOIS_STATEPRO']=="NA"].index, inplace=True)
dataset[dataset['Type']==1].groupby('WHOIS_STATEPRO')['WHOIS_STATEPRO'].count().sort_values(ascending=False).head(6).plot(kind='bar')
sns.set(style="darkgrid")
plt.title('Frequency Distribution of Malicious Websites per State', fontsize=15)
plt.ylabel('Number of Occurrences', fontsize=20)
plt.xlabel('State', fontsize=20)
plt.show()
sns.countplot(y="Type", hue='CHARSET', data=dataset)
plt.title('Frequency Distribution of Charset',fontsize=25)
plt.ylabel('Type', fontsize=20)
plt.xlabel('Number of Occurrences', fontsize=20)
dataset.drop(dataset.loc[dataset['SERVER']=="Other"].index, inplace=True)
dataset[dataset['Type']==1].groupby('SERVER')['SERVER'].count().sort_values(ascending=False).head(5).plot(kind='barh')
sns.set(style="darkgrid")
plt.title('Frequency Distribution of Malicious Websites per Server',fontsize=15)
plt.xlabel('Number of Occurrences', fontsize=20)
plt.ylabel('Server', fontsize=20)
plt.show()
#Density distribution of SOURCE_APP_PACKETS
#The mean lies roughly between 14 and 18 and the distribution is right-skewed
sns.distplot(df_binary['SOURCE_APP_PACKETS'], hist=True, kde=True,
             color='darkblue',
             hist_kws={'edgecolor': 'black'},
             kde_kws={'linewidth': 4})
plt.xlim(0, 80)
#Positive correlations among some of the variables and right-skewed histograms are observed
scatter_matrix(df_binary[['NUMBER_SPECIAL_CHARACTERS','TCP_CONVERSATION_EXCHANGE','URL_LENGTH','DNS_QUERY_TIMES']])
plt.show()
bplot = sns.boxplot(y='DNS_QUERY_TIMES', x='WHOIS_COUNTRY',
                    data=dataset,
                    width=0.5,
                    palette="colorblind")
plt.show()
bplot = sns.boxplot(x='REMOTE_IPS', y='SERVER',
                    data=dataset,
                    width=0.5,
                    palette="colorblind")
#Split the dataset into features and the target variable
X = df_binary.drop('Type',axis=1) #Predictors
y = df_binary['Type']
#Apply the SelectKBest class with the chi-squared test to score the features and extract the top 15
bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(15,'Score')) #print 15 best features
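As an alternative to hard-coding the column names below, the top-scoring feature names could be pulled straight from the score table (a small sketch reusing featureScores from above):
# Sketch: derive the 15 best feature names from the chi-squared scores
top_features = featureScores.nlargest(15, 'Score')['Specs'].tolist()
print(top_features)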
feature_cols = ['SOURCE_APP_BYTES','CONTENT_LENGTH','APP_BYTES', 'REMOTE_APP_BYTES', 'DIST_REMOTE_TCP_PORT', 'URL_LENGTH',
'WHOIS_COUNTRY_1','TCP_CONVERSATION_EXCHANGE', 'NUMBER_SPECIAL_CHARACTERS', 'WHOIS_STATEPRO_1',
'REMOTE_APP_PACKETS', 'SOURCE_APP_PACKETS', 'APP_PACKETS','WHOIS_COUNTRY_4','WHOIS_STATEPRO_3']
X = df_binary[feature_cols] # Features
y = df_binary.Type # Target variable
# To understand model performance, dividing the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
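Since malicious sites are a clear minority (see the class ratio printed earlier), a stratified split keeps that ratio in both subsets; a hedged, optional variant of the call above:
# Optional sketch: preserve the benign/malicious ratio in the train and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)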
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=12)
# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
filled=True, rounded=True,
special_characters=True, feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#Create a Random Forest classifier
rf=RandomForestClassifier(n_estimators=100)
#Train the model using the training set
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Summarize model performance on the training or the test set
def print_score(classifier, X_train, Y_train, X_test, Y_test, train=True):
    if train == True:
        print("Training results:\n")
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(Y_train, classifier.predict(X_train))))
        print('Classification Report:\n{}\n'.format(classification_report(Y_train, classifier.predict(X_train))))
        print('Confusion Matrix:\n{}\n'.format(confusion_matrix(Y_train, classifier.predict(X_train))))
        res = cross_val_score(classifier, X_train, Y_train, cv=10, n_jobs=-1, scoring='accuracy')
        print('Average Accuracy:\t{0:.4f}\n'.format(res.mean()))
        print('Standard Deviation:\t{0:.4f}'.format(res.std()))
    elif train == False:
        print("Test results:\n")
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(Y_test, classifier.predict(X_test))))
        print('Classification Report:\n{}\n'.format(classification_report(Y_test, classifier.predict(X_test))))
        print('Confusion Matrix:\n{}\n'.format(confusion_matrix(Y_test, classifier.predict(X_test))))
print_score(rf,X_train,y_train,X_test,y_test,train=False)
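A quick sketch of the forest's built-in importances complements the XGBoost importance plot further below (reusing rf and feature_cols from above):
# Sketch: rank the selected features by the random forest's impurity-based importance
importances = pd.Series(rf.feature_importances_, index=feature_cols).sort_values()
importances.plot(kind='barh', figsize=(8, 6), title='Random Forest Feature Importances')
plt.show()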
def create_graph(forest, feature_names):
    estimator = forest.estimators_[5]
    export_graphviz(estimator, out_file='tree.dot',
                    feature_names=feature_names,
                    class_names=['benign', 'malicious'],
                    rounded=True, proportion=False, precision=2, filled=True)
    # Convert to png using a system command (requires Graphviz)
    from subprocess import call
    call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=200'])
create_graph(rf, list(X))
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
#Converting the dataset into XGBoost's optimized DMatrix data structure
data_dmatrix = xgb.DMatrix(data=X,label=y)
#Instantiating XGBoost regressor object by calling the XGBRegressor() class from the XGBoost library with the
#hyper-parameters passed as arguments
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
max_depth = 5, alpha = 10, n_estimators = 10)
#Fitting the regressor to the training set and making predictions on the test set
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
#In order to build more robust models, let's do a k-fold cross validation where all the entries in the
#original training dataset are used for both training as well as validation
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
'max_depth': 5, 'alpha': 10}
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
# The cross-validated RMSE is lower than the one obtained on the single train/test split above
print((cv_results["test-rmse-mean"]).tail(1))
#Plotting feature importance
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(xg_reg)
plt.show()
plt.rcParams['figure.figsize'] = [20, 10]
xgb.plot_tree(xg_reg, num_trees=0)
plt.show()