"""
mushroom classification problem
author: Nur Mohammad Bijoy
contact: nurmdbijoy@gmail.com
dataset: https://github.com/nurbijoy/research/blob/master/dataset/mushroom.csv
credits: udacity.com, towardsdatascience.com, kaggle.com, google.com
"""
First of all, let's import the necessary libraries.
# importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
And let's suppress future-deprecation warnings to keep the output clean.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
Now let's define the data path. Note that I'm using Google Colab, so the CSV sits under /content/sample_data/.
# defining the dataset
datapath = '/content/sample_data/mushroom.csv'
df = pd.read_csv(datapath)
OK, the data path is set. Now let's get familiar with the dataset. For this, we'll use a few standard dataframe inspection methods:
df.describe() | summary statistics for every column
df.head() | first few rows of the dataset
df.tail() | last few rows of the dataset
df.shape | number of rows and columns
df.columns | the column names
df.nunique() | count of unique values per column
df.info() | column dtypes and non-null counts
print(df.describe())
print(df.head())
print(df.tail())
print(df.shape)
print(df.columns)
print(df.nunique())
df.info()  # info() prints its report directly and returns None, so no print() needed
OK, we have checked the dataset. It contains 8123 entries, each with 23 columns, and none of the columns contains null values. So far so good. Now let's visualize the class balance.
# how many samples of each class do we have?
sb.countplot(x='Class', data=df)
plt.show()
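We can also read the exact counts numerically; same information as the plot:
# exact count of each class label
print(df['Class'].value_counts())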
So far, so good. But there is a problem: the data is stored as strings, and our models can't work with string data; they need numeric input. So we have to encode the data.
LabelEncoder() | performs label encoding, i.e. it replaces each distinct string category with an integer code.
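As a quick illustration on made-up values (not the real dataset), the encoder assigns each distinct string an integer code in alphabetical order:
# toy example of label encoding; 'e'/'p' are made-up sample values
enc = LabelEncoder()
print(enc.fit_transform(['p', 'e', 'e', 'p']))  # [1 0 0 1]
print(enc.classes_)                             # ['e' 'p']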
# pre-processing the data: label-encode every column
le = LabelEncoder()
for feature in df.columns:
    df[feature] = le.fit_transform(df[feature])
Okay, label encoding is complete. Let's check whether it worked; for that, we look at the dataframe head again.
print(df.head())
The data was encoded successfully: the dataframe now contains only numeric values. Next, let's compute the correlation matrix to see how the features relate to one another. For that, we use
df.corr() | computes the correlation matrix
sb.heatmap(df.corr(), annot=True) | visualizes the correlation matrix
plt.figure(figsize=(20, 15))
corr = df.corr()
sb.heatmap(corr, annot=True)
plt.show()
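As a side check: a constant column has zero variance, so its correlations come out as NaN, which typically renders as a blank row in the heatmap. We can detect such columns programmatically; a small sketch (my addition):
# columns with a single unique value carry no information and yield NaN correlations
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
print(constant_cols)  # expected: ['VeilType']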
Perfect. We can see that the feature VeilType has no correlation with anything at all, so we can simply drop it. For that, we use
df = df.drop(["VeilType"], axis=1)
Now let's check whether it has been dropped. For that, we simply check
'VeilType' in df.columns | returns True if the column exists, otherwise False
print('VeilType' in df.columns)
Cool, it has been dropped. Now let's take a quick look at how each characteristic is distributed across the two classes, using a split violin plot.
df_no_class = df.drop(["Class"], axis=1)
df_div = pd.melt(df, "Class", var_name="Characteristics")
fig, ax = plt.subplots(figsize=(20, 15))
p = sb.violinplot(ax=ax, x="Characteristics", y="value", hue="Class", split=True,
                  data=df_div, inner='quartile', palette='Set1')
p.set_xticklabels(rotation=90, labels=list(df_no_class.columns))
plt.show()
So cool. Now let's move on to the models. First we need to split the data into features and the class label. For that, we use
# Split into features and classes
x = df.loc[:, df.columns != "Class"]
y = df["Class"]
The data has been split into features and class. Now let's split it into training and test sets. Here, test_size=0.3 means a 70:30 split: 70% of the data is used for training and 30% for testing.
# splitting the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
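One optional tweak, not used in the original run: stratify=y keeps the class ratio identical in both splits, and a fixed random_state makes the split reproducible.
# optional variant: stratified, reproducible 70:30 split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42)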
Split successfully. Now let's import the models.
# classification algorithms
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
We'll train all 7 models and check which one classifies best. The 7 models are:
Naive Bayes
K Nearest Neighbour
Random Forest
Support Vector Machine
Logistic Regression
Decision Tree
Neural Network
Now let's create the model objects.
# model object
clfBayes = GaussianNB()
clfKNN = KNeighborsClassifier()
clfForest = RandomForestClassifier()
clfSVM = SVC()
clfLR = LogisticRegression()
clfTree = DecisionTreeClassifier()
clfNeural = MLPClassifier()
To get a feel for how classification works, let's look more closely at the Decision Tree classifier. First, let's fit it on the training data.
# fitting model in decision tree
clfTree = clfTree.fit(x_train,y_train)
The model is trained. Now let's visualize the learned tree. For that, we need the graphviz library.
from sklearn import tree
import graphviz

dot_data = tree.export_graphviz(clfTree, out_file=None,
                                feature_names=x.columns,
                                filled=True, rounded=True,
                                special_characters=True)
graphviz.Source(dot_data)
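If graphviz isn't available, sklearn ships a matplotlib-based renderer that gives a similar picture; an alternative sketch, not part of the original workflow:
# matplotlib-only alternative to graphviz
plt.figure(figsize=(20, 10))
tree.plot_tree(clfTree, feature_names=list(x.columns), filled=True, rounded=True)
plt.show()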
Perfect. Let's check the importance of the features.
features_list = x.columns.values
feature_importance = clfTree.feature_importances_
sorted_idx = np.argsort(feature_importance)
plt.figure(figsize=(20, 7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.show()
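The same ranking can also be printed numerically, which makes it easier to build the drop list below; a small sketch (my addition):
# print features sorted by importance, highest first
for name, importance in sorted(zip(features_list, feature_importance),
                               key=lambda pair: -pair[1]):
    print(f'{name}: {importance:.4f}')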
For the Decision Tree classifier, CapColor, Odor, GillAttachment, StalkRoot, GillColor, StalkSurfaceAboveRing, StalkColorAboveRing, StalkColorBelowRing, RingType and CapShape are not important. We can drop these features.
drop_list = ['CapColor', 'Odor', 'GillAttachment', 'StalkRoot', 'GillColor', 'StalkSurfaceAboveRing', 'StalkColorAboveRing', 'StalkColorBelowRing', 'RingType', 'CapShape']
for drop_feature in drop_list:
    df = df.drop([drop_feature], axis=1)
Perfect. Now let's check the remaining columns.
print(df.columns)
print(df.columns.size)
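One thing to watch: x, x_train and x_test were built before this drop, so as written the models below would still be trained with the dropped columns included. To actually train on the reduced feature set, rebuild the features and re-split, e.g.:
# rebuild features/target from the reduced dataframe and re-split
x = df.loc[:, df.columns != "Class"]
y = df["Class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)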
The unnecessary features have been removed. Now let's train all 7 models. To compare them, we'll collect each model's accuracy score in a list.
# accuracy scores
models = []
# fitting model in bayes
clfBayes.fit(x_train,y_train)
pred = clfBayes.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in knn
clfKNN.fit(x_train,y_train)
pred = clfKNN.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in forest
clfForest.fit(x_train,y_train)
pred = clfForest.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in svm
clfSVM.fit(x_train,y_train)
pred = clfSVM.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in Logistic regression
clfLR.fit(x_train,y_train)
pred = clfLR.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in decision tree
clfTree.fit(x_train,y_train)
pred = clfTree.predict(x_test)
models.append(accuracy_score(y_test,pred))
# fitting model in neural network
clfNeural.fit(x_train,y_train)
pred = clfNeural.predict(x_test)
models.append(accuracy_score(y_test,pred))
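As a side note, the seven near-identical blocks above could be collapsed into a loop; a compact equivalent sketch:
# equivalent loop form of the seven fit/predict blocks above
classifiers = [clfBayes, clfKNN, clfForest, clfSVM, clfLR, clfTree, clfNeural]
models = [accuracy_score(y_test, clf.fit(x_train, y_train).predict(x_test))
          for clf in classifiers]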
Now it's time to print the accuracies.
# printing accuracy
print('Bayes accuracy = ', models[0])
print('KNN accuracy = ', models[1])
print('Random Forest accuracy = ', models[2])
print('SVM accuracy = ', models[3])
print('Logistic Regression accuracy = ', models[4])
print('Decision Tree accuracy = ', models[5])
print('Neural Network accuracy = ', models[6])
Now let's compare the accuracies visually.
names = ['Bayes', 'KNN', 'RF', 'SVM', 'LR', 'DT', 'NN']
plt.figure(figsize=(20,5))
plt.bar(names, models, color=['black', 'red', 'green', 'blue', 'cyan', 'yellow', 'purple'])
plt.show()
Almost done. A few things remain: we still need to measure model performance. For that we'll check the confusion matrix and the ROC curve with its AUC. Let's start with the confusion matrix.
# confusion matrix for the neural network classifier
y_pred = clfNeural.predict(x_test)
conf = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 6))
sb.heatmap(conf, annot=True, linewidths=.5, cbar=None)
plt.title('Neural Network Classifier confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
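For a 2x2 matrix we can also unpack the counts directly. Assuming LabelEncoder mapped the classes alphabetically (edible -> 0, poisonous -> 1), the false negatives are the dangerous mistakes: poisonous mushrooms predicted as edible.
# unpack counts; rows are true labels, columns are predicted labels
tn, fp, fn, tp = conf.ravel()
print('poisonous predicted as edible (false negatives):', fn)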
ROC Curve
from sklearn import metrics
# note: y_pred holds hard 0/1 labels, so this curve has only one operating point
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 8))
plt.title('Receiver Operating Characteristics')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, color='purple')
plt.show()
AUC (area under the ROC curve)
plt.figure(figsize=(8, 8))
plt.title('Area Under Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
auc = metrics.roc_auc_score(y_test, y_pred)
plt.plot(fpr, tpr, label="auc = " + str(auc), color='darkorange')
plt.legend()
plt.show()
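Because predict() returns hard 0/1 labels, the curve above has only one intermediate point. Feeding probability scores instead sweeps every threshold and traces the full curve; a sketch (MLPClassifier does expose predict_proba):
# use the probability of the positive class instead of hard labels
y_score = clfNeural.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
print('AUC from probabilities =', metrics.roc_auc_score(y_test, y_score))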
Done.