"""
    mushroom classification problem
    author: Nur Mohammad Bijoy
    contact: nurmdbijoy@gmail.com
    dataset: https://github.com/nurbijoy/research/blob/master/dataset/mushroom.csv
    credits: udacity.com, towardsdatascience.com, kaggle.com, google.com
"""

First of all, let's import the important libraries.

In [0]:
# importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

And let's ignore warnings to keep the output clean.

In [0]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

Now let's define the data path. Note that I'm using Google Colab.

In [0]:
# loading the dataset
datapath = '/content/sample_data/mushroom.csv'
df = pd.read_csv(datapath)

OK, the data path is set. Now let's get familiar with the dataset. For this, we'll check DataFrame information such as:

df.describe() | summary statistics for the numeric columns
df.head() | first rows of the dataset
df.tail() | last rows of the dataset
df.shape | dataset shape (rows, columns)
df.columns | dataset columns
df.nunique() | count of unique values per column
df.info() | column dtypes and non-null counts
In [0]:
print(df.describe())
        RingNumber
count  8124.000000
mean      1.069424
std       0.271064
min       0.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       2.000000
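
Note that df.describe() summarizes only the numeric columns, and RingNumber is the only numeric one here. To summarize the categorical columns as well, pandas supports an include argument; a quick variant:

In [0]:
print(df.describe(include='all'))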
In [0]:
print(df.head())
  CapShape CapSurface CapColor  ...  Population  Habitat      Class
0   convex     smooth    brown  ...   scattered    urban  poisonous
1   convex     smooth   yellow  ...    numerous  grasses     edible
2     bell     smooth    white  ...    numerous  meadows     edible
3   convex      scaly    white  ...   scattered    urban  poisonous
4   convex     smooth     gray  ...    abundant  grasses     edible

[5 rows x 23 columns]
In [0]:
print(df.tail())
     CapShape CapSurface CapColor  ...  Population Habitat      Class
8119  knobbed     smooth    brown  ...   clustered  leaves     edible
8120   convex     smooth    brown  ...     several  leaves     edible
8121     flat     smooth    brown  ...   clustered  leaves     edible
8122  knobbed      scaly    brown  ...     several  leaves  poisonous
8123   convex     smooth    brown  ...   clustered  leaves     edible

[5 rows x 23 columns]
In [0]:
print(df.shape)
(8124, 23)
In [0]:
print(df.columns)
Index(['CapShape', 'CapSurface', 'CapColor', 'Bruises', 'Odor',
       'GillAttachment', 'GillSpacing', 'GillSize', 'GillColor', 'StalkShape',
       'StalkRoot', 'StalkSurfaceAboveRing', 'StalkSurfaceBelowRing',
       'StalkColorAboveRing', 'StalkColorBelowRing', 'VeilType', 'VeilColor',
       'RingNumber', 'RingType', 'SporePrintColor', 'Population', 'Habitat',
       'Class'],
      dtype='object')
In [0]:
print(df.nunique())
CapShape                  6
CapSurface                4
CapColor                 10
Bruises                   2
Odor                      9
GillAttachment            2
GillSpacing               2
GillSize                  2
GillColor                12
StalkShape                2
StalkRoot                 5
StalkSurfaceAboveRing     4
StalkSurfaceBelowRing     4
StalkColorAboveRing       9
StalkColorBelowRing       9
VeilType                  1
VeilColor                 4
RingNumber                3
RingType                  5
SporePrintColor           9
Population                6
Habitat                   7
Class                     2
dtype: int64
In [0]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   CapShape               8124 non-null   object
 1   CapSurface             8124 non-null   object
 2   CapColor               8124 non-null   object
 3   Bruises                8124 non-null   bool  
 4   Odor                   8124 non-null   object
 5   GillAttachment         8124 non-null   object
 6   GillSpacing            8124 non-null   object
 7   GillSize               8124 non-null   object
 8   GillColor              8124 non-null   object
 9   StalkShape             8124 non-null   object
 10  StalkRoot              8124 non-null   object
 11  StalkSurfaceAboveRing  8124 non-null   object
 12  StalkSurfaceBelowRing  8124 non-null   object
 13  StalkColorAboveRing    8124 non-null   object
 14  StalkColorBelowRing    8124 non-null   object
 15  VeilType               8124 non-null   object
 16  VeilColor              8124 non-null   object
 17  RingNumber             8124 non-null   int64 
 18  RingType               8124 non-null   object
 19  SporePrintColor        8124 non-null   object
 20  Population             8124 non-null   object
 21  Habitat                8124 non-null   object
 22  Class                  8124 non-null   object
dtypes: bool(1), int64(1), object(21)
memory usage: 1.4+ MB
None

OK, we have examined the dataset: it contains 8124 entries, each with 23 columns, and none of the columns contains null values. We can double-check that directly:
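
In [0]:
# quick sanity check: total count of missing values in the whole frame
print(df.isnull().sum().sum())
0

Now let's visualize the data.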

In [0]:
sb.countplot(x='Class', data=df)
Out[0]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd61c902eb8>
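
The plot suggests the two classes are roughly balanced. For the exact counts we can use value_counts; a quick check:

In [0]:
print(df['Class'].value_counts())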

There's a problem, though: the data is in string form, and our models can't work with strings. We need numeric data, so we have to encode it.

LabelEncoder() | performs label encoding.

That means it replaces each string category with an integer code.
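
For example, here is a toy illustration (the cap-shape values are just for demonstration, separate from the pipeline below):

In [0]:
le_demo = LabelEncoder()
print(le_demo.fit_transform(['convex', 'bell', 'convex', 'flat']))  # [1 0 1 2]
print(le_demo.classes_)  # classes are sorted: ['bell' 'convex' 'flat']

Now let's encode every column of the DataFrame: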

In [0]:
# pre-processing the data: label-encode every column
# note: the single encoder is re-fitted for each column in turn
le = LabelEncoder()
for feature in df.columns:
    df[feature] = le.fit_transform(df[feature])

OK, label encoding is complete. Let's check whether it worked. For that, we look at the DataFrame head.

In [0]:
print(df.head())
   CapShape  CapSurface  CapColor  ...  Population  Habitat  Class
0         2           3         0  ...           3        4      1
1         2           3         9  ...           2        0      0
2         0           3         8  ...           2        2      0
3         2           2         8  ...           3        4      1
4         2           3         3  ...           0        0      0

[5 rows x 23 columns]

The data was encoded successfully: it now contains only numeric values. Next, let's compute the correlation matrix to find the correlations among the features. For that, we use

df.corr() | computes the correlation matrix
sb.heatmap(df.corr(), annot=True) | visualizes the correlation matrix
In [0]:
plt.figure(figsize=(20, 15))
corr = df.corr()
sb.heatmap(corr, annot = True)
plt.show()

Perfect. We can see that the feature VeilType shows no correlation with anything: it has only one unique value (see df.nunique() above), so its encoded column is constant and carries no information. We can simply drop the feature. For that, we use

df = df.drop(["VeilType"],axis=1)
In [0]:
df = df.drop(["VeilType"],axis=1)

Now let's check whether it was dropped. For that, we simply check:

'VeilType' in df.columns | returns True if the column exists, otherwise False
In [0]:
print('VeilType' in df.columns)
False

Cool, it was dropped. Now let's have a quick look at the characteristics of the data. For that, we use

sb.violinplot(ax = ax, x="Characteristics", y="value", hue="Class", split = True, data=df_div, inner = 'quartile', palette = 'Set1')
df_no_class = df.drop(["Class"],axis = 1)
In [0]:
df_div = pd.melt(df, "Class", var_name="Characteristics")
fig, ax = plt.subplots(figsize=(20,15))
p = sb.violinplot(ax = ax, x="Characteristics", y="value", hue="Class", split = True, data=df_div, inner = 'quartile', palette = 'Set1')
df_no_class = df.drop(["Class"],axis = 1)
p.set_xticklabels(rotation = 90, labels = list(df_no_class.columns));
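
For reference, pd.melt reshapes the wide table into long format, one (Class, Characteristics, value) row per original cell; that is what lets a single violinplot call cover every feature. A quick peek (sketch):

In [0]:
print(df_div.head())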

So cool. Now let's work with the models. First, we need to split the data into features and class labels. For that, we use

x = df.loc[:, df.columns != "Class"]
y = df["Class"]
In [0]:
# Split into features and classes
x = df.loc[:, df.columns != "Class"]
y = df["Class"]

The data was successfully split into features and class labels. Now let's split it into train and test sets. For that, we use

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

Here, test_size=0.3 means we split the whole dataset in a 70:30 ratio, so 70% of the data will be used for training and 30% for testing.

In [0]:
# splitting data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)
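
A side note: train_test_split shuffles randomly, so results can vary slightly between runs. Passing random_state (and optionally stratify=y, to preserve the class ratio) makes the split reproducible. A minimal variant, with an arbitrary seed of my choosing:

In [0]:
# reproducible, class-stratified variant (the random_state value is arbitrary)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)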

Split successfully. Now let's import the models.

In [0]:
# importing the classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

We'll fit 7 models to the data and check which one classifies best. The 7 models are:

Naive Bayes
K Nearest Neighbour
Random Forest
Support Vector Machine
Logistic Regression
Decision Tree
Neural Network

Now let's create object of the models.

In [0]:
# model objects
clfBayes = GaussianNB()
clfKNN = KNeighborsClassifier()
clfForest = RandomForestClassifier()
clfSVM = SVC()
clfLR = LogisticRegression()
clfTree = DecisionTreeClassifier()
clfNeural = MLPClassifier()

To get a feel for how classification works, let's visualize the Decision Tree classifier. First, let's fit the model.

In [0]:
# fitting model in decision tree
clfTree = clfTree.fit(x_train,y_train)

The model is trained. Now let's visualize how it makes decisions. For that, we need the graphviz library. Let's import it.

In [0]:
from sklearn import tree
import graphviz
dot_data = tree.export_graphviz(clfTree, out_file=None,
                         feature_names=x.columns,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graphviz.Source(dot_data)
Out[0]:
(Rendered decision tree: the root splits on SporePrintColor ≤ 2.5 (gini = 0.499, 5686 samples), the left branch then splits on GillSize and the right on RingNumber, with deeper splits on Population, StalkShape, CapShape, CapSurface, Habitat, VeilColor, GillSpacing, SporePrintColor and StalkSurfaceBelowRing down to pure leaves.)

Perfect. Let's check the importance of the features.

In [0]:
features_list = x.columns.values
feature_importance = clfTree.feature_importances_
sorted_idx = np.argsort(feature_importance)


plt.figure(figsize=(20,7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.draw()
plt.show()
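
To read off exact numbers rather than eyeballing the bars, we can also print the importances in sorted order (a small sketch using the arrays computed above):

In [0]:
# list features from most to least important
for name, imp in sorted(zip(features_list, feature_importance),
                        key=lambda t: -t[1]):
    print(f'{name}: {imp:.3f}')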

Looking at the plot, for the Decision Tree classifier the features CapColor, Odor, GillAttachment, StalkRoot, GillColor, StalkSurfaceAboveRing, StalkColorAboveRing, StalkColorBelowRing, RingType and CapShape have little or no importance. We can drop these features.

In [0]:
drop_list = ['CapColor', 'Odor', 'GillAttachment', 'StalkRoot', 'GillColor', 'StalkSurfaceAboveRing', 'StalkColorAboveRing', 'StalkColorBelowRing', 'RingType', 'CapShape']
df = df.drop(drop_list, axis=1)
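
One caveat: x, y, x_train and x_test were created before this drop, so they still contain the dropped columns. To make sure the models below actually train on the reduced feature set, we rebuild them (a minimal fix):

In [0]:
# rebuild features/labels from the reduced frame and re-split
x = df.loc[:, df.columns != "Class"]
y = df["Class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)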

Perfect. Now let's check the feature list.

In [0]:
print(df.columns)
print(df.columns.size)
Index(['CapSurface', 'Bruises', 'GillSpacing', 'GillSize', 'StalkShape',
       'StalkSurfaceBelowRing', 'VeilColor', 'RingNumber', 'SporePrintColor',
       'Population', 'Habitat', 'Class'],
      dtype='object')
12

The unnecessary features have been removed. Now let's train all 7 models. To evaluate them, we'll collect each model's accuracy score in a list.

In [0]:
# accuracy scores, one per model
models = []

# fitting model in bayes
clfBayes.fit(x_train,y_train)
pred = clfBayes.predict(x_test)
models.append(accuracy_score(y_test,pred))

# fitting model in knn
clfKNN.fit(x_train,y_train)
pred = clfKNN.predict(x_test)
models.append(accuracy_score(y_test,pred))

# fitting model in forest
clfForest.fit(x_train,y_train)
pred = clfForest.predict(x_test)
models.append(accuracy_score(y_test,pred))

# fitting model in SVM
clfSVM.fit(x_train,y_train)
pred = clfSVM.predict(x_test)
models.append(accuracy_score(y_test,pred))

# fitting model in Logistic regression
clfLR.fit(x_train,y_train)
pred = clfLR.predict(x_test)
models.append(accuracy_score(y_test,pred))


# fitting model in decision tree
clfTree.fit(x_train,y_train)
pred = clfTree.predict(x_test)
models.append(accuracy_score(y_test,pred))

# fitting model in neural network
clfNeural.fit(x_train,y_train)
pred = clfNeural.predict(x_test)
models.append(accuracy_score(y_test,pred))
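
As an aside, the seven near-identical blocks above can be collapsed into a loop; an equivalent sketch:

In [0]:
# fit each classifier in turn and record its test accuracy
classifiers = [clfBayes, clfKNN, clfForest, clfSVM, clfLR, clfTree, clfNeural]
models = [accuracy_score(y_test, clf.fit(x_train, y_train).predict(x_test))
          for clf in classifiers]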

Now it's time to print the accuracies.

In [0]:
# printing accuracy
print('Bayes accuracy = ', models[0])
print('KNN accuracy = ', models[1])
print('Random Forest accuracy = ', models[2])
print('SVM accuracy = ', models[3])
print('Logistic Regression accuracy = ', models[4])
print('Decision Tree accuracy = ', models[5])
print('Neural Network accuracy = ', models[6])
Bayes accuracy =  0.8687448728465955
KNN accuracy =  0.9979491386382281
Random Forest accuracy =  1.0
SVM accuracy =  1.0
Logistic Regression accuracy =  0.9384741591468416
Decision Tree accuracy =  1.0
Neural Network accuracy =  1.0

Now let's compare the accuracies visually.

In [0]:
names = ['Bayes', 'KNN', 'RF', 'SVM', 'LR', 'DT', 'NN']
plt.figure(figsize=(20,5))
plt.bar(names, models, color=['black', 'red', 'green', 'blue', 'cyan', 'yellow', 'purple'])
plt.show()

Almost done. A few things remain: we'll measure model performance beyond plain accuracy by checking the confusion matrix and the ROC and AUC curves. Let's start with the confusion matrix.

In [0]:
y_pred = clfNeural.predict(x_test)
conf = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 6))
sb.heatmap(conf, annot=True, linewidths=.5, cbar=None)
plt.title('Neural Network Classifier confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label');
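
For a poisonous-mushroom problem, false negatives (poisonous predicted as edible) are the dangerous cell, so precision and recall are worth checking alongside accuracy. A quick sketch with sklearn's classification_report (recall that label encoding mapped edible to 0 and poisonous to 1):

In [0]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['edible', 'poisonous']))
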
ROC Curve

In [0]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)

plt.figure(figsize=(8,8))
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr,tpr, color='purple')
plt.show()

AUC Curve

In [0]:
plt.figure(figsize=(8,8))
plt.title('Area Under Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

auc = metrics.roc_auc_score(y_test, y_pred)
plt.plot(fpr,tpr,label="auc = "+str(auc), color='darkorange')
plt.legend()
plt.show()
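
Note that both curves above were built from hard 0/1 predictions, which yields only a single intermediate point. MLPClassifier also exposes predict_proba, so a smoother curve and a finer-grained AUC can be computed from scores; a minimal variant:

In [0]:
# ROC/AUC from predicted probabilities instead of hard labels
y_score = clfNeural.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
print(metrics.roc_auc_score(y_test, y_score))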

Done.