import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Dataset: https://www.kaggle.com/uciml/mushroom-classification
Attribute information: classes are edible (e) and poisonous (p).
df = pd.read_csv('mushrooms.csv')
df.head()
df.info()
There are no missing values, so that's good.
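Note that df.info() only reports NaN values; a quick sketch to make the check explicit (the '?' check is just a precaution in case the dataset uses a placeholder category instead of NaN):
# count missing (NaN) values per column; every entry should be 0
df.isnull().sum()
# optional: check whether any column uses a '?' placeholder instead of NaN
(df == '?').sum()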
df['class'].value_counts()
df['class'].value_counts() / df['class'].count()
We can see that the edible and poisonous mushrooms are fairly evenly distributed, so there is no need to balance the classes further.
Now let's explore each feature a bit. Let's see how the values of each feature are distributed, split by the mushroom class (edible, poisonous).
features = df.columns[1:]
print(features)
plt.style.use('lenk_style') # my custom style sheet
fig, ax = plt.subplots(len(features), 1, figsize=(10,120), sharey=True)
plt.subplots_adjust(hspace=.25)
for i in range(len(features)):
    s = sns.countplot(x=features[i], data=df, hue='class', ax=ax[i])
    ax[i].set_xlabel(features[i])
    ax[i].set_ylabel('count')
    ax[i].legend(loc=1)
    for p in s.patches:  # for each bar on a chart
        s.annotate(format(p.get_height(), '.0f'),
                   # bar centre: column's left edge + half the column width
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center',
                   va='center',
                   xytext=(0, 9), textcoords='offset points')
We can see that some of the features alone are pretty good predictors of whether a mushroom is edible or poisonous. For example, poisonous mushrooms have foul, fishy and spicy odours.
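To put a number on that, a crosstab of a single feature against the class is a quick check; a sketch using odor (the letter codes follow the dataset's own encoding):
# rows: odor codes (e.g. f = foul, n = none), columns: class labels (e = edible, p = poisonous)
pd.crosstab(df['odor'], df['class'])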
Now let's prepare the data for the models.
First, let's prepare the x (features) and y (target) variables. For the target, I'm going to map 'p' and 'e' to 1 and 0 right away.
# Creating independent and dependent variables
x = df.iloc[:,1:].values
y = df.iloc[:,0].map({'p':1, 'e':0}).values
x
y
All the features need one-hot encoding since, at the moment, they are coded as letters.
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
x = onehotencoder.fit_transform(x).toarray()
Let's check the result.
print(len(x[0]))
x[0]
As a result of the one-hot encoding, each mushroom is now described by 117 binary features, which is a lot. We'll deal with that shortly.
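The 117 is simply the total number of distinct values across the categorical features, since the encoder creates one column per observed category; a quick sketch to verify:
# one dummy column per distinct value of each feature
print(df[features].nunique())
print(df[features].nunique().sum())  # should match len(x[0])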
Let's proceed with splitting the data into test and training sets.
from sklearn.model_selection import train_test_split
random_state=42 # creating a variable which I can later use in the models as well
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random_state)
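A quick sanity check of the split (sketch); note that the split is not stratified, but with classes this balanced that should not matter much:
# roughly 80% / 20% of the rows should land in the training / test sets
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)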
Now let's apply principal component analysis to reduce the dimensionality of the data (currently 117). I'll go with having 2 components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train[0]
x_test[0]
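With only 2 components it's worth checking how much of the variance they actually retain; a quick sketch (the exact numbers depend on the data and the split):
# fraction of the training-set variance captured by each principal component
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())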
Excellent, now we are ready to test out the performance of all the different models.
train = pd.DataFrame(x_train, columns=['PC1', 'PC2'])
train['p'] = y_train
plt.subplots(figsize=(7,7))
plt.scatter(x=train.loc[train['p']==0, 'PC1'], y=train.loc[train['p']==0, 'PC2'], alpha=.5, label='edible')
plt.scatter(x=train.loc[train['p']==1, 'PC1'], y=train.loc[train['p']==1, 'PC2'], alpha=.5, label='poisonous')
plt.xlabel('PC1')
plt.ylabel('PC2')
# plt.legend()
plt.savefig('mushrooms.png', bbox_inches='tight', dpi=400)
plt.show()
For the model metrics (confusion matrix and accuracy), I'll write a function which I can reuse. I will also store each model's accuracy in a dictionary.
from sklearn.metrics import confusion_matrix, accuracy_score
acscore = {}
def model_metrics(input_classifier, model_name):
    y_pred = input_classifier.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    ac = accuracy_score(y_test, y_pred)
    acscore[model_name] = ac
    return cm, ac
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=random_state)
classifier.fit(x_train, y_train)
model_metrics(classifier, 'LogisticRegression')
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(x_train, y_train)
model_metrics(classifier, 'KNeighborsClassifier')
# Training the RBF Kernel SVC on the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=random_state)
classifier.fit(x_train, y_train)
model_metrics(classifier, 'KernelSVM')
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)
model_metrics(classifier, 'NaiveBayes')
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=random_state)
classifier.fit(x_train, y_train)
model_metrics(classifier, 'DecisionTreeClassifier')
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=random_state)
classifier.fit(x_train, y_train)
model_metrics(classifier, 'RandomForestClassifier')
# sorting the dictionary by value
acscore = dict(sorted(acscore.items(), key=lambda item: item[1]))
acscore
It looks like the Random Forest performed the best, but the Decision Tree and K-Nearest Neighbors are not far behind. Logistic Regression, the only linear model here, performed the worst.
plt.barh(list(acscore.keys()), list(acscore.values()), align='center')
for i, v in enumerate(acscore.values()):
    plt.text(x=v, y=i, s=str(round(v*100, 2))+'%', va='center', ha='left')
plt.title('Classifier Models Accuracy')
plt.show()
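As an optional sanity check (not part of the notebook above), cross-validation on the PCA-reduced training data can confirm that the ranking isn't an artifact of this single train/test split; a minimal sketch for two of the models (strictly, the PCA step would ideally sit inside a Pipeline so it is refit per fold):
from sklearn.model_selection import cross_val_score

# 5-fold CV accuracy on the PCA-reduced training data; the numbers will differ
# slightly from the single-split accuracies reported above
for name, clf in [('LogisticRegression', LogisticRegression(random_state=random_state)),
                  ('RandomForestClassifier', RandomForestClassifier(n_estimators=100,
                                                                    criterion='entropy',
                                                                    random_state=random_state))]:
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
    print(f'{name}: {scores.mean():.4f} +/- {scores.std():.4f}')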