31 Aug

## Confusion Matrices

Confusion matrices are commonly used in most classification problems. I used them constantly in a recent fraud detection challenge to see the potential trade-off between my False Positives and False Negatives. The template code I started off with was from Sklearn's documentation page. I have taken this code and modified it to my liking to be a bit more descriptive about the model's performance. Take a look at the original code at the link below or use my modified function for your tasks. Otherwise, take a look at the documentation of the confusion matrix functions below. I plot the results of two models with it.

Matplotlib Color Maps: https://matplotlib.org/tutorials/colors/colormaps.html

### Import Preliminaries¶

In [1]:
# Import modules
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import plotting functions
from matplotlib import rcParams, gridspec

# Import preprocessing functions
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import confusion_matrix

# Import sklearn models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

# Plotting configuration: default figure size and base font size
rcParams.update({
    'figure.figsize': (30.0, 10.0),
    'font.size': 20,
})
# Two-colour hex palette
colors = ['#74a9cf', '#6a51a3']


### Import Data¶

In [2]:
# Import dataset
# NOTE(review): the statement that loads `kdddf` is not present in this cell;
# `kdddf` must already be defined before the code below runs.

# Append dataframe to a list
dfs = [kdddf]

# Lower-case the column names in each dataframe
for df in dfs:
    df.columns = df.columns.str.lower()
    # `Index.contains` was removed in pandas 1.0; `in` is the documented
    # replacement and performs the same exact-label membership test.
    if 'class' in df.columns:
        df.columns = df.columns.str.replace('class', 'target')

# Change target column names
kdddf = kdddf.rename(columns={'normal.': 'target'})


### Data Preprocessing¶

##### Encoding the Data¶
In [3]:
# Binary-encode the target: 'normal.' becomes 0, every other value becomes 1
kdddf['target'] = [0 if label == 'normal.' else 1 for label in kdddf['target']]

# Drop the feature that contains no data
kdddf.drop(columns=['0.13'], inplace=True)

# Replace every object-typed column with its integer category codes
for col in kdddf.select_dtypes(include='object').columns:
    kdddf[col] = kdddf[col].astype('category').cat.codes

##### Sub Sampling and Preprocessing the Data¶
In [4]:
# Select the working dataframe and announce which dataset is in use
wdf = kdddf
working_file_name = 'kdd'.lower()  # synthetics, kdd, credit_card
print(f'Dataset: {working_file_name}')

# Class labels used on the confusion-matrix axes
classes = list(np.unique(wdf['target'].values))

# Separate the target from the features as numpy arrays
y = wdf['target'].values
X = wdf.drop(columns='target').values

# Hold out 25% of the rows as the test set
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25)

# Per-sample weights that balance the classes in the training set
weights = compute_sample_weight(class_weight='balanced', y=train_y)

# Number of PCA components: the feature count divided by 1.5, rounded
components = round(train_x.shape[1] / 1.5)

# Keep untouched copies of the split before any transformation
train_x_norm, test_x_norm = train_x.copy(), test_x.copy()
train_y_norm, test_y_norm = train_y.copy(), test_y.copy()

# Print preprocessing title
print('=' * 10, 'Preprocessing', '=' * 10, '\n')

# PCA is skipped only for the credit_card dataset.
# NOTE(review): PCA is fitted on unscaled features and the scaler is applied
# to the PCA output afterwards - confirm this ordering is intended.
if working_file_name == 'credit_card':
    print('No PCA Used')
else:
    pca = PCA(n_components=components)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    print(f'PCA Components: {components}')

# Standardize using statistics learned from the training set only
scaler = StandardScaler(copy=True).fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)
print('Standard Scaler: Yes')

# Print data shape title
print('\n' + '=' * 10, 'Data Shape', '=' * 10, '\n')

# Row counts for the full data, the training split and the test split
print(
    f'\nDataFrame Shape of X: {len(X)}',
    f'DataFrame Shape of y: {len(y)}',
    f'DataFrame Shape of X: {len(train_x)}',
    f'DataFrame Shape of y: {len(train_y)}',
    f'DataFrame Shape of X: {len(test_x)}',
    f'DataFrame Shape of y: {len(test_y)}',
    f'Target Classes: {classes}',
    sep='\n',
)

# KNN Deviations: build a reduced, optionally sub-sampled split for the
# 'synthetics' dataset only
if working_file_name == 'synthetics':

    print('\n' + '=' * 10, 'KNN Data Shape', '=' * 10, '\n')

    y_reduce = wdf['target'].values
    X_reduce = wdf.drop(columns=['target', 'oldbalanceorg']).values

    # Sub sample the dataframe
    # NOTE(review): `sub_sample_size` and `sub_sampling` are not defined in
    # the visible cells - confirm they exist before enabling this branch.
    if len(X_reduce) > sub_sample_size:
        X_reduce, y_reduce = sub_sampling(X_reduce, y_reduce, sub_sample_size)

    # Conduct a train-test-split on the data
    (train_x_reduce, test_x_reduce,
     train_y_reduce, test_y_reduce) = train_test_split(
        X_reduce, y_reduce, test_size=0.25)

    # Row counts for the reduced data, its training split and its test split
    print(
        f'\nDataFrame Shape of X: {len(X_reduce)}',
        f'DataFrame Shape of y: {len(y_reduce)}',
        f'DataFrame Shape of X: {len(train_x_reduce)}',
        f'DataFrame Shape of y: {len(train_y_reduce)}',
        f'DataFrame Shape of X: {len(test_x_reduce)}',
        f'DataFrame Shape of y: {len(test_y_reduce)}',
        f'Target Classes: {classes}',
        sep='\n',
    )

Dataset: kdd
========== Preprocessing ==========

PCA Components: 27
Standard Scaler: Yes

========== Data Shape ==========

DataFrame Shape of X: 4898430
DataFrame Shape of y: 4898430
DataFrame Shape of X: 3673822
DataFrame Shape of y: 3673822
DataFrame Shape of X: 1224608
DataFrame Shape of y: 1224608
Target Classes: [0, 1]


### Defining Confusion Matrix Function¶

In [7]:
# Plot a confusion matrix function
def plot_confusion_matrix(cm, classes, fontsize=20,
                          normalize=False, title='Confusion matrix',
                          cmap=plt.cm.Blues):
    '''
    Draw a confusion matrix on the current matplotlib axes.

    Every cell is annotated with two values: the share of its row (true
    label) that falls in the cell, shown as a percentage, and the raw count
    underneath it.

    Parameters
    ----------
    cm: 2D array of integer counts, e.g. an sklearn confusion matrix
        (rows are true labels, columns are predicted labels)
    classes: sequence of class names used as tick labels, in matrix order
    fontsize: font size for the title, tick labels, axis labels and cell text
    normalize: if True the cell colours follow the row-normalized matrix,
        otherwise they follow the raw counts
    title: plot title; underscores become spaces and the text is title-cased
    cmap: matplotlib colormap used for the cells

    Returns
    -------
    None

    Examples
    ---------
    >>>>

    plot_confusion_matrix(
        cm,
        classes,
        fontsize=25,
        normalize=True,
        title=model.name.capitalize() + ': Test Set',
        cmap=plt.cm.Greens)

    '''
    cm_num = np.asarray(cm)

    # Row-wise proportions. A row without any samples stays at zero instead
    # of turning into NaN through a 0/0 division.
    row_totals = cm_num.sum(axis=1, keepdims=True)
    cm_per = np.divide(cm_num, row_totals,
                       out=np.zeros(cm_num.shape, dtype=float),
                       where=row_totals != 0)

    # Matrix that drives the cell colours
    shown = cm_per if normalize else cm_num

    plt.imshow(shown, interpolation='nearest', cmap=cmap)
    plt.title(title.replace('_', ' ').title() + '\n', size=fontsize)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size=fontsize)
    plt.yticks(tick_marks, classes, size=fontsize)

    # Cells darker than half of the maximum get white text for contrast
    thresh = shown.max() / 2.
    for i, j in itertools.product(range(shown.shape[0]),
                                  range(shown.shape[1])):
        color = "white" if shown[i, j] > thresh else "black"

        # Plot percentage. The '%' format type multiplies by 100, so the
        # number shown matches the percent sign that follows it.
        plt.text(j, i,
                 f'{cm_per[i, j]:.3%}',
                 fontsize=fontsize,
                 verticalalignment='baseline',
                 horizontalalignment='center',
                 color=color)

        # Plot the raw count; the leading blank lines push it below the
        # percentage in the same cell.
        plt.text(j, i,
                 '\n \n' + format(cm_num[i, j], 'd'),
                 fontsize=fontsize,
                 verticalalignment='center',
                 horizontalalignment='center',
                 color=color)

    plt.ylabel('True Label', size=fontsize)
    plt.xlabel('Predicted Label', size=fontsize)
    # Run the layout pass last so it accounts for the axis labels
    plt.tight_layout()

# Plot a train and test confusion matrix together
def plot_confusion_normal(model, train_x, train_y,
                          test_x, test_y, cmap=plt.cm.Greens,
                          class_names=None):
    '''
    Plot the train-set and test-set confusion matrices of a fitted model
    side by side, using plot_confusion_matrix for each panel.

    Parameters
    ----------
    model: fitted classifier exposing predict(); its optional `name`
        attribute is used in the panel titles, otherwise the class name
    train_x: training features passed to model.predict
    train_y: true training labels
    test_x: test features passed to model.predict
    test_y: true test labels
    cmap: matplotlib colormap used for both panels
    class_names: tick labels for both panels; when None the module-level
        `classes` list is used

    Returns
    -------
    None

    Examples
    ----------
    >>>> plot_confusion_normal(xg_model, train_x, train_y, test_x, test_y)
    >>>> plot_confusion_normal(rf_model, train_x, train_y, test_x, test_y,
                               cmap=plt.cm.Greys)
    '''
    tick_labels = classes if class_names is None else class_names

    # Not every estimator carries a hand-set `name`; fall back to its type
    model_name = getattr(model, 'name', type(model).__name__)

    # Set the plot size (note: this changes the global default and
    # therefore also applies to figures created after this call)
    rcParams['figure.figsize'] = (30.0, 22.5)
    plt.figure()

    # The two panels occupy the first two cells of the top row of a 3x3 grid
    panels = (
        ((0, 0), train_x, train_y, ': Train Set'),
        ((0, 1), test_x, test_y, ': Test Set'),
    )
    for position, features, truth, suffix in panels:
        plt.subplot2grid((3, 3), position)
        cm = confusion_matrix(truth, model.predict(features))
        plot_confusion_matrix(
            cm,
            tick_labels,
            fontsize=25,
            normalize=True,
            title=model_name.capitalize() + suffix,
            cmap=cmap)


### Plot Confusion Matrix¶

##### Dummy Model¶
In [8]:
# Fit a baseline that predicts the constant class 1 for every sample
dm_model = DummyClassifier(strategy='constant', constant=1)
dm_model.fit(train_x, train_y, sample_weight=weights)
dm_model.name = 'dummy_model'

# Plot the train and test confusion matrices in grey tones
plot_confusion_normal(dm_model, train_x, train_y, test_x, test_y,
                      cmap=plt.cm.Greys)


### Logistic Regression¶

In [7]:
# Fit a logistic regression with a fixed seed, using the balanced weights
lr_model = LogisticRegression(random_state=7)
lr_model.fit(train_x, train_y, sample_weight=weights)
lr_model.name = "logistic_regression"

# Plot the train and test confusion matrices with the default colormap
plot_confusion_normal(lr_model, train_x, train_y, test_x, test_y)


Author: Kavi Sekhon