DBSCAN
DBSCAN stands for density-based spatial clustering of applications with noise. The algorithm selects a random point in the feature space, and if that point is connected to more than n points within a defined distance, the algorithm marks it as a core sample. If the point is not a core sample, it is marked as noise. The algorithm then chooses another point. Once all the points are marked as core samples or noise, the algorithm stops.
Import Preliminaries¶
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Import modules
import warnings

import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Set pandas display options.
# Fully-qualified option names are used on purpose: the short patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 30)
# Render floats with three decimal places in DataFrame output
pd.set_option('display.float_format', '{:.3f}'.format)
# Plotting defaults: wide, short figures for the bar charts below
mpl.rcParams.update({'figure.figsize': (9.0, 3.0)})
# Warning handling: install a blanket "ignore" filter for every warning
warnings.simplefilter('ignore')
Import Data¶
# Load the wine dataset and unpack the feature matrix and class labels
wine = load_wine()
X = wine.data
y = wine.target
# Split into train and test partitions (scikit-learn's default proportions)
train_x, test_x, train_y, test_y = train_test_split(X, y)
# Tabular view of the full dataset, with the class label as the last column
wdf = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)
wdf.head()
Preprocessing¶
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits, so no test statistics leak into training.
scaler = StandardScaler()
train_x = scaler.fit(train_x).transform(train_x)
test_x = scaler.transform(test_x)
# Rebuild the preview frame from the scaled training data plus its labels
wdf = pd.DataFrame(train_x, columns=wine.feature_names).assign(target=train_y)
wdf.head()
Ground Truth¶
# Target classes of the training split
train_y
# Plot how many training samples belong to each target class
target_counts = pd.Series(train_y).value_counts()
# The bar-plot keyword is `color` (singular); `colors` is not accepted by
# the underlying matplotlib bar artists and raises on current versions.
target_counts.plot.barh(color=['#f2f0f7', '#cbc9e2', '#9e9ac8', '#6a51a3'])
plt.title('Target Counts')
plt.xlabel('Count')
plt.ylabel('Target');
Fit the Model¶
# Fit the model
# The estimator has to be created before it can be fitted; it is built with
# scikit-learn's default hyperparameters here and re-created with explicit
# settings in the sensitivity sections further down.
dbscan = DBSCAN()
dbscan.fit(train_x)
DBSCAN has no predict method. There are no centroids in the DBSCAN algorithm, but we can use the fit_predict method on a piece of the data.
Clusters¶
# View training predictions (one cluster label per training sample)
pred_train_y = dbscan.fit_predict(train_x)
pred_train_y
# Plot how many training samples were assigned to each cluster label
target_counts = pd.Series(pred_train_y).value_counts()
# The bar-plot keyword is `color` (singular); `colors` is not accepted by
# the underlying matplotlib bar artists and raises on current versions.
target_counts.plot.barh(
    color=['#f2f0f7', '#cbc9e2', '#9e9ac8', '#6a51a3'],
    edgecolor='white',
    hatch='.',
)
plt.title('Cluster Counts')
plt.xlabel('Count')
plt.ylabel('Cluster');
EPS Sensitivity¶
# Define the parameter range and the list that collects the results.
# The leading 0 is dropped because eps must be strictly positive.
eps_range = np.arange(0, 5, 0.1)[1:]
clusters = []
# Fit one DBSCAN model per eps value and record how many clusters it finds
for eps in eps_range:
    dbscan = DBSCAN(eps=eps)
    # DBSCAN is unsupervised, so no labels are passed, and a single
    # fit_predict call both fits the model and returns the assignments.
    labels = dbscan.fit_predict(train_x)
    # Label -1 marks noise samples rather than a cluster, so leave it out
    clusters.append(np.unique(labels[labels != -1]).size)
# Plot our results
mpl.rcParams['figure.figsize'] = (9.0, 6.0)
plt.plot(eps_range, clusters, label="Train", linewidth=2, color='#6A51A3')
plt.legend()
plt.title('DBSCAN EPS Sensitivity')
plt.xlabel('EPS')
plt.ylabel('Clusters');
Min Samples Sensitivity¶
# Define the parameter range and the list that collects the results.
# min_samples is a neighbour count, so it has to be a positive integer;
# the sweep therefore uses whole numbers instead of fractional steps.
min_samples_range = np.arange(1, 21)
clusters = []
# Fit one DBSCAN model per min_samples value and record its cluster count.
# A new estimator is built on every iteration so that the loop value is
# actually applied instead of re-evaluating a previously fitted model.
for min_samples in min_samples_range:
    dbscan = DBSCAN(min_samples=int(min_samples))
    labels = dbscan.fit_predict(train_x)
    # Label -1 marks noise samples rather than a cluster, so leave it out
    clusters.append(np.unique(labels[labels != -1]).size)
# Plot our results
mpl.rcParams['figure.figsize'] = (9.0, 6.0)
plt.plot(min_samples_range, clusters, label="Train", linewidth=2, color='#6A51A3')
plt.legend()
plt.title('DBSCAN Min Samples Sensitivity')
plt.xlabel('Min Samples')
plt.ylabel('Clusters');
General Notes¶
- Points within dense regions of the data are core samples
- Points within sparse regions of the data are boundary samples
- Make sure you scale the data during preprocessing so that all features are in the same range
Author: Kavi Sekhon