Grid Search
Import Preliminaries¶
# Import modules
import warnings

import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# cross_val_score lives in sklearn.model_selection, the same module that
# provides GridSearchCV above. The legacy sklearn.cross_validation location is
# kept only as a fallback for old scikit-learn releases that still ship it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
# Set pandas options.
# Use fully-qualified option keys: pandas resolves option names as patterns,
# and the bare 'max_columns' / 'max_rows' patterns can match more than one
# option (raising OptionError) on pandas versions that define other options
# with the same suffix.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 30)
# Render floats with three decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Default figure size used by every plot that does not override it
mpl.rcParams.update({'figure.figsize': (8.0, 7.0)})
# Suppress every warning for the remainder of the session
warnings.filterwarnings(action='ignore')
def grid_search_groupby(results: pd.DataFrame, param_1: str, param_2: str) -> pd.DataFrame:
    '''
    Aggregate grid search results into a table of mean test scores for two
    hyperparameters. The table is used to plot heatmaps of the grid search.

    The ``params`` dicts in ``results`` are expanded into one column per
    hyperparameter, joined with ``mean_test_score``, averaged over every
    other hyperparameter and pivoted so that ``param_1`` values form the
    rows and ``param_2`` values form the columns.

    Parameters
    ----------
    results: DataFrame of grid search results. It must contain a ``params``
        column holding one dict per row and a ``mean_test_score`` column.
    param_1: Name of the hyperparameter whose values become the row index.
    param_2: Name of the hyperparameter whose values become the columns.

    Returns
    -------
    DataFrame with one row per ``param_1`` value and one column per
    ``param_2`` value; each cell is the mean of ``mean_test_score`` for
    that pair.

    Raises
    ------
    AssertionError: if ``results`` is not a DataFrame or either parameter
        name is not a string.

    Examples
    ----------
    >>> grid_search_groupby(results, 'max_depth', 'n_estimators')  # doctest: +SKIP
    >>> grid_search_groupby(results, 'max_leaf_nodes', 'n_estimators')  # doctest: +SKIP
    '''
    assert isinstance(results, pd.DataFrame), 'results should be a pandas.core.frame.DataFrame'
    assert isinstance(param_1, str), 'param_1 should be a string'
    assert isinstance(param_2, str), 'param_2 should be a string'

    # Reuse the index of ``results``: concat(axis=1) aligns on index labels,
    # so a fresh RangeIndex here would misalign scores and parameters whenever
    # ``results`` has been filtered, sorted or otherwise re-indexed.
    params_df = pd.DataFrame(list(results['params']), index=results.index)
    scores_and_params = pd.concat([results['mean_test_score'], params_df], axis=1)

    # Average over all remaining hyperparameters, then pivot param_2 to columns
    mean_scores = scores_and_params.groupby([param_1, param_2])['mean_test_score'].mean()
    return mean_scores.unstack()
Import Data¶
# Load the iris data set and split it into features and target
iris = load_iris()
X = iris.data
y = iris.target

# Random forest with default hyperparameters; the search below tunes it
rf_model = RandomForestClassifier()

# Candidate values for each hyperparameter
grid = {
    'n_estimators': np.arange(25, 100, 25),
    'max_depth': [1, 3, 5, 7, 9],
    'max_leaf_nodes': [5, 10, 25, 50, 75, 100],
}

# Exhaustive 10-fold search over the grid, scored by accuracy
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)

# Run the search
grid_search.fit(X, y);

# Report the best score and the parameters that produced it
print('Accuracy of best parameters: %.5f' % grid_search.best_score_)
print('Best parameters: %s' % grid_search.best_params_)
Baseline Cross Validation Score¶
# Print the baseline score of our model with default parameters
# (10-fold cross-validated accuracy of the untuned rf_model)
scores = cross_val_score(rf_model, X, y, cv=10, scoring='accuracy', n_jobs=1)
print('Baseline with default parameters: %.3f' % np.mean(scores))
# View the raw grid search scores.
# GridSearchCV no longer exposes ``grid_scores_``; the same per-candidate
# information (parameters, mean and spread of the test score) is read from
# ``cv_results_`` instead.
pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']].head(3)
# View the results in a dataframe
results = pd.DataFrame(grid_search.cv_results_)
results.head(3)
Result Groupby¶
# View the mean cross validation test score for every combination of
# max_depth (rows) and n_estimators (columns)
result_groupby_1 = grid_search_groupby(results,'max_depth','n_estimators')
result_groupby_1
Parameter Plot¶
# Heatmap of the mean test score: max_depth on the rows, n_estimators on
# the columns
mpl.rcParams['figure.figsize'] = (8.0, 7.0)
depth_vs_estimators = grid_search_groupby(results, 'max_depth', 'n_estimators')
ax = sns.heatmap(depth_vs_estimators, cmap='plasma', annot=True, fmt='.4f')
# Title and axis labels are applied to the heatmap's own axes
ax.set_title('Grid Search Result: Max Depth vs N-Estimators');
ax.set_xlabel('N_Estimators')
ax.set_ylabel('Max Depth');
Multiple Parameter Plot¶
# Change figure size so the multi-panel layout has room for annotations
mpl.rcParams['figure.figsize'] = (20.0, 20.0)
# Set up the figure; panels are placed on a 3x3 grid
plt.figure()
# One entry per panel:
# (grid position, row parameter, column parameter, title, x label, y label).
# The x label names the column parameter and the y label names the row
# parameter of the grouped table, so each axis is labelled with the
# hyperparameter it actually shows.
panels = [
    ((0, 0), 'max_depth', 'n_estimators',
     'GS Result: Max Depth vs N-Estimators', 'N-Estimators', 'Max Depth'),
    ((0, 1), 'max_depth', 'max_leaf_nodes',
     'GS Result: Max Depth vs Max Leaf Nodes', 'Max Leaf Nodes', 'Max Depth'),
    ((1, 0), 'max_leaf_nodes', 'n_estimators',
     'GS Result: Max Leaf Nodes vs N-Estimators', 'N_Estimators', 'Max Leaf Nodes'),
]
for position, row_param, col_param, title, x_label, y_label in panels:
    plt.subplot2grid((3, 3), position)
    # center=.90 anchors the colour scale at the same score in every panel
    sns.heatmap(grid_search_groupby(results, row_param, col_param),
                cmap='plasma', annot=True, fmt='.4f', center=.90)
    plt.title(title, fontsize=10)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Steps in the Pipeline: standardize, project with PCA, then classify.
# StandardScaler is built with its defaults: the previous positional 0 was
# taken as the ``copy`` argument (in-place scaling of the input array) and is
# rejected outright where the constructor arguments are keyword-only.
steps = [('scale', StandardScaler()),
         ('pca', PCA(n_components=2)),
         ('lr_model', LogisticRegression())]
# Create the Pipeline
pipeline = Pipeline(steps)
# Hyperparameters to test, addressed as <step name>__<parameter>.
# The duplicated 0.1 entry was dropped so no candidate is fitted twice.
# NOTE(review): if 0.01 was the intended first value, add it here.
pipeline_grid = {'lr_model__C': [0.1, 1, 10, 100, 1000],
                 'pca__n_components': [1, 2, 3, 4]}
# Run the grid search (this rebinds grid_search to the pipeline search)
grid_search = GridSearchCV(pipeline,
                           param_grid=pipeline_grid,
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X,y);
View Pipeline Parameters¶
# List the parameter names exposed by the pipeline (the keys of get_params())
pipeline.get_params().keys()
# Collect the pipeline grid search results into a DataFrame
pipeline_results = pd.DataFrame(grid_search.cv_results_)
# Mean test score by lr_model__C (rows) and pca__n_components (columns)
grid_search_groupby(pipeline_results, 'lr_model__C','pca__n_components')
# Plot grid search results for the pipeline
mpl.rcParams['figure.figsize'] = (8.0, 7.0)
sns.heatmap(grid_search_groupby(pipeline_results,
                                'lr_model__C', 'pca__n_components'),
            cmap='plasma', annot=True, fmt='.4f');
plt.xlabel('PCA Components')
# Rows of the grouped table are the lr_model__C values (first groupby key),
# so the y axis is labelled with C rather than a tree depth.
plt.ylabel('Logistic Regression C');
General Notes¶
Author: Kavi Sekhon