Reweighting Classes
If you have an under-represented class in your dataset, you should reweight the classes to account for the imbalance in the data. Sklearn's implementation of this allows you to pass the weights during the fitting portion of the model, and it reweights the data for you during the "fitting" process. You can also apply the weights during a grid search by passing them through GridSearchCV's "fit_params" argument.
Import Preliminaries¶
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Use the high-resolution 'retina' format for inline figures
%config InlineBackend.figure_format='retina'
# Import modules
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib as mpl
import numpy as np
import pandas as pd
import scipy
import sklearn
import seaborn as sns
import warnings
# Import Model Selection
from sqlalchemy import create_engine
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
# Set pandas options
# Use fully-qualified option names: the short forms ('max_columns', 'max_rows')
# are matched as patterns and become ambiguous (OptionError) on pandas versions
# that also define 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 30)
# Render floats with three decimal places when displaying frames
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Set plotting options (default figure size, in inches)
mpl.rcParams['figure.figsize'] = (8.0, 7.0)
# Set warning options
# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')
Import Data¶
# Setup local database parameters
# NOTE: credentials are hard-coded here for a local demo database only
db_username = 'root'
db_password = 'mypassword'
host = '127.0.0.1'
port = '3306'
db_name = 'sloan_sky_survey'
# Create a MySQLEngine
engine = create_engine('mysql+mysqldb://{}:{}@{}:{}/{}'.format(
    db_username, db_password, host, port, db_name))
# Import data from SQL
# Open the connection in a context manager so it is released as soon as the
# query finishes, instead of leaving an unclosed connection behind.
with engine.connect() as connection:
    df = pd.read_sql("""SELECT * FROM survey;""", connection)
# Drop every row that contains at least one missing value
df = df.dropna(how='any', axis=0)
# 'class' is renamed to 'target' so the label column can be accessed as an attribute
df.rename(columns={'class':'target'}, inplace=True)
df.head(10)
Preprocess Data¶
# Label-encode the target: each class becomes its integer category code
targets = df['target'].astype('category').cat.codes.rename('target')
# One-hot encode the remaining (feature) columns
encoded_df = pd.get_dummies(df.drop('target', axis=1))
# Re-attach the encoded target column next to the encoded features
encoded_df = pd.concat([encoded_df, targets], axis=1)
# Separate the label column from the feature matrix
y = encoded_df['target']
X = encoded_df.drop('target', axis=1)
# Conduct a train-test-split on the data, holding out 25% of the rows for testing
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25)
Data Overview¶
# Plot a horizontal bar chart of how many training records fall in each target class
# (classes appear as the integer category codes assigned during preprocessing)
class_counts = pd.Series(train_y).value_counts()
class_counts.plot.barh(grid=False, color=['#e0ecf4','#9ebcda','#8856a7'], width=0.25, edgecolor='w')
plt.title('Target Outcomes')
plt.ylabel('Target Class')
plt.xlabel('Number of Records');
Calculate Weights¶
# Derive one weight per training row using the 'balanced' mode;
# only the training target values are used to compute the weights
weights = compute_sample_weight('balanced', train_y)
# Preview the first ten weights
pd.Series(weights).iloc[:10]
Fit the Model¶
# Build the logistic regression, then train it with the per-sample
# weights supplied through the fit call's sample_weight parameter
lg_model = LogisticRegression(C=10000)
lg_model.fit(X=train_x, y=train_y, sample_weight=weights)
Pass Weights into Grid Search¶
# Candidate values of C to search over: 10, 30, 50, 70, 90
grid = {'C': list(range(10, 100, 20))}
random_search = GridSearchCV(estimator=lg_model,
                             param_grid=grid,
                             scoring='accuracy',
                             n_jobs=-1, refit=True,
                             cv=5,
                             return_train_score=True)
# Fit the grid search model with our data, passing the sample weights to `fit`.
# The constructor's `fit_params` argument is deprecated and absent from newer
# sklearn releases; keyword arguments given to `fit` are forwarded to the
# estimator's own fit method instead.
random_search.fit(train_x, train_y, sample_weight=weights)
General Notes¶
-- For grid searching, the fit_params parameter in sklearn 0.20 and up has been moved to the `fit`
method. I am using sklearn 0.19 for this example.
-- You can feed custom weights into the compute_sample_weight function, but it is best to just use 'balanced' for most use cases.
-- Again, the returned array already contains one weight for each of the original y values.
Sources¶
Author: Kavi Sekhon