Downsampling
In supervised learning, many datasets are class imbalanced: one class has far more observations than the other. A common remedy is to downsample the majority class so that it has as many observations as the minority class. Below, I walk through the steps and then wrap them in a function that downsamples NumPy data.
Import Preliminaries
# Import modules
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
Import Data
# Load the iris dataset and keep only its first 80 observations
# as the working features (X) and target (y)
iris = load_iris()
X = iris.data[:80]
y = iris.target[:80]
View Class Imbalance
# View value counts: how many observations in y carry each class label
pd.Series(y).value_counts()
Retrieve Downsampled Class Index
# Positions of the observations belonging to each class
class_0 = np.flatnonzero(y == 0)
class_1 = np.flatnonzero(y == 1)
class_1[:5], class_0[:5]

# Randomly select, without replacement, as many positions from the
# majority class (class 0) as there are minority-class (class 1) rows
class_0_downsampled = np.random.choice(class_0, len(class_1), replace=False)
class_0_downsampled

# Join the downsampled majority positions with all minority positions
downsampled_index = np.concatenate((class_0_downsampled, class_1))
downsampled_index

# Sort the positions in place so the rows keep their original order
downsampled_index.sort()
downsampled_index

# Downsample the dataset: keep only the selected rows of X and y
X = X[downsampled_index]
y = y[downsampled_index]

# View the new dataset sizes
print(f'DataFrame Shape of X: {len(X)}')
print(f'DataFrame Shape of y: {len(y)}')

# View the class counts of y after downsampling
pd.Series(y).value_counts()
def binary_random_downsample(X, y, random_state=None):
    """
    Randomly downsample the majority class of a binary (0/1) dataset.

    The larger of the two classes is sampled without replacement down to
    the size of the smaller one, so the result contains the same number
    of observations labelled 0 and labelled 1. The kept observations
    stay in their original relative order. When both classes already
    have the same size, every observation labelled 0 or 1 is kept.

    Parameters
    ----------
    X : numpy.ndarray
        Features of the dataset; the first axis indexes observations.
    y : numpy.ndarray
        Target values, one per observation. Only observations labelled
        0 or 1 are considered; any other label is dropped from the result.
    random_state : None, int, or object with a ``choice`` method, optional
        Source of randomness for the sampling step.
        ``None`` (default) uses NumPy's global random state, as earlier
        versions of this function did. An int seeds a private
        ``numpy.random.RandomState``. An existing ``RandomState`` or
        ``Generator`` instance is used directly.

    Returns
    -------
    X, y : numpy.ndarray
        The rows of ``X`` and entries of ``y`` that were kept.

    Raises
    ------
    TypeError
        If ``X`` or ``y`` is not a numpy array.
    ValueError
        If ``X`` and ``y`` do not have the same number of observations.

    Examples
    --------
    >>> X = np.arange(12).reshape(6, 2)
    >>> y = np.array([0, 0, 0, 0, 1, 1])
    >>> X_bal, y_bal = binary_random_downsample(X, y, random_state=0)
    >>> len(X_bal), len(y_bal)
    (4, 4)
    """
    # Explicit raises instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if not isinstance(X, np.ndarray):
        raise TypeError('Make sure X is a numpy array')
    if not isinstance(y, np.ndarray):
        raise TypeError('Make sure y is a numpy array')
    if len(X) != len(y):
        raise ValueError(
            'X and y must have the same number of observations, '
            f'got {len(X)} and {len(y)}'
        )

    # Positions of the observations belonging to each class
    class_0 = np.where(y == 0)[0]
    class_1 = np.where(y == 1)[0]

    if len(class_0) > len(class_1):
        majority, minority = class_0, class_1
    else:
        majority, minority = class_1, class_0

    # Pick the sampling routine; the default keeps using the global
    # NumPy state so callers relying on np.random.seed are unaffected.
    if random_state is None:
        choose = np.random.choice
    elif hasattr(random_state, 'choice'):
        choose = random_state.choice
    else:
        choose = np.random.RandomState(random_state).choice

    sampled_majority = choose(majority, size=len(minority), replace=False)

    # Sorting the combined positions preserves the original row order
    keep = np.sort(np.hstack((sampled_majority, minority)))
    return X[keep], y[keep]
Testing our Function
# Reload the first 80 iris observations, balance them with our function,
# and print the resulting sizes
iris = load_iris()
X, y = binary_random_downsample(iris.data[:80], iris.target[:80])
print(f'DataFrame Shape of X: {len(X)}')
print(f'DataFrame Shape of y: {len(y)}')
Author: Kavi Sekhon