Feature Selection

Feature engineering is one of the most important tasks in any machine learning project. Feature selection is one of its subtasks: it selects the features that best represent the target variable. This project uses several feature selection methods, including a correlation filter, SelectKBest with the chi-square test, and selection from models such as logistic regression and random forest. Feature selection methods can be:

  1. Filter-based

  2. Wrapper-based

  3. Embedded methods

# Load the packages
import warnings
warnings.filterwarnings("ignore")
import json
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, RFE, SequentialFeatureSelector, SelectFromModel
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Load the data
# The path is relative, so it resolves against the current working directory.
# The frame is expected to contain the 'Attrition' column, which is split off as the target below.
df = pd.read_csv('./../../../data/train/train.csv')

Let us assume that, out of the 53 columns that are available, we want to keep around 30.

# Number of features each selector below is asked to keep
n_feat = 30
# Split the frame: every column except 'Attrition' is a candidate feature,
# 'Attrition' itself is the label
X = df.drop(columns=['Attrition'])
y = df['Attrition']

Filter-based methods

High Correlation Filter

Correlation

Starting with our first filter-based method, which relies on correlation, we eliminate columns whose absolute correlation coefficient with another column is greater than 0.75.

# Absolute pairwise correlations between the columns of df
# (the sign does not matter when looking for redundant columns)
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so that each
# pair is inspected once and no column is compared with itself.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when its correlation with any earlier column exceeds 0.75
to_drop = [column for column in upper.columns if (upper[column] > 0.75).any()]

print(f"Columns to be dropped: {to_drop}")
Columns to be dropped: ['TotalWorkingYears', 'YearsInCurrentRole', 'YearsWithCurrManager', 'JobLevel', 'PerformanceRating', 'Department Sales', 'Gender Male', 'JobRole Human Resources']
# One boolean per column of df: True = survived the correlation filter,
# False = listed in to_drop
columns = df.columns
result = [column not in to_drop for column in columns]
correlation = pd.DataFrame({'Correlation': result}, index=columns)

Univariate Selection Methods

SelectKBest using f_classif

# Univariate selector keeping the n_feat best-scoring features.
# No score_func is passed, so SelectKBest uses its default (f_classif).
f_class_selector = SelectKBest(k=n_feat)
f_class_selector.fit(X=X, y=y)
SelectKBest(k=30)
# Boolean mask over X's columns marking the features the selector kept
f_class_support = f_class_selector.get_support()
# Translate the mask into column names
f_class_features = X.columns[f_class_support].tolist()
print(f"Feature select with f classif: {f_class_features}")
Feature select with f classif: ['Age', 'DailyRate', 'DistanceFromHome', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'EducationField Life Sciences', 'EducationField Medical', 'EducationField Other', 'Gender Female', 'JobRole Healthcare Representative', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Sales Representative', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']
# One boolean per column of df: True = picked by the f_classif selector
columns = df.columns
result = [column in f_class_features for column in columns]
f_class_df = pd.DataFrame({'F classif': result}, index=columns)

SelectKBest using chi square

# Declare the transformer
# The assignment below rebinds the name `chi2` from the imported score
# function to the fitted selector (the following cell reads
# `chi2.get_support()`). Importing the score function under an alias here
# keeps this cell re-runnable: without it, a second execution would pass the
# selector object itself as score_func and fail during fit.
from sklearn.feature_selection import chi2 as chi2_score_func

chi2 = SelectKBest(score_func=chi2_score_func, k=n_feat)
chi2.fit(X, y)
SelectKBest(k=30, score_func=<function chi2 at 0x1238bca60>)
# Boolean mask from the fitted chi-square selector (stored under the name `chi2`)
chi2_support = chi2.get_support()
# Translate the mask into column names
chi2_features = X.columns[chi2_support].tolist()
print(f"Features selected with chi2: {chi2_features}")
Features selected with chi2: ['Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'EducationField Life Sciences', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Sales Representative', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']
# One boolean per column of df: True = picked by the chi-square selector
columns = df.columns
result = [column in chi2_features for column in columns]
chi2_df = pd.DataFrame({'Chi2': result}, index=columns)

SelectKBest using mutual information

# Univariate selector that scores features with mutual_info_classif
# and keeps the n_feat highest-scoring ones
mutual_info = SelectKBest(score_func=mutual_info_classif, k=n_feat)
mutual_info.fit(X=X, y=y)
SelectKBest(k=30, score_func=<function mutual_info_classif at 0x123f31700>)
# Boolean mask over X's columns marking the features the selector kept
mutual_info_support = mutual_info.get_support()
# Translate the mask into column names
mutual_info_features = X.columns[mutual_info_support].tolist()
# One boolean per column of df: True = picked by the mutual-information selector.
# The label spelling is left untouched because it ends up as a header in the
# combined result that is written to CSV.
columns = df.columns
result = [column in mutual_info_features for column in columns]
mi_df = pd.DataFrame({'Mutual Informtion': result}, index=columns)

Wrapper Methods

Recursive Feature Elimination

# Recursive feature elimination around a default LogisticRegression:
# up to 3 features are removed per round until n_feat remain
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=n_feat,
    step=3,
)
rfe_selector.fit(X=X, y=y)
RFE(estimator=LogisticRegression(), n_features_to_select=30, step=3)
# Boolean mask over X's columns marking the features RFE kept
rfe_support = rfe_selector.get_support()
# Translate the mask into column names
rfe_features = X.columns[rfe_support].tolist()
print(f"Feature selected with RFE for Logistic Regression: {rfe_features}")
Feature selected with RFE for Logistic Regression: ['Age', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'Department Sales', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']
# One boolean per column of df: True = kept by RFE with logistic regression
columns = df.columns
result = [column in rfe_features for column in columns]
rfe_lr_df = pd.DataFrame({'RFE LR': result}, index=columns)

Sequential Feature Selector

# Backward sequential selection: start from all features and remove them one
# at a time until n_feat remain, scoring candidates by 5-fold cross-validated
# F1. n_jobs=-1 runs the evaluations on all available cores.
sfs_selector = SequentialFeatureSelector(
    estimator=LogisticRegression(max_iter=1000),
    n_features_to_select=n_feat,
    direction='backward',
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
sfs_selector.fit(X=X, y=y)
SequentialFeatureSelector(direction='backward',
                          estimator=LogisticRegression(max_iter=1000),
                          n_features_to_select=30, n_jobs=-1, scoring='f1')
# Boolean mask over X's columns marking the features the sequential selector kept
sfs_support = sfs_selector.get_support()
# Translate the mask into column names
sfs_features = X.columns[sfs_support].tolist()
print(f"Features selected with Sequential Feature Selector: {sfs_features}")
Features selected with Sequential Feature Selector: ['Age', 'DistanceFromHome', 'MonthlyRate', 'NumCompaniesWorked', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Research & Development', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'JobRole Sales Executive', 'MaritalStatus Divorced', 'MaritalStatus Married']
# One boolean per column of df: True = kept by the sequential feature selector
columns = df.columns
result = [column in sfs_features for column in columns]
sfs_df = pd.DataFrame({'SFS LR': result}, index=columns)

Embedded Methods

LASSO Regression

# Embedded selection with an L1-penalised logistic regression (saga solver);
# SelectFromModel keeps at most n_feat features
lasso_selector = SelectFromModel(
    estimator=LogisticRegression(C=1, penalty='l1', solver='saga'),
    max_features=n_feat,
)
lasso_selector.fit(X=X, y=y)
SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1', solver='saga'),
                max_features=30)
# Boolean mask over X's columns marking the features the L1 model kept
lasso_support = lasso_selector.get_support()
# Translate the mask into column names
lasso_features = X.columns[lasso_support].tolist()
print(f"Features selected from Lasso Model: {lasso_features}")
Features selected from Lasso Model: ['Age', 'DistanceFromHome', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Laboratory Technician', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'JobRole Sales Executive', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']
# One boolean per column of df: True = kept by the L1-penalised model
columns = df.columns
result = [column in lasso_features for column in columns]
lasso_df = pd.DataFrame({'Lasso': result}, index=columns)

In the run shown above, the L1-penalised (Lasso) logistic regression selects 30 features, i.e. as many as the `max_features` cap allows.

Random Forest

# Embedded selection driven by a default RandomForestClassifier;
# SelectFromModel keeps at most n_feat features
rf_selector = SelectFromModel(
    estimator=RandomForestClassifier(),
    max_features=n_feat,
)
rf_selector.fit(X=X, y=y)
SelectFromModel(estimator=RandomForestClassifier(), max_features=30)
# Get the features
# Read the mask from the random-forest selector (rf_selector). Querying
# rfe_selector / rfe_support / rfe_features here, as this step used to do,
# silently repeats the RFE result instead of reporting the forest's choice.
rf_support = rf_selector.get_support()
# Translate the mask into column names
rf_features = X.columns[rf_support].tolist()
print(f"Features selected from Random Forest Classifier: {rf_features}")
Features selected from Random Forest Classifier: ['Age', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'Department Sales', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']
# One boolean per column of df: True = listed in rf_features.
# Membership must be tested against rf_features; testing rfe_features instead
# makes the 'RF' column an exact copy of 'RFE LR'.
columns = df.columns
result = [column in rf_features for column in columns]
rf_df = pd.DataFrame({'RF': result}, index=columns)

Combined Result

# Put the per-method boolean columns side by side; rows align on the index
res_df = pd.concat(
    [
        correlation,
        f_class_df,
        chi2_df,
        mi_df,
        rfe_lr_df,
        sfs_df,
        lasso_df,
        rf_df,
    ],
    axis=1,
)
# Row-wise sum of the boolean flags = number of methods that kept the feature
res_df['Total'] = res_df.sum(axis=1)
# Most-voted features first
res_df = res_df.sort_values('Total', ascending=False)
res_df
Correlation F classif Chi2 Mutual Informtion RFE LR SFS LR Lasso RF Total
Age True True True True True True True True 8
MaritalStatus Married True True True True True True True True 8
MaritalStatus Divorced True True True True True True True True 8
BusinessTravel Non-Travel True True True True True True True True 8
EducationField Medical True True True True True True True True 8
OverTime True True True True True True True True 8
JobRole Manufacturing Director True True True True True True True True 8
YearsWithCurrManager False True True True True True True True 7
Department Research & Development True True True True True True False True 7
BusinessTravel Travel_Rarely True True True False True True True True 7
EducationField Other True True True False True True True True 7
JobRole Healthcare Representative True True True False True True True True 7
EducationField Life Sciences True True True False True True True True 7
YearsInCurrentRole False True True True True True True True 7
JobRole Manager True True True False True True True True 7
JobRole Research Director True True True False True True True True 7
YearsSinceLastPromotion True False False True True True True True 6
Department Human Resources True True True False True False True True 6
MaritalStatus Single True True True False True False True True 6
EducationField Marketing True False False True True True True True 6
NumCompaniesWorked True False False True True True True True 6
EducationField Technical Degree True False True False True True True True 6
Gender Female True True True False True False True True 6
TotalWorkingYears False True True True True False True True 6
JobSatisfaction True True True True False True False False 5
JobRole Research Scientist True False False False True True True True 5
JobInvolvement True True True True False True False False 5
EnvironmentSatisfaction True True True True False True False False 5
EducationField Human Resources True False False False True True True True 5
DistanceFromHome True True False True False True True False 5
JobRole Human Resources False False True False True True False True 4
BusinessTravel Travel_Frequently True False False False True False True True 4
YearsAtCompany True True True True False False False False 4
StockOptionLevel True True True True False False False False 4
MonthlyIncome True True True True False False False False 4
JobRole Sales Representative True True True False False False False False 3
Gender Male False False False False True False True True 3
JobLevel False True True True False False False False 3
Education True False False True False True False False 3
JobRole Sales Executive True False False False False True True False 3
DailyRate True True False True False False False False 3
PercentSalaryHike True False False True False False False False 2
HourlyRate True False False True False False False False 2
MonthlyRate True False False False False True False False 2
Department Sales False False False False True False False True 2
TrainingTimesLastYear True False False True False False False False 2
JobRole Laboratory Technician True False False False False False True False 2
RelationshipSatisfaction True False False True False False False False 2
WorkLifeBalance True False False True False False False False 2
PerformanceRating False False False True False False False False 1
EmployeeNumber True False False False False False False False 1
Attrition True False False False False False False False 1
# Store the result
# Writes the sorted vote table, including its index, to a CSV file.
# The path is relative, so it resolves against the current working directory.
res_df.to_csv('./../../../data/feature_ranking.csv')