import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression, ElasticNet
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, chi2, f_classif, f_regression
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
# Feature-selection helper built on scikit-learn estimators and univariate statistical tests.
class FeatureSelection:
    """Feature selection for tabular data.

    Supports the following tasks: classification, regression, multiclass
    classification and multiclass multioutput classification.

    Note:
        The following methods can be used for each individual task:

        - for the **classification** problem Mutual information, F statistics,
          chi-squared test, Random Forest, Lasso or ElasticNet can be used;
        - for the **regression** problem Mutual information, F statistics,
          Random Forest, Lasso or ElasticNet can be used;
        - for the **multiclass classification** Random Forest, Lasso or
          ElasticNet can be used;
        - for the **multiclass multioutput classification** Random Forest can
          be used.

        Random Forest is used by default.

    Parameters:
        y_column (str): The name of the column to predict.
        task (str): A task for the model. Values `reg`, `class`, `multiclass`
            and `multiclass_multioutput` are supported.
        method (str): A technique to compute features importance. Values
            `random_forest` (default), `mutual_inf`, `chi2`, `f_statistic`,
            `lasso` and `elasticnet` are supported.
        permutation_importance (bool): Uses permutation feature importance,
            False is default.

    Attributes:
        new_dataframe (pandas.DataFrame): New dataframe with the selected
            features only.
        importances (np.ndarray): The importances created using the selected
            method.
        model: A model (or raw test result) used for feature selection.
        permutation_model: Permutation model for feature selection.
    """

    def __init__(self, y_column, task, method='random_forest', permutation_importance=False):
        self.y_column = y_column
        self.task = task
        self.method = method
        self.permutation_importance = permutation_importance
        self.new_dataframe = DataFrame()
        self.importances = np.array([])
        self.tasks_list = ['reg', 'class', 'multiclass', 'multiclass_multioutput']
        # Scaler fitted by the linear methods (lasso / elasticnet); stays None
        # for methods that work on the raw features.
        self._scaler = None

    def create_model(self, df):
        """Fit the configured method on ``df`` and store the feature importances.

        Random Forest is used by default.

        Parameters:
            df (pandas.DataFrame): The dataframe, including the target column.

        Raises:
            ValueError: If there are null values in the dataframe.
            ValueError: If there are object columns in the dataframe.
            NotImplementedError: If self.task is not supported or self.method
                isn't supported with the task.
        """
        # check for null values
        if df.isnull().to_numpy().any():
            raise ValueError('All values in the dataframe must be not null.')
        # check for categorical columns
        if any(df[col].dtype == 'object' for col in df.columns):
            raise ValueError('All values in the dataframe must not be object.')

        self._init_methods_dict()
        self._init_importance_dict()
        if self.method not in self.methods_dict:
            raise NotImplementedError(f'Task {self.task} does not support method "{self.method}".')

        self.x = df.drop([self.y_column], axis=1)
        self.y = df[self.y_column]

        # Forget a scaler left over from a previous fit; the linear methods
        # set it again while fitting.
        self._scaler = None
        self.model = self.methods_dict[self.method](self.x, self.y)
        self.importances = self.importance_dict[self.method](self.model)

        if self.permutation_importance:
            self.create_permutation_importance()

    def create_permutation_importance(self, **kwargs):
        """Replace the importances with permutation importances of the fitted model.

        This method is called automatically when the 'permutation_importance'
        parameter was set to True. Features importances are set to
        ``importances_mean`` of the permutation result.

        Note:
            This method can be called only after method 'create_model' has
            been called.

        Parameters:
            **kwargs: Forwarded to ``sklearn.inspection.permutation_importance``.

        Raises:
            AttributeError: Model was not created yet.
            TypeError: The selected method produced raw scores rather than a
                fitted estimator (e.g. `mutual_inf`, `chi2`, `f_statistic`).
        """
        if not all(hasattr(self, name) for name in ('model', 'x', 'y')):
            raise AttributeError('Model was not created yet.')
        if not hasattr(self.model, 'fit'):
            raise TypeError('Permutation importance can only be used with the estimator.')

        # Lasso / ElasticNet models were fitted on standardised features, so
        # they must be scored on the same representation.
        scaler = getattr(self, '_scaler', None)
        features = self.x if scaler is None else scaler.transform(self.x)

        self.permutation_model = permutation_importance(self.model, features, self.y, **kwargs)
        self.importances = self.permutation_model.importances_mean

    def create_new_dataset(self, threshold='mean'):
        """Build a dataframe with the features whose importance exceeds a threshold.

        Note:
            This method can be called only after method 'create_model' has
            been called.
            The absolute value of the importances is compared with the
            threshold, so strongly negative coefficients are kept as well.

        Parameters:
            threshold: The threshold value to use. It can be 'mean' (default),
                'median' or numeric.

        Returns:
            pandas.DataFrame: The selected feature columns followed by the
            target column. Also stored in ``self.new_dataframe``.

        Raises:
            AttributeError: Model was not created yet.
        """
        if not hasattr(self, 'x'):
            raise AttributeError('Model was not created yet.')

        scores = np.asarray(self.importances)
        # Coefficient matrices have one row per class/target; collapse them
        # to one score per feature. Decided by shape, because permutation
        # importances are already 1-D whatever the task or method is.
        if scores.ndim > 1:
            scores = np.mean(np.abs(scores), axis=0)

        df_scores = DataFrame({'feature_name': self.x.columns, 'feature_score': scores})
        magnitudes = df_scores['feature_score'].abs()

        if threshold == 'mean':
            self.threshold = magnitudes.mean()
        elif threshold == 'median':
            self.threshold = magnitudes.median()
        else:
            self.threshold = threshold

        cols = df_scores.loc[magnitudes > self.threshold, 'feature_name'].tolist()
        self.new_dataframe = concat([self.x[cols], self.y], axis=1)
        return self.new_dataframe

    def plot_importance(self, figsize=(5, 5), importance_threshold=None):
        """Plot feature importance using the created model.

        Note:
            This method can be called only after method 'create_model' has
            been called.

        Parameters:
            figsize (tuple): Figsize of the plot.
            importance_threshold (float): Only features whose score is greater
                than this value are plotted. None (default) plots everything.

        Raises:
            AttributeError: Model was not created yet.
        """
        if not hasattr(self, 'x'):
            raise AttributeError('Model was not created yet.')

        importances = np.asarray(self.importances)
        if importances.ndim > 1:
            # One plot per row. A binary problem has a single row that
            # describes the second class, so align rows with the trailing
            # entries of classes_.
            labels = self.model.classes_[-importances.shape[0]:]
            for label, class_scores in zip(labels, importances):
                self._plot_scores(class_scores, 'feature_name', 'feature_score', figsize, importance_threshold)
                plt.title(f'Model {self.method} class {label} scores')
        else:
            self._plot_scores(importances, 'feature name', 'feature score', figsize, importance_threshold)
            plt.title(f'Model {self.method} features scores')

    def _plot_scores(self, scores, name_col, score_col, figsize, importance_threshold):
        """Draw one horizontal bar chart of ``scores`` against the feature names."""
        df_to_plot = DataFrame({name_col: self.x.columns, score_col: scores})
        # `is not None` so that a threshold of 0 still filters.
        if importance_threshold is not None:
            df_to_plot = df_to_plot[df_to_plot[score_col] > importance_threshold]
        df_to_plot.plot.barh(x=name_col, y=score_col, figsize=figsize)

    def _fit_scaled(self, estimator, x, y):
        """Standardise ``x``, remember the scaler and fit ``estimator`` on the result."""
        self._scaler = StandardScaler().fit(x)
        return estimator.fit(self._scaler.transform(x), y)

    def _init_methods_dict(self):
        """Non-public method for creating a methods' dictionary.

        Each value is a callable ``(x, y)`` returning either a fitted
        estimator or the raw result of a statistical test.

        Raises:
            NotImplementedError: If `self.task` is not supported.
        """
        # methods_dict initialization for classification task
        if self.task == 'class':
            self.methods_dict = {
                'mutual_inf': lambda x, y: mutual_info_classif(x, y),
                'chi2': lambda x, y: chi2(x, y),
                'f_statistic': lambda x, y: f_classif(x, y),
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: self._fit_scaled(
                    LogisticRegression(penalty='l1', solver='saga'), x, y
                ),
                'elasticnet': lambda x, y: self._fit_scaled(
                    LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'), x, y
                ),
            }
        # methods_dict initialization for regression task
        elif self.task == 'reg':
            self.methods_dict = {
                'mutual_inf': lambda x, y: mutual_info_regression(x, y),
                'f_statistic': lambda x, y: f_regression(x, y),
                'random_forest': lambda x, y: RandomForestRegressor(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: self._fit_scaled(Lasso(), x, y),
                'elasticnet': lambda x, y: self._fit_scaled(ElasticNet(), x, y),
            }
        # methods_dict initialization for multiclass classification task
        elif self.task == 'multiclass':
            # NOTE: `multi_class='multinomial'` is intentionally not passed:
            # with the saga solver a multiclass target is already fitted as
            # multinomial, and newer scikit-learn releases deprecate the
            # argument.
            self.methods_dict = {
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: self._fit_scaled(
                    LogisticRegression(penalty='l1', solver='saga'), x, y
                ),
                'elasticnet': lambda x, y: self._fit_scaled(
                    LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'), x, y
                ),
            }
        # methods_dict initialization for multiclass multioutput classification task
        elif self.task == 'multiclass_multioutput':
            self.methods_dict = {
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
            }
        else:
            raise NotImplementedError(f'Value task must be one of the {self.tasks_list}')

    @staticmethod
    def _p_value_scores(result):
        """Turn a ``(statistic, p_values)`` test result into scores where higher is better.

        Scores are ``-log10(p)`` divided by their maximum. P-values are
        clipped to the smallest positive float first, so a p-value that
        underflowed to 0 gets the top score instead of NaN.
        """
        p_values = np.asarray(result[1], dtype=float)
        log_scores = -np.log10(np.clip(p_values, np.finfo(float).tiny, None))
        top = np.nanmax(log_scores)
        # All p-values equal to 1 (or undefined): nothing to rank.
        if not top > 0:
            return np.zeros_like(log_scores)
        return log_scores / top

    def _init_importance_dict(self):
        """Non-public method for creating an importance dictionary.

        Each value is a callable that extracts the importances from what the
        matching entry of ``methods_dict`` returned.
        """
        self.importance_dict = {
            'random_forest': lambda model: model.feature_importances_,
            'mutual_inf': lambda model: model,
            # Both tests return (statistic, p_values); a small p-value means
            # an informative feature, so the p-values are inverted into scores.
            'chi2': self._p_value_scores,
            'f_statistic': self._p_value_scores,
            'lasso': lambda model: model.coef_,
            'elasticnet': lambda model: model.coef_,
        }

    def __call__(self, df):
        """Fit the model on ``df`` and plot the resulting feature importances."""
        self.create_model(df)
        self.plot_importance()