Source code for insolver.feature_engineering.feature_selection

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression, ElasticNet
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, chi2, f_classif, f_regression
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler



[docs]
class FeatureSelection:
    """Feature selection. Supports the following tasks: classification, regression, multiclass classification and
    multiclass multioutput classification.

    Note:
        The following specified methods can be used for each individual task:

        - for the **classification** problem Mutual information, F statistics, chi-squared test, Random Forest, Lasso or
          ElasticNet can be used;
        - for the **regression** problem Mutual information, F statistics, Random Forest, Lasso or ElasticNet can be
          used;
        - for the **multiclass classification** Random Forest, Lasso or ElasticNet can be used;
        - for the **multiclass multioutput classification** Random Forest can be used.

        Random Forest is used by default.

    Parameters:
        y_column (str): The name of the column to predict.
        task (str): A task for the model. Values `reg`, `class`, `multiclass` and `multiclass_multioutput` are
         supported.
        method (str): A technique to compute feature importance. Values `random_forest` (default), `mutual_inf`,
         `chi2`, `f_statistic`, `lasso` and `elasticnet` are supported.
        permutation_importance (bool): Uses permutation feature importance, false is default.

    Attributes:
        new_dataframe (pandas.DataFrame): New dataframe with the selected features only.
        importances (np.ndarray): A list of the importances created using selected method.
        model : A model for feature selection.
        permutation_model : Permutation model for feature selection.

    """

    def __init__(self, y_column, task, method='random_forest', permutation_importance=False):
        self.y_column = y_column
        self.task = task
        self.method = method
        self.permutation_importance = permutation_importance
        self.new_dataframe = DataFrame()
        self.importances = np.array([])
        self.tasks_list = ['reg', 'class', 'multiclass', 'multiclass_multioutput']


[docs]
    def create_model(self, df):
        """
        A method to create a model for feature selection using specified method. Random Forest is used by default.

        Parameters:
            df (pandas.Dataframe): The dataframe.

        Raises:
            ValueError: If there are null values in the dataframe.
            ValueError: If there are object columns in the dataframe.
            NotImplementedError: If self.method isn't supported with the task.

        """
        # check for null values
        if not df.isnull().sum().sum() == 0:
            raise ValueError('All values in the dataframe must be not null.')

        # check for categorical columns
        if len([var for var in df.columns if df[var].dtype == 'object']) > 0:
            raise ValueError('All values in the dataframe must not be object.')

        # initialize all methods
        self._init_methods_dict()
        # initialize importance dictionary
        self._init_importance_dict()

        # raise error if the method is not supported
        if self.method not in self.methods_dict.keys():
            raise NotImplementedError(f'Task {self.task} does not support method "{self.method}".')

        # get x and y
        self.x = df.drop([self.y_column], axis=1)
        self.y = df[self.y_column]

        # get estimator, calculate importance and get importance
        self.model = self.methods_dict[self.method](self.x, self.y)
        self.importances = self.importance_dict[self.method](self.model)

        # if permutation_importance, create it
        if self.permutation_importance:
            self.create_permutation_importance()



[docs]
    def create_permutation_importance(self, **kwargs):
        """A method for creating permutation importance for the features. This method will be automatically called if
        'permutation_importance' parameter was set to True. Features importances will be set to importances_mean from
        permutation_importance model.

        Note:
            This method can be called only after method 'create_model' has been called.

        Raises:
            Exception: Model was not created, self.x or self.importances was not initialized.
            Exception: Permutation importance was used with the method that doesn't implement class
                sklearn.base.BaseEstimator.

        """
        try:
            # calculate permutation_importance and get importance
            self.permutation_model = permutation_importance(self.model, self.x, self.y, **kwargs)
            self.importances = self.permutation_model.importances_mean

        except AttributeError:
            raise AttributeError('Model was not created yet.')

        except TypeError:
            raise TypeError('Permutation importance can only be used with the estimator.')



[docs]
    def create_new_dataset(self, threshold='mean'):
        """
        A method for creating new dataset. It uses threshold parameter to select features.

        Note:
            This method can be called only after method 'create_model' has been called.
            This method uses absolute numeric value of the importences during comparison with the threshold value.

        Parameters:
            threshold : The threshold value to use. It can be 'mean'(default), 'median' or numeric.

        Raises:
            Exception: Model was not created.

        """

        try:
            # get importance
            scores = self.importances

            # check if multiclass and lasso or elasticnet because it returns (n_targets, n_features)
            if (self.task == 'multiclass') and (not self.method == 'random_forest'):
                # calculate mean value for each feature
                scores = np.mean(np.abs(self.importances), axis=0)

            df_scores = DataFrame({'feature_name': self.x.columns, 'feature_score': scores})

            # calculate mean as threshold and select columns
            if threshold == 'mean':
                self.threshold = df_scores['feature_score'].abs().mean()
                cols = df_scores[df_scores['feature_score'] > self.threshold]['feature_name']
                self.new_dataframe = concat([self.x[cols], self.y], axis=1)

                return self.new_dataframe

            # calculate median as threshold and select columns
            elif threshold == 'median':
                self.threshold = df_scores['feature_score'].abs().median()
                cols = df_scores[df_scores['feature_score'] > self.threshold]['feature_name']
                self.new_dataframe = concat([self.x[cols], self.y], axis=1)

                return self.new_dataframe

            # select columns with threshold
            else:
                self.threshold = threshold
                cols = df_scores[df_scores['feature_score'].abs() > self.threshold]['feature_name']
                self.new_dataframe = concat([self.x[cols], self.y], axis=1)

                return self.new_dataframe

        except AttributeError:
            raise AttributeError('Model was not created yet.')



[docs]
    def plot_importance(self, figsize=(5, 5), importance_threshold=None):
        """
        A method for plotting feature importance using created model.

        Note:
            This method can be called only after method 'create_model' has
            been called.

        Parameters:
            figsize (list): Figsize of the plot.
            importance_threshold (float): The threshold of importance by which
                the features will be plotted.

        Raises:
            Exception: Model was not created, self.x or self.importances was
                not initialized.

        """
        try:
            # check if importances is (n_targets, n_features)
            if len(self.importances.shape) > 1:
                n = 0
                # plot each class importances
                for n_class in self.importances:
                    df_to_plot = DataFrame({'feature_name': self.x.columns, 'feature_score': n_class})

                    # if importance_threshold plot only selected features
                    if importance_threshold:
                        df_to_plot[df_to_plot['feature_score'] > importance_threshold].plot.barh(
                            x='feature_name', y='feature_score', figsize=figsize
                        )

                    # else plot all features
                    else:
                        df_to_plot.plot.barh(x='feature_name', y='feature_score', figsize=figsize)

                    plt.title(f'Model {self.method} class {self.model.classes_[n]} scores')
                    n += 1

            # else importances is (, n_features)
            else:
                df_to_plot = DataFrame({'feature name': self.x.columns, 'feature score': self.importances})

                # if importance_threshold plot only selected features
                if importance_threshold:
                    df_to_plot[df_to_plot['feature score'] > importance_threshold].plot.barh(
                        x='feature name', y='feature score', figsize=figsize
                    )

                # else plot all features
                else:
                    df_to_plot.plot.barh(x='feature name', y='feature score', figsize=figsize)

                plt.title(f'Model {self.method} features scores')

        except AttributeError:
            raise AttributeError('Model was not created yet.')



[docs]
    def _init_methods_dict(self):
        """
        Non-public method for creating a methods' dictionary.

        Raises:
            NotImplementedError: If `self.task` is not supported.

        """
        # methods_dict initialization for classification task
        if self.task == 'class':
            self.methods_dict = {
                'mutual_inf': lambda x, y: mutual_info_classif(x, y),
                'chi2': lambda x, y: chi2(x, y),
                'f_statistic': lambda x, y: f_classif(x, y),
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: LogisticRegression(penalty='l1', solver='saga').fit(
                    StandardScaler().fit_transform(x), y
                ),
                'elasticnet': lambda x, y: LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga').fit(
                    StandardScaler().fit_transform(x), y
                ),
            }

        # methods_dict initialization for regression task
        elif self.task == 'reg':
            self.methods_dict = {
                'mutual_inf': lambda x, y: mutual_info_regression(x, y),
                'f_statistic': lambda x, y: f_regression(x, y),
                'random_forest': lambda x, y: RandomForestRegressor(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: Lasso().fit(StandardScaler().fit_transform(x), y),
                'elasticnet': lambda x, y: ElasticNet().fit(StandardScaler().fit_transform(x), y),
            }

        # methods_dict initialization for multiclass classification task
        elif self.task == 'multiclass':
            self.methods_dict = {
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
                'lasso': lambda x, y: LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial').fit(
                    StandardScaler().fit_transform(x), y
                ),
                'elasticnet': lambda x, y: LogisticRegression(
                    penalty='elasticnet', l1_ratio=0.5, solver='saga', multi_class='multinomial'
                ).fit(StandardScaler().fit_transform(x), y),
            }

        # methods_dict initialization for multiclass multioutput classification task
        elif self.task == 'multiclass_multioutput':
            self.methods_dict = {
                'random_forest': lambda x, y: RandomForestClassifier(n_estimators=10).fit(x, y),
            }

        else:
            raise NotImplementedError(f'Value task must be one of the {self.tasks_list}')



[docs]
    def _init_importance_dict(self):
        """Non-public method for creating an importance dictionary."""
        self.importance_dict = {
            'random_forest': lambda model: model.feature_importances_,
            'mutual_inf': lambda model: model,
            'chi2': lambda model: model[1],
            'f_statistic': lambda model: -np.log10(model[1]) / (-np.log10(model[1])).max(),
            'lasso': lambda model: model.coef_,
            'elasticnet': lambda model: model.coef_,
        }


    def __call__(self, df):
        """Create the feature-selection model for `df` and plot the importances with the default plot settings.

        Parameters:
            df (pandas.Dataframe): The dataframe passed on to `create_model`.

        """
        # the model is created first because plot_importance reads the importances it produces
        self.create_model(df)
        self.plot_importance()