# Source code for insolver.wrappers.glm

from functools import partial

from pandas import DataFrame, Series, concat
from numpy import sum, sqrt, repeat

from h2o.frame import H2OFrame
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from .base import InsolverBaseWrapper
from .extensions import InsolverH2OExtension, InsolverCVHPExtension, InsolverPDPExtension
from .extensions.h2oext import to_h2oframe



# [docs]
class InsolverGLMWrapper(InsolverBaseWrapper, InsolverH2OExtension, InsolverCVHPExtension, InsolverPDPExtension):
    """Insolver wrapper for Generalized Linear Models.

    Parameters:
        backend (str): Framework used to build the GLM; currently 'h2o' and 'sklearn' are supported.
        family (str, float, int, optional): Distribution for the GLM. With the h2o backend the value is passed
          to H2OGeneralizedLinearEstimator as is. With the sklearn backend the supported `str` families are
          ['gaussian', 'normal', 'poisson', 'gamma', 'inverse_gaussian']; an `int` or `float` is used as the
          power of a Tweedie GLM. By default, a Gaussian GLM is fitted.
        link (str, optional): Link function for the GLM. If `None`, 'family_default' is used with h2o and
          'auto' with sklearn.
        standardize (bool, optional): Whether to standardize data before fitting the model. Enabled by default.
        h2o_init_params (dict, optional): Parameters passed to `h2o.init()`, when `backend` == 'h2o'.
        load_path (str, optional): Path to a GLM model to load from disk. When given, the model is loaded and
          no new estimator is built from `family`, `link` or `**kwargs`.
        **kwargs: Parameters for GLM estimators (for H2OGeneralizedLinearEstimator or TweedieRegressor) except
          `family` (`power` for TweedieRegressor) and `link`.

    Raises:
        NotImplementedError: If `backend` is not supported, or (when no `load_path` is given) if `family` is
          a `str` that is not supported with the sklearn backend.
        ValueError: If, with the sklearn backend and no `load_path`, `family` is a number in the open
          interval (0, 1).

    """

    def __init__(
        self, backend, family=None, link=None, standardize=True, h2o_init_params=None, load_path=None, **kwargs
    ):
        """Register backend load/save hooks, then load a stored model or build a new GLM estimator.

        The meaning of every argument is described in the class docstring.
        """
        super().__init__(backend)
        # Captured before any other local name is created so that only the call arguments are seen.
        self.init_args = self._get_init_args(vars())
        self.algo = 'glm'
        self._backends = ['h2o', 'sklearn']
        self._back_load_dict = {
            'sklearn': self._pickle_load,
            'h2o': partial(self._h2o_load, h2o_init_params=h2o_init_params),
        }
        self._back_save_dict = {'sklearn': self._pickle_save, 'h2o': self._h2o_save}

        if backend not in self._backends:
            raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')

        self.params = None
        self.standardize = standardize

        if load_path is not None:
            # A stored model replaces estimator construction entirely.
            self.load_model(load_path)
            self._update_meta()
            return

        if family is None:
            family = 'gaussian'
        if link is None:
            link = 'family_default' if backend == 'h2o' else 'auto'

        if backend == 'h2o':
            self._h2o_init(h2o_init_params)
            self.model = H2OGeneralizedLinearEstimator(
                family=family, link=link, standardize=self.standardize, **kwargs
            )
        elif backend == 'sklearn':
            if isinstance(family, str):
                # Named distributions are expressed as Tweedie powers for TweedieRegressor.
                power_by_family = {'gaussian': 0, 'normal': 0, 'poisson': 1, 'gamma': 2, 'inverse_gaussian': 3}
                if family not in power_by_family:
                    raise NotImplementedError('Distribution is not supported with sklearn backend.')
                family = power_by_family[family]
            elif isinstance(family, (float, int)) and (0 < family < 1):
                raise ValueError('No distribution exists for Tweedie power in range (0, 1).')

            kwargs['power'] = family
            kwargs['link'] = link
            self.params = kwargs

            def __params_pipe(**glm_pars):
                # Values stored in self.params take precedence over the ones passed in.
                glm_pars.update(self.params)
                scaler = StandardScaler(with_mean=self.standardize, with_std=self.standardize)
                regressor = TweedieRegressor(**glm_pars)
                return Pipeline([('scaler', scaler), ('glm', regressor)])

            self.model = __params_pipe(**self.params)
            # Factory kept so that a fresh pipeline can be built later with other parameters.
            self.object = __params_pipe
        self._update_meta()


# [docs]
    def fit(
        self, X, y, sample_weight=None, X_valid=None, y_valid=None, sample_weight_valid=None, report=None, **kwargs
    ):
        """Fit a Generalized Linear Model.

        With the sklearn backend the column names of a pandas `X` are stored on the pipeline as
        `feature_name_`; when `X` carries no names, names left over from an earlier fit are removed so that
        they cannot be applied to differently shaped data later.

        Args:
            X (pd.DataFrame, pd.Series): Training data.
            y (pd.DataFrame, pd.Series): Training target values.
            sample_weight (pd.DataFrame, pd.Series, optional): Training sample weights.
            X_valid (pd.DataFrame, pd.Series, optional): Validation data (only h2o supported).
            y_valid (pd.DataFrame, pd.Series, optional): Validation target values (only h2o supported).
            sample_weight_valid (pd.DataFrame, pd.Series, optional): Validation sample weights.
            report (list, tuple, optional): A list of metrics to report after model fitting, optional.
              Each metric is called as `metric(y, prediction)` and printed under its `__name__`.
              Values that are neither a list nor a tuple are ignored.
            **kwargs: Other parameters passed to H2OGeneralizedLinearEstimator.

        Raises:
            NotImplementedError: If the backend and the type of the stored model do not match a supported
                combination.
        """
        if self.backend == 'sklearn' and isinstance(self.model, Pipeline):
            if isinstance(X, DataFrame):
                self.model.feature_name_ = X.columns.tolist()
            elif isinstance(X, Series):
                self.model.feature_name_ = [X.name]
            elif hasattr(self.model, 'feature_name_'):
                # Refit on unlabeled data: drop the names of a previous fit, otherwise `predict`
                # would try to select those stale columns from the new input.
                del self.model.feature_name_
            self.model.fit(X, y, glm__sample_weight=sample_weight)
        elif self.backend == 'h2o' and isinstance(self.model, H2OGeneralizedLinearEstimator):
            features, target, train_set, params = self._x_y_to_h2o_frame(
                X, y, sample_weight, {**kwargs}, X_valid, y_valid, sample_weight_valid
            )
            self.model.train(y=target, x=features, training_frame=train_set, **params)
        else:
            raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')
        self._update_meta()

        if isinstance(report, (list, tuple)):
            # Metrics are evaluated on the training data itself.
            if self.backend == 'h2o':
                prediction = self.model.predict(train_set).as_data_frame().values.reshape(-1)
            else:
                prediction = self.model.predict(X)
            print(
                DataFrame([[x.__name__, x(y, prediction)] for x in report])
                .rename({0: 'Metrics', 1: 'Value'}, axis=1)
                .set_index('Metrics')
            )



# [docs]
    def predict(self, X, sample_weight=None, **kwargs):
        """Predict using GLM with feature matrix X.

        Args:
            X (pd.DataFrame, pd.Series): Samples. With the sklearn backend, when feature names were stored
                during fitting and `X` is a DataFrame, only those columns are selected (in the stored order);
                any other input is passed to the pipeline unchanged.
            sample_weight (pd.DataFrame, pd.Series, optional): Test sample weights. Only used by the h2o
                backend, where it is appended to `X` as extra column(s). If the h2o model has an offset
                column and no `sample_weight` is given, a zero-filled column with the offset column's name
                is appended instead.
            **kwargs: Other parameters passed to H2OGeneralizedLinearEstimator.predict().

        Returns:
            array: Returns predicted values.

        Raises:
            Exception: If the model is not fitted yet.
            NotImplementedError: If the backend and the type of the stored model do not match a supported
                combination.
        """
        if not self.__is_fitted():
            raise Exception("This instance is not fitted yet. Call 'fit' before using this estimator.")

        if self.backend == 'sklearn' and isinstance(self.model, Pipeline):
            # `feature_name_` may be missing (fit on unlabeled data) or None; neither case should be
            # used as a column selector, and label-based selection only makes sense for DataFrames.
            names = getattr(self.model, 'feature_name_', None)
            if names is not None and isinstance(X, DataFrame):
                X = X[names]
            return self.model.predict(X)

        if self.backend == 'h2o' and isinstance(self.model, H2OGeneralizedLinearEstimator):
            offset = self.model.parms['offset_column']['actual_value']
            if offset is not None and sample_weight is None:
                # The model expects its offset column to be present; supply a zero offset.
                sample_weight = Series(repeat(0, len(X)), name=offset['column_name'], index=X.index)
            if sample_weight is not None:
                X = concat([X, sample_weight], axis=1)
            h2o_predict = X if isinstance(X, H2OFrame) else to_h2oframe(X)
            return self.model.predict(h2o_predict, **kwargs).as_data_frame().values.reshape(-1)

        raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')



# [docs]
    def coef_norm(self):
        """Output GLM coefficients for standardized data.

        With the sklearn backend the coefficients are labelled with the feature names stored during
        fitting; when no names are available, placeholder names `Variable_<i>` are used for the output
        only (they are not stored on the model).

        Returns:
            dict: {`str`: `float`} Dictionary containing GLM coefficients for standardized data.

        Raises:
            Exception: If the model is not fitted yet, or if it was created with `standardize=False`.
            NotImplementedError: If the backend and the type of the stored model do not match a supported
                combination.
        """
        if not self.__is_fitted():
            raise Exception("This instance is not fitted yet. Call 'fit' before using this estimator.")
        if not self.standardize:
            raise Exception('Normalized coefficients unavailable since model fitted on non-standardized data.')

        if self.backend == 'sklearn' and isinstance(self.model, Pipeline):
            glm = self.model.named_steps['glm']
            # The attribute is only set when the model was fitted on pandas input, so it may be absent.
            names = getattr(self.model, 'feature_name_', None)
            if names is None:
                names = [f'Variable_{i}' for i in range(len(glm.coef_))]
            return dict(zip(['Intercept'] + list(names), [glm.intercept_] + list(glm.coef_)))

        if self.backend == 'h2o' and isinstance(self.model, H2OGeneralizedLinearEstimator):
            return self.model.coef_norm()

        raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')



# [docs]
    def coef(self):
        """Output GLM coefficients for non-standardized data. Also calculated when GLM fitted on standardized data.

        With the sklearn backend and `standardize` enabled, the coefficients fitted on scaled data are
        converted back to the original feature scale using the scaler's `mean_` and `scale_`. Coefficients
        are labelled with the feature names stored during fitting; when no names are available, placeholder
        names `Variable_<i>` are used for the output only (they are not stored on the model).

        Returns:
            dict: {`str`: `float`} Dictionary containing GLM coefficients for non-standardized data.

        Raises:
            Exception: If the model is not fitted yet.
            NotImplementedError: If the backend and the type of the stored model do not match a supported
                combination.
        """
        if not self.__is_fitted():
            raise Exception("This instance is not fitted yet. Call 'fit' before using this estimator.")

        if self.backend == 'sklearn' and isinstance(self.model, Pipeline):
            glm = self.model.named_steps['glm']
            # The attribute is only set when the model was fitted on pandas input, so it may be absent.
            names = getattr(self.model, 'feature_name_', None)
            if names is None:
                names = [f'Variable_{i}' for i in range(len(glm.coef_))]

            if self.standardize:
                scaler = self.model.named_steps['scaler']
                # Use `scale_` (the divisor the scaler really applied) rather than sqrt(var_):
                # for constant features var_ is 0 while scale_ is 1, so this avoids dividing by zero.
                # With x' = (x - mean) / scale:  b0 + sum(b * x') = (b0 - sum(b * mean / scale)) + sum(b / scale * x)
                intercept = glm.intercept_ - sum(glm.coef_ * scaler.mean_ / scaler.scale_)
                values = [intercept] + list(glm.coef_ / scaler.scale_)
            else:
                values = [glm.intercept_] + list(glm.coef_)
            return dict(zip(['Intercept'] + list(names), values))

        if self.backend == 'h2o' and isinstance(self.model, H2OGeneralizedLinearEstimator):
            return self.model.coef()

        raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')



# [docs]
    def coef_to_csv(self, path_or_buf=None, **kwargs):
        """Write GLM coefficients to a comma-separated values (csv) file.

        One column is produced per available coefficient set (standardized and non-standardized).
        A set whose retrieval raises is skipped and the error message is printed.

        Args:
            path_or_buf : str or file handle, default None
                File path or object, if None is provided the result is returned as
                a string.  If a non-binary file object is passed, it should be opened
                with `newline=''`, disabling universal newlines. If a binary
                file object is passed, `mode` might need to contain a `'b'`.
            **kwargs: Other parameters passed to Pandas DataFrame.to_csv method.
        Returns:
            None or str
                If path_or_buf is None, returns the resulting csv format as a
                string. Otherwise returns None. None is also returned (and a message
                printed) when no coefficients could be collected.
        """
        table = DataFrame()
        getters = (
            ('coefficients for standardized data', self.coef_norm),
            ('coefficients for non-standardized data', self.coef),
        )

        for label, getter in getters:
            try:
                values = getter()

                if isinstance(values, dict):
                    table = table.join(Series(values, name=label), how='outer')
            except Exception as err:
                # Catching the Exception base class is deliberate:
                # self.coef_norm() raises exceptions of exactly that class.
                print(err)

        if table.size == 0:
            print('csv file was not created, no available data')
            return None

        kwargs['path_or_buf'] = path_or_buf
        return table.to_csv(**kwargs)

    def __is_fitted(self):
        """Tell whether the wrapped model has been fitted.

        For the sklearn backend the model's own `__sklearn_is_fitted__()` is consulted; for the h2o
        backend the model counts as fitted once its `_model_json` is not None.

        Raises:
            NotImplementedError: if no fitted-check is implemented for the current backend.

        Returns:
            True if model is fitted, False otherwise.

        """
        if self.backend == 'sklearn':
            return self.model.__sklearn_is_fitted__()
        if self.backend == 'h2o':
            return self.model._model_json is not None
        raise NotImplementedError('__is_fitted method does not support chosen backend')