# Source code for insolver.discretization.discretizer

from pandas import DataFrame, Series
from numpy import log10, log2, std, percentile, subtract, sqrt, power, ndarray, array, min as npmin, max as npmax
from insolver.discretization.discretizer_utils import SklearnDiscretizer, CARTDiscretizer, ChiMergeDiscretizer



[docs]
class InsolverDiscretizer:
    """Transform a continuous variable into discrete form.

    Parameters:
        method (str): The method used to discretize. Should be in
          {'uniform', 'quantile', 'kmeans', 'cart', 'chimerge'}.

    """

    _methods = ['uniform', 'quantile', 'kmeans', 'cart', 'chimerge']

    _n_bins_formula = (
        'square-root',
        'sturges',
        'huntsberger',
        'brooks-carrther',
        'cencov',
        'rice-rule',
        'terrell-scott',
        'scott',
        'freedman-diaconis',
    )

    # Smallest bin count `transform` accepts for an explicit integer `n_bins`;
    # formula-derived counts are clamped to the same lower bound.
    _min_n_bins = 2

    def __init__(self, method='uniform'):
        if method not in self._methods:
            raise NotImplementedError(f'Accepted methods are {self._methods}, got {method} instead.')

        self.method = method
        self.X = None

    def transform(self, X, y=None, n_bins=None, min_samples_leaf=None):
        """Apply discretization to given data.

        Args:
            X: 1-D array, The data to be discretized.
            y: 1-D array, The target values, ignored for unsupervised transformations.
            n_bins (int, str): The number of bins; Either integer number greater than 1 or value in
              {'square-root', 'sturges', 'huntsberger', 'brooks-carrther', 'cencov', 'rice-rule',
              'terrell-scott', 'scott', 'freedman-diaconis'}.
            min_samples_leaf (int, float):  The minimum number of samples required to be at a leaf
              node. Used for 'cart' method only, ignored otherwise.

        Returns:
            1-D array, The transformed data.

        Raises:
            ValueError: If X or y has an unsupported type or shape, or if n_bins is invalid for
              the 'uniform', 'quantile' and 'kmeans' methods.

        Examples:

        Unsupervised discretization

        >>> import numpy as np
        >>> from insolver.discretization import InsolverDiscretizer
        >>> X = np.array([85, 90, 78, 96, 80, 70, 65, 95])
        >>> insolverDisc = InsolverDiscretizer(method='uniform')
        >>> insolverDisc.transform(X, n_bins=3)
        array([1., 2., 1., 2., 1., 0., 0., 2.])

        Supervised discretization

        >>> import numpy as np
        >>> from insolver.discretization import InsolverDiscretizer
        >>> X = np.array([85, 90, 78, 96, 80, 70, 65, 95])
        >>> y = np.array([1, 0, 1, 0, 0, 1, 1, 1])
        >>> insolverDisc = InsolverDiscretizer(method='chimerge')
        >>> insolverDisc.transform(X, y, n_bins=3)
        array([1, 1, 0, 2, 1, 0, 0, 1], dtype=int64)

        """

        self.X = X
        self.__check_X_type()

        if self.method in ('uniform', 'quantile', 'kmeans'):
            if not ((isinstance(n_bins, int) and n_bins > 1) or n_bins in self._n_bins_formula):
                raise ValueError(
                    'Invalid number of bins. '
                    f'Accepted integer value or one of the following options: {self._n_bins_formula},'
                    f'got {n_bins} instead.'
                )

            len_X = self.__check_X_shape()

            # A formula name is resolved to a concrete bin count before delegating.
            if n_bins in self._n_bins_formula:
                n_bins = self.__calculate_n_bins(n_bins, len_X)

            return SklearnDiscretizer._transform(self.X, n_bins, self.method)

        if self.method == 'cart':
            y = self.__check_y(y)
            return CARTDiscretizer._transform(self.X, y, min_samples_leaf)

        if self.method == 'chimerge':
            y = self.__check_y(y)
            return ChiMergeDiscretizer()._transform(self.X, y, n_bins)

    def __check_y(self, y):
        """Validate the target and return it as a 1-D numpy array.

        Args:
            y: pandas DataFrame or Series, list or numpy array holding the target.

        Returns:
            numpy array of shape (n_samples,).

        Raises:
            ValueError: If y has an unsupported type, is neither 1-D nor a single column,
              or its length differs from the first dimension of X.
        """
        if isinstance(y, (DataFrame, Series)):
            y = y.values
        elif isinstance(y, list):
            y = array(y)
        elif not isinstance(y, ndarray):
            raise ValueError(
                'Invalid target type. '
                'Accepted pandas DataFrame and Series instancies, list and numpy array, got '
                f'{type(y)} instead.'
            )

        n_samples = self.X.shape[0]
        given_shape = y.shape

        # A single-column 2-D target is as valid as a flat one: flatten it.
        if y.ndim == 2 and given_shape[1] == 1:
            y = y.reshape(-1)

        if y.ndim != 1 or y.shape[0] != n_samples:
            raise ValueError(
                'Invalid target shape. Expected 1-D array with shape '
                f'{(n_samples,)} or {(n_samples, 1)}, got {given_shape} instead'
            )

        return y

    def __calculate_n_bins(self, n_bins, len_X):
        """Calculate number of bins.

        Args:
            n_bins(string): The formula to calculate number of bins.
            len_X(int): length of X.

        Returns:
            int, The number of bins, never less than 2.

        Raises:
            ValueError: If n_bins is not a known formula name.

        References:
            Cebeci, Z. and Yıldız, F. (2017) Unsupervised Discretization of Continuous Variables in a Chicken Egg
            Quality Traits Dataset. Turkish Journal of Agriculture-Food Science and Technology, 5.4, 315-320.
            Available from: http://agrifoodscience.com/index.php/TURJAF/article/view/1056
        """
        if n_bins == 'square-root':
            estimate = sqrt(len_X)

        elif n_bins == 'sturges':
            estimate = 1 + log2(len_X)

        elif n_bins == 'huntsberger':
            estimate = 1 + 3.322 * log10(len_X)

        elif n_bins == 'brooks-carrther':
            estimate = 5 * log10(len_X)

        elif n_bins == 'cencov':
            estimate = power(len_X, 1 / 3)

        elif n_bins == 'rice-rule':
            estimate = power(len_X, 1 / 3) * 2

        elif n_bins == 'terrell-scott':
            estimate = power(2 * len_X, 1 / 3)

        elif n_bins in ('scott', 'freedman-diaconis'):
            # Both rules derive a bin width h and use (max - min) / h bins.
            if n_bins == 'scott':
                bin_width = 3.5 * std(self.X) / power(len_X, 1 / 3)
            else:
                iqr = subtract(*percentile(self.X, [75, 25]))
                bin_width = 2 * iqr / power(len_X, 1 / 3)

            # Zero spread (constant data or zero IQR) gives no usable width;
            # fall back to the minimum instead of dividing by zero.
            if not bin_width > 0:
                return self._min_n_bins

            estimate = (npmax(self.X) - npmin(self.X)) / bin_width

        else:
            raise ValueError(f'Unknown formula for the number of bins: {n_bins}.')

        # Small samples can round to 0 or 1, which is below the accepted minimum.
        return max(self._min_n_bins, int(round(estimate)))

    def __check_X_shape(self):
        """Check shape of X and store it as a column-oriented 2-D array.

        A 1-D array or a single-row 2-D array is reshaped to (n, 1); any other
        2-D array is kept unchanged.

        Returns:
            int, length of X.

        Raises:
            ValueError: If X is neither 1-D nor 2-D.
        """
        n_dims = len(self.X.shape)
        if n_dims not in (1, 2):
            raise ValueError(f'Expected 1D or 2D array, got shape={self.X.shape} instead.')

        if n_dims == 1 or self.X.shape[0] == 1:
            self.X = self.X.reshape(-1, 1)

        return self.X.shape[0]

    def __check_X_type(self):
        """Convert X to a numpy array.

        Raises:
            ValueError: If X is not a pandas DataFrame or Series, a list or a numpy array.
        """
        if isinstance(self.X, (DataFrame, Series)):
            self.X = self.X.values
        elif isinstance(self.X, list):
            self.X = array(self.X)
        elif not isinstance(self.X, ndarray):
            raise ValueError(
                'Invalid data type. '
                'Accepted pandas DataFrame and Series instancies, list and numpy array, got '
                f'{type(self.X)} instead.'
            )