Source code for insolver.discretization.discretizer

from pandas import DataFrame, Series
from numpy import log10, log2, std, percentile, subtract, sqrt, power, ndarray, array, min as npmin, max as npmax
from insolver.discretization.discretizer_utils import SklearnDiscretizer, CARTDiscretizer, ChiMergeDiscretizer


class InsolverDiscretizer:
    """Transform a continuous variable into discrete form.

    Parameters:
        method (str): The method used to discretize. Should be in
            {'uniform', 'quantile', 'kmeans', 'cart', 'chimerge'}.

    Raises:
        NotImplementedError: If ``method`` is not one of the accepted methods.
    """

    _methods = ['uniform', 'quantile', 'kmeans', 'cart', 'chimerge']
    _n_bins_formula = (
        'square-root',
        'sturges',
        'huntsberger',
        'brooks-carrther',
        'cencov',
        'rice-rule',
        'terrell-scott',
        'scott',
        'freedman-diaconis',
    )

    def __init__(self, method='uniform'):
        if method not in self._methods:
            raise NotImplementedError(f'Accepted methods are {self._methods}, got {method} instead.')
        self.method = method
        self.X = None

    def transform(self, X, y=None, n_bins=None, min_samples_leaf=None):
        """Apply discretization to given data.

        Args:
            X: 1-D array, The data to be discretized. Accepts pandas DataFrame/Series,
                list or numpy array.
            y: 1-D array, The target values. Required for the 'cart' and 'chimerge'
                methods, ignored for unsupervised transformations.
            n_bins (int, str): The number of bins. For the 'uniform', 'quantile' and
                'kmeans' methods it must be either an integer greater than 1 or one of
                {'square-root', 'sturges', 'huntsberger', 'brooks-carrther', 'cencov',
                'rice-rule', 'terrell-scott', 'scott', 'freedman-diaconis'}.
                For 'chimerge' the value is forwarded as is; for 'cart' it is ignored.
            min_samples_leaf (int, float): The minimum number of samples required to be
                at a leaf node. Used for 'cart' method only, ignored otherwise.

        Returns:
            1-D array, The transformed data.

        Raises:
            ValueError: If X or y has an unsupported type or shape, or if ``n_bins`` is
                invalid for an unsupervised method.

        Examples:
            Unsupervised discretization

            >>> import numpy as np
            >>> from insolver.discretization import InsolverDiscretizer
            >>> X = np.array([85, 90, 78, 96, 80, 70, 65, 95])
            >>> insolverDisc = InsolverDiscretizer(method='uniform')
            >>> insolverDisc.transform(X, n_bins=3)
            array([1., 2., 1., 2., 1., 0., 0., 2.])

            Supervised discretization

            >>> import numpy as np
            >>> from insolver.discretization import InsolverDiscretizer
            >>> X = np.array([85, 90, 78, 96, 80, 70, 65, 95])
            >>> y = np.array([1, 0, 1, 0, 0, 1, 1, 1])
            >>> insolverDisc = InsolverDiscretizer(method='chimerge')
            >>> insolverDisc.transform(X, y, n_bins=3)
            array([1, 1, 0, 2, 1, 0, 0, 1], dtype=int64)
        """
        self.X = X
        self.__check_X_type()

        if self.method in ('uniform', 'quantile', 'kmeans'):
            is_valid_count = isinstance(n_bins, int) and n_bins > 1
            if not (is_valid_count or n_bins in self._n_bins_formula):
                raise ValueError(
                    'Invalid number of bins. '
                    f'Accepted integer value or one of the following options: {self._n_bins_formula},'
                    f'got {n_bins} instead.'
                )

            # Shape normalisation must happen before the formulas run: they read self.X.
            len_X = self.__check_X_shape()
            if n_bins in self._n_bins_formula:
                n_bins = self.__calculate_n_bins(n_bins, len_X)
            return SklearnDiscretizer._transform(self.X, n_bins, self.method)

        if self.method == 'cart':
            y = self.__check_y(y)
            return CARTDiscretizer._transform(self.X, y, min_samples_leaf)

        if self.method == 'chimerge':
            y = self.__check_y(y)
            return ChiMergeDiscretizer()._transform(self.X, y, n_bins)

    def __check_y(self, y):
        """Validate the target and return it as a 1-D numpy array.

        Args:
            y: The target values; pandas DataFrame or Series, list or numpy array.

        Returns:
            numpy.ndarray, 1-D target with as many entries as ``self.X`` has rows.

        Raises:
            ValueError: If the type is unsupported, or the shape is neither ``(n,)`` nor
                ``(n, 1)`` with ``n == self.X.shape[0]``.
        """
        if isinstance(y, DataFrame):
            y = y.values.reshape(-1)
        elif isinstance(y, Series):
            y = y.values
        elif isinstance(y, list):
            y = array(y)
        elif not isinstance(y, ndarray):
            raise ValueError(
                'Invalid target type. '
                'Accepted pandas DataFrame and Series instancies, list and numpy array, got '
                f'{type(y)} instead.'
            )

        # Remember the incoming shape so the error message reports what the caller passed.
        given_shape = y.shape
        # A single-column 2-D target is documented as acceptable: flatten it to 1-D.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.reshape(-1)

        if y.ndim != 1 or y.shape[0] != self.X.shape[0]:
            raise ValueError(
                'Invalid target shape. Expected 1-D array with shape '
                f'{(self.X.shape[0],)} or {(self.X.shape[0], 1)}, got {given_shape} instead'
            )
        return y

    def __calculate_n_bins(self, n_bins, len_X):
        """Calculate number of bins.

        Args:
            n_bins (str): The name of the formula used to calculate number of bins,
                one of ``_n_bins_formula``.
            len_X (int): length of X.

        Returns:
            int, The number of bins.

        Raises:
            ValueError: If the formula name is unknown, or a width-based rule
                ('scott', 'freedman-diaconis') yields a non-positive bin width.

        References:
            Cebeci, Z. and Yıldız, F. (2017) Unsupervised Discretization of Continuous
            Variables in a Chicken Egg Quality Traits Dataset. Turkish Journal of
            Agriculture-Food Science and Technology, 5.4, 315-320. Available from:
            http://agrifoodscience.com/index.php/TURJAF/article/view/1056
        """
        if n_bins == 'square-root':
            estimate = sqrt(len_X)
        elif n_bins == 'sturges':
            estimate = 1 + log2(len_X)
        elif n_bins == 'huntsberger':
            estimate = 1 + 3.322 * log10(len_X)
        elif n_bins == 'brooks-carrther':
            estimate = 5 * log10(len_X)
        elif n_bins == 'cencov':
            estimate = pow(len_X, 1 / 3)
        elif n_bins == 'rice-rule':
            estimate = power(len_X, 1 / 3) * 2
        elif n_bins == 'terrell-scott':
            estimate = power(2 * len_X, 1 / 3)
        elif n_bins == 'scott':
            # Scott's rule gives a bin *width* h = 3.5 * std * n^(-1/3);
            # the number of bins is the data range divided by that width.
            width = 3.5 * std(self.X) * power(len_X, -1 / 3)
            estimate = self.__range_over_width(width, n_bins)
        elif n_bins == 'freedman-diaconis':
            iqr = subtract(*percentile(self.X, [75, 25]))
            width = 2 * iqr / power(len_X, 1 / 3)
            estimate = self.__range_over_width(width, n_bins)
        else:
            raise ValueError(f'Unknown formula for the number of bins: {n_bins}.')

        # Coerce numpy scalars to a built-in int regardless of the numpy version.
        return int(round(float(estimate)))

    def __range_over_width(self, width, formula):
        """Return (max(X) - min(X)) / width, rejecting a non-positive or NaN width."""
        if not width > 0:
            raise ValueError(
                f"Cannot apply the '{formula}' formula: the computed bin width is {width}, "
                'the data has no spread.'
            )
        return (npmax(self.X) - npmin(self.X)) / width

    def __check_X_shape(self):
        """Check shape of X and normalise it to a column layout.

        A 1-D array, or a 2-D array with a single row, is reshaped to ``(-1, 1)``.

        Returns:
            int, length of X.

        Raises:
            ValueError: If X is neither 1-D nor 2-D.
        """
        n_dims = len(self.X.shape)
        if n_dims not in (1, 2):
            raise ValueError(f'Expected 1D or 2D array, got shape={self.X.shape} instead.')
        if n_dims == 1 or self.X.shape[0] == 1:
            self.X = self.X.reshape(-1, 1)
        return self.X.shape[0]

    def __check_X_type(self):
        """Check X type and convert it to a numpy array in place.

        Raises:
            ValueError: If X is not a pandas DataFrame/Series, list or numpy array.
        """
        if isinstance(self.X, (DataFrame, Series)):
            self.X = self.X.values
        elif isinstance(self.X, list):
            self.X = array(self.X)
        elif not isinstance(self.X, ndarray):
            raise ValueError(
                'Invalid data type. '
                'Accepted pandas DataFrame and Series instancies, list and numpy array, got '
                f'{type(self.X)} instead.'
            )