Source code for insolver.discretization.discretizer_utils
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
[docs]
class SklearnDiscretizer:
[docs]
@staticmethod
def _transform(X, n_bins, method):
"""Apply discretizations from scikit-learn.
Args:
X: 1-D array, The data to be descretized.
n_bins (int): The number of bins.
method (string): The method used by scikit-learn's KBinsDiscretizer. Either 'uniform', 'quantile' or
'kmeans'.
Returns:
1-D array, The transformed data.
References:
[1] https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
"""
return KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=method).fit_transform(X).reshape(-1)
[docs]
class CARTDiscretizer:
[docs]
@staticmethod
def _transform(X, y, min_samples_leaf=None, min_tree_depth=1, max_tree_depth=3):
"""Apply CART discretization.
Args:
X: 1-D array, The data to be descretized.
y: 1-D array, The target values.
min_samples_leaf(int): The minimum number of samples required to be at a leaf node. A split point at any
depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left
and right branches.
This may have the effect of smoothing the model, especially in regression.
If int, then consider min_samples_leaf as the minimum number.
If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum
number of samples for each node.
If None, then min_samples_leaf implicitly set to 0.1.
Returns:
1-D array, The transformed data.
References:
[1] Liu, Huan, et al. "Discretization: An enabling technique." Data mining and knowledge discovery 6.4
(2002): 393-423.
"""
X = X.reshape(-1, 1)
min_samples_leaf = 0.1 if min_samples_leaf is None else min_samples_leaf
depths = range(min_tree_depth, max_tree_depth + 1)
roc_auc_scores = []
for tree_depth in depths:
tree_model = DecisionTreeClassifier(max_depth=tree_depth, min_samples_leaf=min_samples_leaf)
scores = cross_val_score(tree_model, X, y, cv=3, scoring='roc_auc')
roc_auc_scores.append(np.mean(scores))
best = depths[np.where(roc_auc_scores == np.max(roc_auc_scores))[0][0]]
tree_model = DecisionTreeClassifier(max_depth=best, min_samples_leaf=min_samples_leaf)
tree_model.fit(X, y)
return tree_model.predict_proba(X)[:, 1]
[docs]
class ChiMergeDiscretizer:
[docs]
def _transform(self, X, y, n_bins):
"""Apply ChiMerge discretization
Args:
X: 1-D array, The data to be descretized.
y: 1-D array, The target values.
n_bins(int): The number of bins.
Returns:
1-D array, The transformed data.
References:
[1] Kerber, Randy. "Chimerge: Discretization of numeric attributes." Proceedings of the tenth national
conference on Artificial intelligence. 1992.
Available from: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf
"""
binned = np.copy(X)
intervals = self.__get_chimerge_intervals(X, y, n_bins)
for i in range(len(intervals)):
binned[(binned >= intervals[i][0]) & (binned <= intervals[i][1])] = i
return binned
@staticmethod
def _get_new_intervals(intervals, min_chi_index):
new_intervals = np.empty((len(intervals) - 1, 2))
found = False
i = 0
k = 0
while k < len(new_intervals):
if not found and i == min_chi_index:
t = np.concatenate((intervals[i], intervals[i + 1]))
new_intervals[k] = np.array([min(t), max(t)])
i += 2
else:
new_intervals[k] = intervals[i]
i += 1
k += 1
return new_intervals
@staticmethod
def _get_chi(values, values_reversed, target, target_unique, intervals, i):
left0 = np.argmax(values == intervals[i][0])
left1 = np.argmax(values == intervals[i + 1][0])
right0 = len(values_reversed) - np.argmax(values_reversed == intervals[i][1])
right1 = len(values_reversed) - np.argmax(values_reversed == intervals[i + 1][1])
interval_0 = target[left0:right0]
interval_1 = target[left1:right1]
a_1 = np.bincount(interval_0, minlength=len(target_unique))
a_2 = np.bincount(interval_1, minlength=len(target_unique))
r_1 = np.sum(a_1)
r_2 = np.sum(a_2)
c_j = np.sum([a_1, a_2], axis=0)
n = np.sum(c_j)
e_1j = r_1 * c_j / n
e_2j = r_2 * c_j / n
chi = np.power(a_1 - e_1j, 2) / e_1j + np.power(a_2 - e_2j, 2) / e_2j
return np.sum(np.nan_to_num(chi))
def _get_vals(self, chi, intervals):
min_chi_index = np.where(chi == np.min(chi))[0][0]
intervals = self._get_new_intervals(intervals, min_chi_index)
idx = np.array([min_chi_index - 1, min_chi_index, min_chi_index + 1])
idx = idx[(idx >= 0) & (idx <= len(chi) - 1)]
chi = np.delete(chi, idx)
return intervals, chi, idx
def __get_chimerge_intervals(self, values, target, max_intervals):
intervals = np.dstack((np.unique(values), np.unique(values)))[0]
target = target[np.argsort(values)]
target_unique = pd.unique(target) # faster than np.unique
values = np.sort(values)
values_reversed = values[::-1]
chi = np.empty(0)
# initial calculation chi2 for single-values intervals
for i in range(len(intervals) - 1):
chi_ = self._get_chi(values, values_reversed, target, target_unique, intervals, i)
chi = np.insert(chi, i, chi_)
intervals, chi, idx = self._get_vals(chi, intervals)
while len(intervals) > max_intervals:
# consequent recalculation chi2 for changed intervals
for i in idx[:-1]:
chi_ = self._get_chi(values, values_reversed, target, target_unique, intervals, i)
chi = np.insert(chi, i, chi_)
intervals, chi, idx = self._get_vals(chi, intervals)
return intervals