# Source code for insolver.transforms.autofillna
from numpy import where
# [docs]
class AutoFillNATransforms:
    """Auto Fill NA values.

    Calling an instance on a DataFrame fills (or drops) missing values in place
    and returns the same DataFrame object.

    Parameters:
        numerical_columns (list): List of numerical columns. When empty or None, every column whose dtype name
            is not 'object' is used.
        categorical_columns (list): List of categorical columns. When empty or None, every column whose dtype
            name is 'object' is used.
        numerical_method (str): Fill numerical NA values using this specified method: 'median' (by default), 'mean',
            'mode' or 'remove'
        categorical_method (str): Fill categorical NA values using this specified method: 'frequent' (by default),
            'new_category', 'imputed_column' or 'remove'
        numerical_constants (dict): Dictionary of constants for each numerical column. Constants are applied
            before the method-based fill and are skipped when the method is 'remove'.
        categorical_constants (dict): Dictionary of constants for each categorical column. Constants are applied
            before the method-based fill and are skipped when the method is 'remove'.
        priority (int): Stored on the instance as ``priority``; not used by this class itself.
    """

    def __init__(
        self,
        numerical_columns=None,
        categorical_columns=None,
        numerical_method='median',
        categorical_method='frequent',
        numerical_constants=None,
        categorical_constants=None,
        priority=0,
    ):
        self.priority = priority
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.numerical_constants = numerical_constants
        self.categorical_constants = categorical_constants
        self._num_methods = ['median', 'mean', 'mode', 'remove']
        self._cat_methods = ['frequent', 'new_category', 'imputed_column', 'remove']
        self.numerical_method = numerical_method
        self.categorical_method = categorical_method

    def _find_num_cat_features(self, df):
        """Infer the column lists from dtypes when they were not supplied.

        Columns whose dtype name is 'object' become categorical, all remaining
        columns become numerical. The inferred lists are stored on the instance.
        """
        if not self.categorical_columns:
            self.categorical_columns = [c for c in df.columns if df[c].dtype.name == 'object']
        if not self.numerical_columns:
            self.numerical_columns = [c for c in df.columns if df[c].dtype.name != 'object']

    def _fillna_numerical(self, df):
        """Replace numerical NaN values using specified method.

        The fill value chosen for each column is recorded in ``self.values``.
        A column that is entirely null is filled with ``1``.

        Raises:
            NotImplementedError: If ``numerical_method`` is not a supported method.
        """
        if not self.numerical_columns:
            return

        # Reject an unsupported method before touching ``df`` so that a failed
        # call does not leave the frame half-modified.
        if self.numerical_method not in self._num_methods:
            raise NotImplementedError(f'Method parameter supports values in {self._num_methods}.')

        if self.numerical_method == 'remove':
            df.dropna(subset=self.numerical_columns, inplace=True)
            return

        if self.numerical_constants:
            for column, constant in self.numerical_constants.items():
                # Assign back instead of ``df[column].fillna(..., inplace=True)``:
                # the chained in-place form works on a temporary Series and does
                # not update ``df`` under pandas Copy-on-Write.
                df[column] = df[column].fillna(constant)

        # Kept local on purpose: storing lambdas on ``self`` would make the
        # instance unpicklable and keep a reference to ``df`` alive.
        aggregators = {
            'median': lambda series: series.median(),
            'mean': lambda series: series.mean(),
            'mode': lambda series: series.mode()[0],
        }
        aggregate = aggregators[self.numerical_method]

        self.values = {}
        for column in self.numerical_columns:
            series = df[column]
            if series.isnull().all():
                # No statistic can be computed from an all-null column.
                self.values[column] = 1
            else:
                self.values[column] = aggregate(series)
            df[column] = series.fillna(self.values[column])

    def _fillnan_categorical(self, df):
        """Replace categorical NaN values using specified method.

        For 'frequent' and 'imputed_column' the fill value chosen for each
        column is recorded in ``self.freq_categories``. With 'imputed_column'
        a ``<column>_Imputed`` 0/1 indicator column is added before filling.
        A column without any mode is filled with ``1``.

        Raises:
            NotImplementedError: If ``categorical_method`` is not a supported method.
        """
        if not self.categorical_columns:
            return

        # Reject an unsupported method before touching ``df``.
        if self.categorical_method not in self._cat_methods:
            raise NotImplementedError(f'Method parameter supports values in {self._cat_methods}.')

        if self.categorical_method == 'remove':
            df.dropna(subset=self.categorical_columns, inplace=True)
            return

        if self.categorical_constants:
            for column, constant in self.categorical_constants.items():
                df[column] = df[column].fillna(constant)

        if self.categorical_method == 'new_category':
            for column in self.categorical_columns:
                df[column] = df[column].fillna('Unknown')
            return

        if self.categorical_method == 'imputed_column':
            # Flag the rows that are still missing after the constants were applied.
            for column in self.categorical_columns:
                df[f"{column}_Imputed"] = where(df[column].isnull(), 1, 0)

        self.freq_categories = {}
        for column in self.categorical_columns:
            modes = df[column].mode()
            # ``mode()`` is empty when the column holds no non-null value.
            self.freq_categories[column] = modes[0] if modes.values.size > 0 else 1
            df[column] = df[column].fillna(self.freq_categories[column])

    def __call__(self, df):
        """Fill missing values of ``df`` in place and return ``df``."""
        self._find_num_cat_features(df)
        self._fillna_numerical(df)
        self._fillnan_categorical(df)
        return df