Source code for insolver.transforms.grouping_sorting

from pandas import DataFrame


[docs] class TransformParamUselessGroup: """Groups all parameter's values with few data to one group. Parameters: column_param (str): Column name in InsolverDataFrame containing parameter. size_min (int): Minimum allowed number of records for each parameter value, 1000 by default. group_name: Name of the group for parameter's values with few data. inference (bool): Sign if the transformation is used for inference, False by default. param_useless (list): The list of useless values of the parameter, for inference only. """ def __init__(self, column_param, size_min=1000, group_name=0, inference=False, param_useless=None, priority=1): self.priority = priority self.column_param = column_param self.size_min = size_min self.group_name = group_name self.inference = inference if inference and param_useless is not None: self.param_useless = param_useless else: self.param_useless = []
[docs] @staticmethod def _param_useless_get(df, column_param, size_min): """Checks the amount of data for each parameter's value. Args: df: InsolverDataFrame to explore. column_param (str): Column name in InsolverDataFrame containing parameter. size_min (int): Minimum allowed number of records for each parameter's value, 1000 by default. Returns: list: List of parameter's values with few data. """ param_size = DataFrame(df.groupby(column_param).size().reset_index(name='param_size')) param_useless = list(param_size[column_param].loc[param_size['param_size'] < size_min]) return param_useless
def __call__(self, df): if self.param_useless == list(): self.param_useless = self._param_useless_get(df, self.column_param, self.size_min) df.loc[df[self.column_param].isin(self.param_useless), self.column_param] = self.group_name return df
[docs] class TransformParamSortFreq: """Gets sorted by claims' frequency parameter's values. Parameters: column_param (str): Column name in InsolverDataFrame containing parameter. column_param_sort_freq (str): Column name in InsolverDataFrame for sorted values of parameter, column type is integer. column_policies_count (str): Column name in InsolverDataFrame containing numbers of policies, column type is integer or float. column_claims_count (str): Column name in InsolverDataFrame containing numbers of claims, column type is integer or float. inference (bool): Sign if the transformation is used for inference, False by default. param_freq_dict (dict): The dictionary of sorted values of the parameter, for inference only. """ def __init__( self, column_param, column_param_sort_freq, column_policies_count, column_claims_count, inference=False, param_freq_dict=None, priority=2, ): self.priority = priority self.column_param = column_param self.column_param_sort_freq = column_param_sort_freq self.column_policies_count = column_policies_count self.column_claims_count = column_claims_count self.param_freq = DataFrame self.inference = inference if inference and param_freq_dict is not None: self.param_freq_dict = param_freq_dict else: self.param_freq_dict = {} def __call__(self, df): if self.param_freq_dict == dict(): self.param_freq = df.groupby([self.column_param]).sum()[ [self.column_claims_count, self.column_policies_count] ] self.param_freq['freq'] = ( self.param_freq[self.column_claims_count] / self.param_freq[self.column_policies_count] ) keys = [] values = [] for i in enumerate(self.param_freq.sort_values('freq', ascending=False).index.values): keys.append(i[1]) values.append(float(i[0])) self.param_freq_dict = dict(zip(keys, values)) df[self.column_param_sort_freq] = df[self.column_param].map(self.param_freq_dict) return df
[docs] class TransformParamSortAC: """Gets sorted by claims' average sum parameter's values. Parameters: column_param (str): Column name in InsolverDataFrame containing parameter. column_param_sort_ac (str): Column name in InsolverDataFrame for sorted values of parameter, column type is integer. column_claims_count (str): Column name in InsolverDataFrame containing numbers of claims, column type is integer or float. column_claims_sum (str): Column name in InsolverDataFrame containing sums of claims, column type is integer or float. inference (bool): Sign if the transformation is used for inference, False by default. param_ac_dict (dict): The dictionary of sorted values of the parameter, for inference only. """ def __init__( self, column_param, column_param_sort_ac, column_claims_count, column_claims_sum, inference=False, param_ac_dict=None, priority=2, ): self.priority = priority self.column_param = column_param self.column_param_sort_ac = column_param_sort_ac self.column_claims_count = column_claims_count self.column_claims_sum = column_claims_sum self.param_ac = DataFrame self.inference = inference if inference and param_ac_dict is not None: self.param_ac_dict = param_ac_dict else: self.param_ac_dict = {} def __call__(self, df): if self.param_ac_dict == dict(): self.param_ac = df.groupby([self.column_param]).sum()[[self.column_claims_sum, self.column_claims_count]] self.param_ac['avg_claim'] = self.param_ac[self.column_claims_sum] / self.param_ac[self.column_claims_count] keys = [] values = [] for i in enumerate(self.param_ac.sort_values('avg_claim', ascending=False).index.values): keys.append(i[1]) values.append(float(i[0])) self.param_ac_dict = dict(zip(keys, values)) df[self.column_param_sort_ac] = df[self.column_param].map(self.param_ac_dict) return df