Source code for insolver.transforms.person

import re
import datetime

import pandas as pd


class TransformGenderGetFromName:
    """Derives clients' genders from their Russian second names (patronymics).

    A name ending (case-insensitively) with 'ИЧ' or 'ОГЛЫ' is treated as male,
    one ending with 'НА' or 'КЫЗЫ' as female. Missing names, names shorter than
    two characters and names with any other ending give ``None``.

    Parameters:
        column_name (str): Column name in InsolverDataFrame containing clients' names, column type is string.
        column_gender (str): Column name in InsolverDataFrame for clients' genders.
        gender_male (str): Return value for male gender in InsolverDataFrame, 'male' by default.
        gender_female (str): Return value for female gender in InsolverDataFrame, 'female' by default.
        priority (int): Stored as ``self.priority``, 0 by default; it is not used inside this class.
    """

    def __init__(self, column_name, column_gender, gender_male='male', gender_female='female', priority=0):
        self.column_name = column_name
        self.column_gender = column_gender
        self.gender_male = gender_male
        self.gender_female = gender_female
        self.priority = priority

    @staticmethod
    def _gender(client_name, gender_male, gender_female):
        """Return the gender label for one name, or ``None`` if it cannot be determined."""
        # Null check comes first so that len() is never called on a missing value.
        if pd.isnull(client_name) or len(client_name) < 2:
            return None

        name_upper = client_name.upper()
        if name_upper.endswith(('ИЧ', 'ОГЛЫ')):
            return gender_male
        if name_upper.endswith(('НА', 'КЫЗЫ')):
            return gender_female
        return None

    def __call__(self, df):
        """Write the derived gender into ``column_gender`` of ``df`` and return ``df``."""
        names = df[self.column_name]
        df[self.column_gender] = names.apply(
            lambda name: self._gender(name, self.gender_male, self.gender_female)
        )
        return df
class TransformAgeGetFromBirthday:
    """Gets clients' ages in full years from their birth dates and policies' start dates.

    The age is the number of completed calendar years between the birth date and
    the policy start date. ``None`` is produced when either date is missing, when
    the birth date lies in the future, when the birth year is more than 120 years
    before the current year, or when the birth date is after the start date.

    Parameters:
        column_date_birth (str): Column name in InsolverDataFrame containing clients' birth dates,
            column type is date.
        column_date_start (str): Column name in InsolverDataFrame containing policies' start dates,
            column type is date.
        column_age (str): Column name in InsolverDataFrame for clients' ages in years, column type is int.
        priority (int): Stored as ``self.priority``, 0 by default; it is not used inside this class.
    """

    def __init__(self, column_date_birth, column_date_start, column_age, priority=0):
        self.priority = priority
        self.column_date_birth = column_date_birth
        self.column_date_start = column_date_start
        self.column_age = column_age

    @staticmethod
    def _age_get(datebirth_datestart):
        """Return the age in completed years for one (birth date, start date) pair.

        Parameters:
            datebirth_datestart: Iterable whose first two items are the birth date and the start date
                (a DataFrame row, tuple or list).

        Returns:
            int or None: Age in full years, or ``None`` for missing or implausible dates.
        """
        # Unpack by position instead of `row[0]` / `row[1]`: integer keys on a
        # label-indexed Series are deprecated as positional access in pandas.
        date_birth, date_start = tuple(datebirth_datestart)[:2]

        if pd.isnull(date_birth) or pd.isnull(date_start):
            return None

        now = datetime.datetime.now()
        if date_birth > now or date_birth.year < now.year - 120:
            return None
        if date_birth > date_start:
            return None

        # Calendar comparison instead of `days // 365.25`, which reports one year
        # too few on (and shortly after) many birthdays.
        birthday_reached = (date_start.month, date_start.day) >= (date_birth.month, date_birth.day)
        return date_start.year - date_birth.year - (0 if birthday_reached else 1)

    def __call__(self, df):
        """Write the computed ages into ``column_age`` of ``df`` and return ``df``."""
        df[self.column_age] = df[[self.column_date_birth, self.column_date_start]].apply(self._age_get, axis=1)
        return df
class TransformAge:
    """Cleans and caps drivers' minimum ages in years.

    Ages below ``age_min`` are considered invalid and become ``None``; ages above
    ``age_max`` are grouped into ``age_max``. Missing ages stay ``None``.

    Parameters:
        column_driver_minage (str): Column name in InsolverDataFrame containing drivers' minimum ages in years,
            column type is integer.
        age_min (int): Minimum value of drivers' age in years, lower values are invalid, 18 by default.
        age_max (int): Maximum value of drivers' age in years, bigger values will be grouped, 70 by default.
        priority (int): Stored as ``self.priority``, 1 by default; it is not used inside this class.
    """

    def __init__(self, column_driver_minage, age_min=18, age_max=70, priority=1):
        self.column_driver_minage = column_driver_minage
        self.age_min = age_min
        self.age_max = age_max
        self.priority = priority

    @staticmethod
    def _age(age, age_min, age_max):
        """Return ``None`` for a missing or too-small age, otherwise the age capped at ``age_max``."""
        # The null test short-circuits so a missing value is never compared.
        if pd.isnull(age) or age < age_min:
            return None
        return age_max if age > age_max else age

    def __call__(self, df):
        """Transform ``column_driver_minage`` of ``df`` in place and return ``df``."""
        target = self.column_driver_minage
        lower, upper = self.age_min, self.age_max
        df[target] = df[target].apply(lambda value: self._age(value, lower, upper))
        return df
class TransformAgeGender:
    """Gets intersections of drivers' minimum ages and genders.

    For a male client the age goes to ``column_age_m`` and ``age_default`` to
    ``column_age_f``; for a female client it is the other way round. When the age
    or the gender is missing, or the gender matches neither configured value,
    both output values are ``None``.

    Parameters:
        column_age (str): Column name in InsolverDataFrame containing clients' ages in years,
            column type is integer.
        column_gender (str): Column name in InsolverDataFrame containing clients' genders.
        column_age_m (str): Column name in InsolverDataFrame for males' ages, for females default value
            is applied, column type is integer.
        column_age_f (str): Column name in InsolverDataFrame for females' ages, for males default value
            is applied, column type is integer.
        age_default (int): Default value of the age in years, 18 by default.
        gender_male: Value for male gender in InsolverDataFrame, 'male' by default.
        gender_female: Value for female gender in InsolverDataFrame, 'female' by default.
        priority (int): Stored as ``self.priority``, 2 by default; it is not used inside this class.
    """

    def __init__(
        self,
        column_age,
        column_gender,
        column_age_m,
        column_age_f,
        age_default=18,
        gender_male='male',
        gender_female='female',
        priority=2,
    ):
        self.priority = priority
        self.column_age = column_age
        self.column_gender = column_gender
        self.column_age_m = column_age_m
        self.column_age_f = column_age_f
        self.age_default = age_default
        self.gender_male = gender_male
        self.gender_female = gender_female

    @staticmethod
    def _age_gender(age_gender, age_default, gender_male, gender_female):
        """Split one (age, gender) pair into ``[age_m, age_f]``.

        Parameters:
            age_gender: Iterable whose first two items are the age and the gender
                (a DataFrame row, tuple or list).
            age_default: Age assigned to the gender the client does not belong to.
            gender_male: Value identifying a male client.
            gender_female: Value identifying a female client.

        Returns:
            list: ``[age_m, age_f]``; both are ``None`` when the pair cannot be classified.
        """
        # Unpack by position instead of `row[0]` / `row[1]`: integer keys on a
        # label-indexed Series are deprecated as positional access in pandas.
        age, gender = tuple(age_gender)[:2]

        if pd.isnull(age) or pd.isnull(gender):
            return [None, None]
        if gender == gender_male:
            return [age, age_default]
        if gender == gender_female:
            return [age_default, age]
        return [None, None]

    def __call__(self, df):
        """Fill ``column_age_m`` and ``column_age_f`` of ``df`` and return ``df``."""
        # Build the pairs row by row; unlike `apply(axis=1).to_frame()`, this
        # also works when `df` has no rows.
        pairs = [
            self._age_gender(pair, self.age_default, self.gender_male, self.gender_female)
            for pair in zip(df[self.column_age], df[self.column_gender])
        ]
        df[self.column_age_m] = [pair[0] for pair in pairs]
        df[self.column_age_f] = [pair[1] for pair in pairs]
        return df
class TransformNameCheck:
    """Checks if clients' first names are in a special list.

    Names may be concatenations of surnames, first names and last names. The
    comparison is case-insensitive: both the list and the checked names are
    upper-cased before matching.

    Parameters:
        column_name (str): Column name in InsolverDataFrame containing clients' names, column type is string.
        column_name_check (str): Column name in InsolverDataFrame for the 1/0 flag telling whether
            the first name is in the list.
        names_list (list): The list of clients' first names.
        name_full (bool): Sign if name is the concatenation of surname, first name and last name,
            False by default.
        name_position (int): The position of the name in full name. For example, argument should be 0
            for notation such as 'John Doe', but 1 for notation like 'Ivanov Ivan'. 1 by default.
        priority (int): Stored as ``self.priority``, 1 by default; it is not used inside this class.
    """

    # Word tokens (letters, digits, underscore and hyphen); compiled once for all calls.
    _TOKEN_RE = re.compile(r'[\w\-]+', re.I)

    def __init__(self, column_name, column_name_check, names_list, name_full=False, name_position=1, priority=1):
        self.priority = priority
        self.column_name = column_name
        self.name_full = name_full
        self.column_name_check = column_name_check
        self.name_position = name_position
        self.names_list = [n.upper() for n in names_list]

    @staticmethod
    def _name_get(client_name, name_position):
        """Return the upper-cased token at ``name_position`` of ``client_name``.

        Returns the string 'ERROR' when the name has no token at that position.
        """
        tokens = TransformNameCheck._TOKEN_RE.findall(str(client_name))
        try:
            return tokens[name_position].upper()
        except IndexError:
            return 'ERROR'

    def __call__(self, df):
        """Write the 1/0 membership flag into ``column_name_check`` of ``df`` and return ``df``."""
        if self.name_full:
            # `_name_get` needs the token position; omitting it raised TypeError.
            names = df[self.column_name].apply(self._name_get, args=(self.name_position,))
        else:
            # `names_list` is upper-cased in __init__, so normalise the column the
            # same way; non-string values are passed through untouched.
            names = df[self.column_name].apply(lambda value: value.upper() if isinstance(value, str) else value)
        df[self.column_name_check] = 1 * names.isin(self.names_list)
        return df