Source code for insolver.model_tools.model_utils

from sklearn.model_selection import train_test_split


def train_val_test_split(*arrays, val_size, test_size, random_state=0, shuffle=True, stratify=None):
    """Function for splitting dataset into train/validation/test partitions.

    The split is done in two steps with train_test_split() from scikit-learn: first the test partition
    is separated from the data, then the remainder is divided into train and validation partitions.

    Args:
        *arrays (array_like): Arrays to split into train/validation/test sets containing predictors.
        val_size (float): The proportion of the dataset to include in validation partition.
         If 0, no validation partition is created.
        test_size (float): The proportion of the dataset to include in test partition.
        random_state (int, optional): Random state, passed to train_test_split() from scikit-learn. (default=0).
        shuffle (bool, optional): Passed to train_test_split() from scikit-learn. (default=True).
        stratify (array_like, optional): Labels for stratified splitting, passed to train_test_split() from
         scikit-learn. For the train/validation split only the labels of the rows left after removing the test
         partition are used. (default=None).

    Returns:
        list: A list of partitions of the initial dataset, ordered by partition and then by input array.
        For two arrays x, y the result is [x_train, y_train, x_valid, y_valid, x_test, y_test],
        for a single array x it is [x_train, x_valid, x_test]. If val_size is 0, the validation
        partitions are omitted, e.g. [x_train, y_train, x_test, y_test].

    Raises:
        ValueError: If no arrays are passed.
    """
    if not arrays:
        # Same error train_test_split() raises itself when called without arrays.
        raise ValueError("At least one array required as input")

    # Split the stratification labels together with the data, so that the labels of the
    # remaining (non-test) rows are available for the second split. Adding an array does
    # not change which rows are selected.
    to_split = arrays if stratify is None else (*arrays, stratify)
    first_split = train_test_split(
        *to_split, random_state=random_state, shuffle=shuffle, test_size=test_size, stratify=stratify
    )
    train, test = first_split[0::2], first_split[1::2]

    stratify_train = None
    if stratify is not None:
        # The last pair of partitions belongs to the stratification labels, not to the data.
        stratify_train = train[-1]
        train, test = train[:-1], test[:-1]

    if val_size == 0:
        return [*train, *test]

    # val_size is a share of the whole dataset, so it is rescaled to a share of the remainder.
    second_split = train_test_split(
        *train,
        random_state=random_state,
        shuffle=shuffle,
        test_size=val_size / (1 - test_size),
        stratify=stratify_train,
    )
    train, valid = second_split[0::2], second_split[1::2]
    return [*train, *valid, *test]
def train_test_column_split(x, y, df_column):
    """Function for splitting dataset into train/test partitions w.r.t. a column (pd.Series).

    Rows are assigned to a partition by the value of the split column: 'train' or 'test'.
    The split column itself is removed from every returned partition.

    Args:
        x (pd.DataFrame): DataFrame containing predictors.
        y (pd.DataFrame): DataFrame containing target variable.
        df_column (pd.Series): Series for train/test split, assuming it is contained in x.

    Returns:
        tuple: (x_train, x_test, y_train, y_test). A tuple of partitions of the initial dataset.
    """
    # Work on copies so the caller's objects are left unmodified.
    features = x.copy()
    target = y.copy()
    split_col = df_column.name

    # x is expected to already hold the split column; the target gets it attached here
    # so both frames can be filtered in the same way.
    target[split_col] = df_column

    def _partition(frame, label):
        # Keep rows marked with `label` and remove the marker column.
        selected = frame[frame[split_col] == label]
        return selected.drop(split_col, axis=1)

    x_train = _partition(features, 'train')
    x_test = _partition(features, 'test')
    y_train = _partition(target, 'train')
    y_test = _partition(target, 'test')
    return (x_train, x_test, y_train, y_test)