Source code for insolver.model_tools.model_utils

from sklearn.model_selection import train_test_split



[docs]
def train_val_test_split(*arrays, val_size, test_size, random_state=0, shuffle=True, stratify=None):
    """Split arrays into train/validation/test partitions.

    The split is done in two passes of scikit-learn's ``train_test_split()``:
    first the test partition is separated from the data, then the remaining
    part is divided into train and validation partitions.

    Args:
        *arrays (array_like): Arrays to split into train/validation/test sets.
        val_size (float): The proportion of the dataset to include in validation partition.
            If 0, no validation partition is produced.
        test_size (float): The proportion of the dataset to include in test partition.
        random_state (int, optional): Random state, passed to train_test_split() from scikit-learn. (default=0).
        shuffle (bool, optional): Passed to train_test_split() from scikit-learn. (default=True).
        stratify (array_like, optional): Labels used for stratification, given for the whole
            (unsplit) dataset. (default=None).

    Returns:
        list: Partitions grouped by split, keeping the order of ``arrays`` inside each group.

        For two arrays ``(x, y)`` the result is
        ``[x_train, y_train, x_valid, y_valid, x_test, y_test]``, or
        ``[x_train, y_train, x_test, y_test]`` when ``val_size`` is 0.
        For a single array ``x`` it is ``[x_train, x_valid, x_test]`` or ``[x_train, x_test]``.

    Raises:
        ValueError: If no arrays are passed. Other errors raised by train_test_split()
            (e.g. for invalid sizes) are propagated unchanged.
    """
    if not arrays:
        # Same failure mode as train_test_split() itself; checked here explicitly because
        # the stratify labels may be appended to the inputs below.
        raise ValueError("At least one array required as input")

    # The stratification labels describe the full dataset, so they have to be split together
    # with the data: only their train part matches the inputs of the second split.
    first_inputs = arrays if stratify is None else (*arrays, stratify)
    split1 = train_test_split(
        *first_inputs, random_state=random_state, shuffle=shuffle, test_size=test_size, stratify=stratify
    )
    # train_test_split() returns [a_train, a_test, b_train, b_test, ...].
    train, test = split1[0::2], split1[1::2]

    stratify_train = None
    if stratify is not None:
        # The last pair belongs to the labels appended above, not to the caller's arrays.
        stratify_train = train[-1]
        train, test = train[:-1], test[:-1]

    if val_size == 0:
        return [*train, *test]

    split2 = train_test_split(
        *train,
        random_state=random_state,
        shuffle=shuffle,
        # Rescale so that val_size stays a proportion of the original dataset.
        test_size=val_size / (1 - test_size),
        stratify=stratify_train,
    )
    train, valid = split2[0::2], split2[1::2]
    return [*train, *valid, *test]




[docs]
def train_test_column_split(x, y, df_column):
    """Split dataset into train/test partitions w.r.t. a column (pd.Series).

    Rows whose value in the column equals ``'train'`` go to the train partition, rows
    equal to ``'test'`` go to the test partition; rows with any other value are dropped.
    The inputs are not modified.

    Args:
        x (pd.DataFrame): DataFrame containing predictors and the split column.
        y (pd.DataFrame or pd.Series): Target variable.
        df_column (pd.Series): Series for train/test split, assuming it is contained in x
            (it is looked up in ``x`` by its name). For ``y`` the Series is aligned on index.

    Returns:
        tuple: (x_train, x_test, y_train, y_test).

        A tuple of partitions of the initial dataset. The split column is removed from
        the ``x`` partitions (and from DataFrame ``y`` partitions).
    """
    col_name = df_column.name

    if getattr(y, 'ndim', 2) == 1:
        # 1-D target (pd.Series): there is no column to attach or drop, so filter it with
        # index-aligned masks, mirroring the alignment DataFrame column assignment performs.
        labels = df_column if df_column.index.equals(y.index) else df_column.reindex(y.index)
        y_train = y[(labels == 'train').to_numpy()]
        y_test = y[(labels == 'test').to_numpy()]
    else:
        # Work on a copy so the caller's target frame does not gain the helper column.
        y1 = y.copy()
        y1[col_name] = df_column
        y_train = y1[y1[col_name] == 'train'].drop(col_name, axis=1)
        y_test = y1[y1[col_name] == 'test'].drop(col_name, axis=1)

    # x is only read: boolean filtering and drop() both return new objects, no copy needed.
    x_labels = x[col_name]
    return (
        x[x_labels == 'train'].drop(col_name, axis=1),
        x[x_labels == 'test'].drop(col_name, axis=1),
        y_train,
        y_test,
    )