Source code for qolmat.imputations.preprocessing

"""Script for preprocessing functions."""

import copy
from typing import Dict, Hashable, List, Optional, Tuple

import numpy as np
import pandas as pd
from category_encoders.one_hot import OneHotEncoder
from numpy.typing import NDArray
from sklearn import utils as sku
from sklearn.base import (
    BaseEstimator,
    RegressorMixin,
    TransformerMixin,
)
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import InputTags
from sklearn.utils.validation import (
    check_is_fitted,
)

# from typing_extensions import Self
from qolmat.utils import utils


class MixteHGBM(RegressorMixin, BaseEstimator):
    """Histogram gradient boosting estimator that adapts to the target type.

    This custom scikit-learn estimator delegates to
    HistGradientBoostingClassifier when the target holds strings and to
    HistGradientBoostingRegressor when the target is numeric.
    """

    def __init__(self):
        super().__init__()

    def set_model_parameters(self, **args_model):
        """Set the arguments of the underlying model.

        The arguments are stored on the instance and forwarded to the
        underlying model the next time ``fit`` is called.

        Parameters
        ----------
        **args_model : dict
            Additional keyword arguments to be passed to the underlying models.

        Notes
        -----
        ``args_model`` is a plain attribute, not a constructor parameter, so
        it is not reported by ``get_params`` and a copy made with
        ``sklearn.base.clone`` will not carry it over.

        """
        self.args_model = args_model

    def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM":
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors. Sparse input is rejected; NaN is allowed.
        y : array-like, shape (n_samples,)
            Target values. String targets train a classifier, numeric
            targets train a regressor.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        TypeError
            If ``y`` is neither of string nor of numeric dtype.

        """
        X, y = sku.validation.validate_data(
            self,
            X,
            y,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=True,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        self.n_features_in_ = X.shape[1]
        # Parameters are optional: they only exist if
        # set_model_parameters was called beforehand.
        args_model = getattr(self, "args_model", {})
        if pd.api.types.is_string_dtype(y):
            model = HistGradientBoostingClassifier(**args_model)
        elif pd.api.types.is_numeric_dtype(y):
            model = HistGradientBoostingRegressor(**args_model)
        else:
            raise TypeError("Unknown label type")

        self.model_ = model.fit(X, y)
        # Flag the estimator as fitted only once training succeeded, so a
        # failed fit cannot leave it marked as usable.
        self.is_fitted_ = True
        return self

    def predict(self, X: NDArray) -> NDArray:
        """Predict using the fitted model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Samples. Sparse input is rejected; NaN is allowed.

        Returns
        -------
        y_pred : array-like, shape (n_samples,)
            Predicted target values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the estimator has not been fitted yet.

        """
        # Check the fitted state first so that an unfitted estimator always
        # reports NotFittedError rather than an input validation error.
        check_is_fitted(self, "is_fitted_")
        # The validated copy is discarded on purpose: the call only checks
        # the input, and the raw X is what gets forwarded to the model.
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=False,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        return self.model_.predict(X)

    def __sklearn_tags__(self):
        """Indicate if the class allows inputs with categorical data and nans.

        It modifies the behaviour of the functions checking data: the input
        tags declare 2D arrays, categorical data, strings and NaN as
        accepted, the target is flagged as not restricted to a single
        output, and the estimator is flagged as non-deterministic.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags = InputTags(
            two_d_array=True, categorical=True, string=True, allow_nan=True
        )
        tags.target_tags.single_output = False
        tags.non_deterministic = True
        return tags


class BinTransformer(TransformerMixin, BaseEstimator):
    """BinTransformer class.

    Learn the possible values of the provided numerical feature,
    allowing to transform new values to the closest existing one.

    Parameters
    ----------
    cols : Optional[List], default=None
        Columns whose values are learned. When None, every column of
        numeric dtype found at fit time is used.
    """

    def __init__(self, cols: Optional[List] = None):
        super().__init__()
        self.cols = cols

    def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer":
        """Fit the BinTransformer to X.

        For each selected column, the sorted distinct non-missing values are
        stored together with the lower edge of their bin, i.e. the midpoint
        between a value and the previous one (-inf for the smallest value).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the unique values.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Fitted transformer.

        """
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=True,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        # NOTE(review): _validate_input presumably returns X as a DataFrame;
        # the code below relies on DataFrame attributes.
        df = utils._validate_input(X)
        self.feature_names_in_ = df.columns
        self.n_features_in_ = len(df.columns)
        self.dict_df_bins_: Dict[Hashable, pd.DataFrame] = {}
        if self.cols is None:
            cols = df.select_dtypes(include="number").columns
        else:
            cols = self.cols
        for col in cols:
            observed = df[col].dropna()
            df_bins = pd.DataFrame({"value": np.sort(observed.unique())})
            # Lower bin edge = midpoint with the previous learned value; the
            # first row has no predecessor and gets NaN, replaced by -inf.
            df_bins["min"] = (df_bins["value"] + df_bins["value"].shift()) / 2
            self.dict_df_bins_[col] = df_bins.fillna(-np.inf)
        return self

    def transform(self, X: NDArray) -> NDArray:
        """Transform X to existing values learned during fit.

        Columns without learned values (not selected at fit time, or
        containing only missing values at fit time) are returned unchanged.
        Missing entries stay missing.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        X_out : array-like of shape (n_samples, n_features)
            Transformed input, as an ndarray when X is an ndarray and as a
            DataFrame otherwise.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the transformer has not been fitted yet.

        """
        # Fail early with NotFittedError before touching the input.
        check_is_fitted(self)
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=False,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        df = utils._validate_input(X)
        df_out = df.copy()
        for col in df:
            df_bins = self.dict_df_bins_.get(col)
            # An empty table means the column had no observed value at fit
            # time: there is nothing to project onto, so leave it untouched
            # instead of failing on the lookup below.
            if df_bins is None or df_bins.empty:
                continue
            values = df[col]
            # digitize returns the 1-based index of the bin whose lower edge
            # is the last one not above the value; shift to 0-based rows.
            bins_X = np.digitize(values, df_bins["min"]) - 1
            values_out = df_bins.loc[bins_X, "value"]
            values_out.index = values.index
            # Restore the missing entries of the input.
            df_out[col] = values_out.where(values.notna(), np.nan)
        if isinstance(X, np.ndarray):
            return df_out.values
        return df_out

    def inverse_transform(self, X: NDArray) -> NDArray:
        """Project X onto the values learned during fit.

        This simply delegates to :meth:`transform`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        X_out : array-like of shape (n_samples, n_features)
            Transformed input.

        """
        return self.transform(X)

    def __sklearn_tags__(self):
        """Indicate if the class allows inputs with categorical data and nans.

        It modifies the behaviour of the functions checking data.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags = InputTags(
            two_d_array=True, categorical=True, string=True, allow_nan=True
        )
        tags.target_tags.single_output = False
        tags.non_deterministic = True
        return tags


class OneHotEncoderProjector(OneHotEncoder):
    """Class for one-hot encoding of categorical features.

    It inherits from the class OneHotEncoder imported from category_encoders.
    The decoding function accepts non boolean values (as it is the case for
    the sklearn OneHotEncoder). In this case the decoded value corresponds to
    the largest dummy value.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def reverse_dummies(self, X: pd.DataFrame, mapping: List[Dict]) -> pd.DataFrame:
        """Convert dummy variables back into a single coded column.

        For every encoded feature, a column named after the original feature
        is inserted at the position of its first dummy column and filled
        with the code of the dummy holding the row-wise maximum. When several
        dummies share the maximum, the last one in column order wins. The
        dummy columns are then dropped.

        Parameters
        ----------
        X : DataFrame
            Input dataframe. The new column is inserted in place, so the
            frame passed by the caller is modified.
        mapping: list-like
            One entry per encoded feature, each providing the original
            column name under ``"col"`` and, under ``"mapping"``, a
            DataFrame whose columns are the dummy columns and whose index
            holds the codes.

        Returns
        -------
        numerical: DataFrame
            Dataframe where the dummy columns are replaced by coded columns.

        """
        out_cols = X.columns.tolist()
        for switch in mapping:
            col = switch.get("col")
            mod = switch.get("mapping")
            insert_at = out_cols.index(mod.columns[0])
            X.insert(insert_at, col, 0)
            # Only strictly positive codes are paired with dummy columns.
            positive_indexes = mod.index[mod.index > 0]
            max_code = X[mod.columns].max(axis=1)
            for existing_col, val in zip(mod.columns, positive_indexes):
                X.loc[X[existing_col] == max_code, col] = val
            X = X.drop(mod.columns, axis=1)
            # Column positions changed; refresh them for the next feature.
            out_cols = X.columns.tolist()

        return X


class WrapperTransformer(TransformerMixin, BaseEstimator):
    """Wrap a transformer.

    Wrapper with reversible transformers designed to embed the data: the
    data is embedded with ``wrapper``, processed by ``transformer`` and
    mapped back with ``wrapper.inverse_transform``.

    Parameters
    ----------
    transformer : TransformerMixin
        Transformer applied in the embedded space.
    wrapper : TransformerMixin
        Reversible transformer used to embed the data and to map the
        result back.
    """

    def __init__(self, transformer: TransformerMixin, wrapper: TransformerMixin):
        super().__init__()
        self.transformer = transformer
        self.wrapper = wrapper

    def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "WrapperTransformer":
        """Fit the wrapper, then the transformer on the embedded data.

        Parameters
        ----------
        X : NDArray
            Input array. It is deep-copied before being processed.
        y : Optional[NDArray], optional
            Ignored, by default None. Present for API compatibility.

        Returns
        -------
        Self
            The object itself.

        """
        X_embedded = self.wrapper.fit_transform(copy.deepcopy(X))
        self.transformer.fit(X_embedded)
        return self

    def fit_transform(self, X: NDArray, y: Optional[NDArray] = None) -> NDArray:
        """Fit the model according to the given training data and transform it.

        Parameters
        ----------
        X : NDArray
            Input array. It is deep-copied before being processed.
        y : Optional[NDArray], optional
            Ignored, by default None. Accepted so that the transformer can
            be used as a step of a scikit-learn Pipeline, which calls
            ``fit_transform(X, y)``.

        Returns
        -------
        NDArray
            Transformed array, mapped back by the wrapper.

        """
        X_transformed = self.wrapper.fit_transform(copy.deepcopy(X))
        X_transformed = self.transformer.fit_transform(X_transformed)
        return self.wrapper.inverse_transform(X_transformed)

    def transform(self, X: NDArray) -> NDArray:
        """Transform X.

        Parameters
        ----------
        X : NDArray
            Input array. It is deep-copied before being processed.

        Returns
        -------
        NDArray
            Transformed array, mapped back by the wrapper.

        """
        X_transformed = self.wrapper.transform(copy.deepcopy(X))
        X_transformed = self.transformer.transform(X_transformed)
        return self.wrapper.inverse_transform(X_transformed)


def make_pipeline_mixte_preprocessing(
    scale_numerical: bool = False, avoid_new: bool = False
) -> Pipeline:
    """Build a preprocessing pipeline able to handle mixed type data.

    Non-numerical columns are one hot encoded; the other columns are passed
    through, or standardised when requested.

    Parameters
    ----------
    scale_numerical : bool, default=False
        Whether to scale numerical features.
    avoid_new : bool, default=False
        Whether to forbid new numerical values. When True, a BinTransformer
        step is added after the column transformer.

    Returns
    -------
    preprocessor : Pipeline
        Preprocessing pipeline

    """
    column_steps: List[Tuple] = []
    if scale_numerical:
        numeric_columns = selector(dtype_include=np.number)
        column_steps.append(("num", StandardScaler(), numeric_columns))

    encoder = OneHotEncoder(handle_unknown="ignore", use_cat_names=True)
    column_steps.append(("cat", encoder, selector(dtype_exclude=np.number)))

    # Columns matched by no selector are forwarded untouched, and the output
    # is kept as a pandas object.
    col_transformer = ColumnTransformer(
        transformers=column_steps, remainder="passthrough"
    ).set_output(transform="pandas")

    pipeline_steps = [("col_transformer", col_transformer)]
    if avoid_new:
        pipeline_steps.append(("bins", BinTransformer()))
    return Pipeline(steps=pipeline_steps)


def make_robust_MixteHGB(scale_numerical: bool = False, avoid_new: bool = False) -> Pipeline:
    """Build a robust pipeline around MixteHGBM.

    The pipeline chains the mixed type preprocessing (one hot encoding of
    categorical features) with a MixteHGBM estimator.
    This estimator is intended for use in ImputerRegressor
    to deal with mixed type data.

    Note that from sklearn 1.4 HistGradientBoosting Natively Supports
    Categorical DTypes in DataFrames, so that this pipeline is not
    required anymore.


    Parameters
    ----------
    scale_numerical : bool, default=False
        Whether to scale numerical features.
    avoid_new : bool, default=False
        Whether to forbid new numerical values.

    Returns
    -------
    robust_MixteHGB : object
        A robust pipeline for MixteHGBM.

    """
    steps = [
        (
            "preprocessor",
            make_pipeline_mixte_preprocessing(
                scale_numerical=scale_numerical, avoid_new=avoid_new
            ),
        ),
        ("estimator", MixteHGBM()),
    ]
    return Pipeline(steps=steps)