# Source code for qolmat.imputations.preprocessing

"""Script for preprocessing functions."""

import copy
from typing import Dict, Hashable, List, Optional, Tuple

import numpy as np
import pandas as pd
from category_encoders.one_hot import OneHotEncoder
from numpy.typing import NDArray
from sklearn import utils as sku
from sklearn.base import (
    BaseEstimator,
    RegressorMixin,
    TransformerMixin,
)
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import InputTags
from sklearn.utils.validation import (
    check_is_fitted,
)

# from typing_extensions import Self
from qolmat.utils import utils


class MixteHGBM(RegressorMixin, BaseEstimator):
    """MixteHGBM class.

    This is a custom scikit-learn estimator implementing a mixed model using
    HistGradientBoostingClassifier for string target data and
    HistGradientBoostingRegressor for numeric target data.
    """

    def __init__(self):
        super().__init__()

    def set_model_parameters(self, **args_model):
        """Set the arguments of the underlying model.

        Parameters
        ----------
        **args_model : dict
            Additional keyword arguments to be passed to the underlying
            models.

        """
        self.args_model = args_model

    def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM":
        """Fit the model according to the given training data.

        A HistGradientBoostingClassifier is trained when `y` has a string
        dtype and a HistGradientBoostingRegressor when `y` has a numeric
        dtype.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vectors.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        TypeError
            If the dtype of `y` is neither string nor numeric.

        """
        X, y = sku.validation.validate_data(
            self,
            X,
            y,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=True,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        self.n_features_in_ = X.shape[1]

        # `args_model` only exists once `set_model_parameters` has been
        # called; otherwise the underlying model uses its defaults.
        args_model = getattr(self, "args_model", {})

        if pd.api.types.is_string_dtype(y):
            model = HistGradientBoostingClassifier(**args_model)
        elif pd.api.types.is_numeric_dtype(y):
            model = HistGradientBoostingRegressor(**args_model)
        else:
            raise TypeError("Unknown label type")

        self.model_ = model.fit(X, y)
        # Flag the estimator as fitted only once the underlying model has
        # been trained, so that a failing fit cannot leave the estimator
        # marked as fitted without a `model_` attribute.
        self.is_fitted_ = True
        return self

    def predict(self, X: NDArray) -> NDArray:
        """Predict using the fitted model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array-like, shape (n_samples,)
            Predicted target values.

        """
        # Check the fitted state first so that an unfitted estimator is
        # reported as such before any input validation takes place.
        check_is_fitted(self, "is_fitted_")
        # Validation is run for its checks only: its return value is
        # discarded and the underlying model receives the original `X`.
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=False,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        y_pred = self.model_.predict(X)
        return y_pred

    def __sklearn_tags__(self):
        """Indicate if the class allows inputs with categorical data and nans.

        It modifies the behaviour of the functions checking data.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags = InputTags(
            two_d_array=True, categorical=True, string=True, allow_nan=True
        )
        tags.target_tags.single_output = False
        tags.non_deterministic = True
        return tags
class BinTransformer(TransformerMixin, BaseEstimator):
    """BinTransformer class.

    Learn the possible values of the provided numerical feature, allowing to
    transform new values to the closest existing one.
    """

    def __init__(self, cols: Optional[List] = None):
        super().__init__()
        self.cols = cols

    def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer":
        """Fit the BinTransformer to X.

        For each selected column, the sorted unique non-missing values are
        stored together with the lower edge of their bin, i.e. the midpoint
        with the previous value (minus infinity for the smallest value).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the unique values.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Fitted transformer.

        """
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=True,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        df = utils._validate_input(X)
        self.feature_names_in_ = df.columns
        self.n_features_in_ = len(df.columns)
        self.dict_df_bins_: Dict[Hashable, pd.DataFrame] = {}

        # Without an explicit selection, every numerical column is binned.
        if self.cols is None:
            cols = df.select_dtypes(include="number").columns
        else:
            cols = self.cols

        for col in cols:
            values = df[col].dropna()
            df_bins = pd.DataFrame({"value": np.sort(values.unique())})
            # The first row has no predecessor: its NaN lower edge is
            # replaced by -inf below so that it catches all small values.
            df_bins["min"] = (df_bins["value"] + df_bins["value"].shift()) / 2
            self.dict_df_bins_[col] = df_bins.fillna(-np.inf)
        return self

    def transform(self, X: NDArray) -> NDArray:
        """Transform X to existing values learned during fit.

        Every non-missing value of a learned column is replaced by the
        closest value seen during fit; missing values stay missing. Columns
        that were not learned, or for which no non-missing value was seen
        during fit, are returned unchanged.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        X_out : array-like of shape (n_samples, n_features)
            Transformed input. A numpy array is returned when `X` is a
            numpy array, a DataFrame otherwise.

        """
        check_is_fitted(self)
        sku.validation.validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_all_finite="allow-nan",
            reset=False,
            dtype=["float", "int", "string", "categorical", "object"],
        )
        df = utils._validate_input(X)

        df_out = df.copy()
        for col in df:
            df_bins = self.dict_df_bins_.get(col)
            # An empty bin table (column entirely missing during fit) has
            # no value to project onto; looking up a bin would fail.
            if df_bins is None or df_bins.empty:
                continue
            values = df[col]
            bins_X = np.digitize(values, df_bins["min"]) - 1
            values_out = df_bins.loc[bins_X, "value"]
            values_out.index = values.index
            df_out[col] = values_out.where(values.notna(), np.nan)

        if isinstance(X, np.ndarray):
            return df_out.values
        return df_out

    def inverse_transform(self, X: NDArray) -> NDArray:
        """Project X back onto the values learned during fit.

        This simply delegates to :meth:`transform`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        X_out : array-like of shape (n_samples, n_features)
            Transformed input.

        """
        return self.transform(X)

    def __sklearn_tags__(self):
        """Indicate if the class allows inputs with categorical data and nans.

        It modifies the behaviour of the functions checking data.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags = InputTags(
            two_d_array=True, categorical=True, string=True, allow_nan=True
        )
        tags.target_tags.single_output = False
        tags.non_deterministic = True
        return tags
class OneHotEncoderProjector(OneHotEncoder):
    """Class for one-hot encoding of categorical features.

    It inherits from the class OneHotEncoder imported from category_encoders.
    The decoding function accepts non boolean values (as it is the case for
    the sklearn OneHotEncoder). In this case the decoded value corresponds to
    the largest dummy value.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def reverse_dummies(self, X: pd.DataFrame, mapping: Dict) -> pd.DataFrame:
        """Convert dummy variable into numerical variables.

        For every encoded feature, the code of the dummy column holding the
        row-wise largest value is written into a single column, which
        replaces the dummy columns. When several dummy columns share the
        maximum, the last one in the mapping order wins. Rows matching no
        positive code keep the initial value 0.

        Parameters
        ----------
        X : DataFrame
            Input dataframe. New columns are inserted into it in place.
        mapping: list-like
            Mapping of column to be transformed to its new columns and value
            represented. Each entry exposes the keys "col" and "mapping".

        Returns
        -------
        numerical: DataFrame

        """
        out_cols = X.columns.tolist()
        for switch in mapping:
            col = switch.get("col")
            mod = switch.get("mapping")

            # The decoded column takes the place of the first dummy column.
            insert_at = out_cols.index(mod.columns[0])
            X.insert(insert_at, col, 0)

            positive_indexes = mod.index[mod.index > 0]
            max_code = X[mod.columns].max(axis=1)
            for existing_col, val in zip(mod.columns, positive_indexes):
                X.loc[X[existing_col] == max_code, col] = val

            X = X.drop(mod.columns, axis=1)
            out_cols = X.columns.tolist()
        return X
class WrapperTransformer(TransformerMixin, BaseEstimator):
    """Wrap a transformer.

    Wrapper with reversible transformers designed to embed the data.
    The data is first embedded by `wrapper`, then processed by
    `transformer`, and finally mapped back with `wrapper.inverse_transform`.
    """

    def __init__(self, transformer: TransformerMixin, wrapper: TransformerMixin):
        super().__init__()
        self.transformer = transformer
        self.wrapper = wrapper

    def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "WrapperTransformer":
        """Fit the model according to the given training data.

        The wrapper is fitted on a deep copy of `X`, and the transformer is
        fitted on the wrapped data.

        Parameters
        ----------
        X : NDArray
            Input array.
        y : Optional[NDArray], optional
            Ignored, by default None

        Returns
        -------
        Self
            The object itself.

        """
        X_transformed = copy.deepcopy(X)
        X_transformed = self.wrapper.fit_transform(X_transformed)
        self.transformer.fit(X_transformed)
        return self

    def fit_transform(self, X: NDArray, y: Optional[NDArray] = None) -> NDArray:
        """Fit the model according to the given training data and transform it.

        Parameters
        ----------
        X : NDArray
            Input array.
        y : Optional[NDArray], optional
            Ignored, by default None. Accepted so that callers following
            the scikit-learn `fit_transform(X, y)` convention do not fail.

        Returns
        -------
        NDArray
            Transformed array.

        """
        X_transformed = copy.deepcopy(X)
        X_transformed = self.wrapper.fit_transform(X_transformed)
        X_transformed = self.transformer.fit_transform(X_transformed)
        X_transformed = self.wrapper.inverse_transform(X_transformed)
        return X_transformed

    def transform(self, X: NDArray) -> NDArray:
        """Transform X.

        Parameters
        ----------
        X : NDArray
            Input array.

        Returns
        -------
        NDArray
            Transformed array.

        """
        X_transformed = copy.deepcopy(X)
        X_transformed = self.wrapper.transform(X_transformed)
        X_transformed = self.transformer.transform(X_transformed)
        X_transformed = self.wrapper.inverse_transform(X_transformed)
        return X_transformed
def make_pipeline_mixte_preprocessing(
    scale_numerical: bool = False, avoid_new: bool = False
) -> Pipeline:
    """Build a preprocessing pipeline for mixed type data.

    Non-numerical columns are one hot encoded; numerical columns are
    optionally standardised, and remaining columns are passed through.

    Parameters
    ----------
    scale_numerical : bool, default=False
        Whether to scale numerical features.
    avoid_new : bool, default=False
        Whether to forbid new numerical values.

    Returns
    -------
    preprocessor : Pipeline
        Preprocessing pipeline

    """
    column_steps: List[Tuple] = []
    if scale_numerical:
        numerical_columns = selector(dtype_include=np.number)
        column_steps.append(("num", StandardScaler(), numerical_columns))

    encoder = OneHotEncoder(handle_unknown="ignore", use_cat_names=True)
    categorical_columns = selector(dtype_exclude=np.number)
    column_steps.append(("cat", encoder, categorical_columns))

    col_transformer = ColumnTransformer(
        transformers=column_steps, remainder="passthrough"
    ).set_output(transform="pandas")

    # Assemble every step up front instead of appending to the pipeline.
    pipeline_steps = [("col_transformer", col_transformer)]
    if avoid_new:
        pipeline_steps.append(("bins", BinTransformer()))
    return Pipeline(steps=pipeline_steps)
def make_robust_MixteHGB(scale_numerical: bool = False, avoid_new: bool = False) -> Pipeline:
    """Build a robust pipeline around MixteHGBM.

    The pipeline chains the mixed type preprocessing (one hot encoding of
    categorical features) with a MixteHGBM estimator. This estimator is
    intended for use in ImputerRegressor to deal with mixed type data.

    Note that from sklearn 1.4 HistGradientBoosting Natively Supports
    Categorical DTypes in DataFrames, so that this pipeline is not required
    anymore.

    Parameters
    ----------
    scale_numerical : bool, default=False
        Whether to scale numerical features.
    avoid_new : bool, default=False
        Whether to forbid new numerical values.

    Returns
    -------
    robust_MixteHGB : object
        A robust pipeline for MixteHGBM.

    """
    steps = [
        (
            "preprocessor",
            make_pipeline_mixte_preprocessing(
                scale_numerical=scale_numerical, avoid_new=avoid_new
            ),
        ),
        ("estimator", MixteHGBM()),
    ]
    return Pipeline(steps=steps)