Source code for geochemistrypi.data_mining.data.preprocessing

# -*- coding: utf-8 -*-
from typing import List, Optional

import numpy as np
import pandas as pd
from rich import print
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif, f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from .data_readiness import show_data_columns


class MeanNormalScaler(BaseEstimator, TransformerMixin):
    """Custom Scikit-learn transformer for mean normalization.

    Mean normalization subtracts the mean of each feature from the feature values
    and then divides by the range (maximum value minus minimum value) of that feature.
    The transformation is given by:

        X_scaled = (X - X.mean()) / (X.max() - X.min())

    Parameters
    ----------
    copy : bool, optional (default: True)
        Whether ``transform`` and ``inverse_transform`` operate on a copy of the input.

    Attributes
    ----------
    mean_ : per-feature mean computed by ``fit`` (``None`` before fitting).
    scale_ : per-feature range (max - min) computed by ``fit`` (``None`` before fitting).
        Features with zero range get a scale of 1 so they are mapped to 0.
    """

    def __init__(self: object, copy: bool = True):
        self.copy = copy
        self.mean_ = None
        self.scale_ = None

    def fit(self: object, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> object:
        """
        Compute the mean and range (max - min) for each feature.

        Parameters
        ----------
        X : pd.DataFrame
            The input dataframe where each column represents a feature.

        y : pd.DataFrame, optional (default: None)
            Ignored.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        self.mean_ = np.mean(X, axis=0)
        # Mean normalization divides by the range of each feature, not by its
        # standard deviation (that would be standardization).
        data_range = np.max(X, axis=0) - np.min(X, axis=0)
        # A constant feature has zero range; dividing by it would produce NaN/inf.
        # Use 1 instead so that such a feature is simply centred to 0.
        if isinstance(data_range, pd.Series):
            # Keep the Series so that column labels still align in transform().
            self.scale_ = data_range.where(data_range != 0, 1)
        else:
            self.scale_ = np.where(data_range == 0, 1, data_range)
        return self

    def transform(self: object, X: pd.DataFrame, y: Optional[pd.DataFrame] = None, copy: bool = None) -> np.ndarray:
        """
        Apply mean normalization to the data.

        Parameters
        ----------
        X : pd.DataFrame
            The input dataframe where each column represents a feature.

        y : pd.DataFrame, optional (default: None)
            Ignored.

        copy : bool, optional (default: None)
            Copy the input X or not. When None, the ``copy`` value given to the constructor is used.

        Returns
        -------
        X_tr : np.ndarray
            The normalized data. The result comes from plain arithmetic on X, so a
            DataFrame input yields a DataFrame.
        """
        # Honour the per-call override; fall back to the constructor setting.
        copy = self.copy if copy is None else copy
        if copy:
            X = X.copy()
        return (X - self.mean_) / self.scale_

    def inverse_transform(self: object, X: pd.DataFrame) -> np.ndarray:
        """
        Reverse the mean normalization transformation.

        Parameters
        ----------
        X : pd.DataFrame
            The input dataframe where each column represents a feature.

        Returns
        -------
        X_tr : np.ndarray
            The data mapped back to the original scale. The result comes from plain
            arithmetic on X, so a DataFrame input yields a DataFrame.
        """
        if self.copy:
            X = X.copy()
        return X * self.scale_ + self.mean_
def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple[dict, np.ndarray]:
    """Apply feature scaling methods.

    Parameters
    ----------
    X : pd.DataFrame
        The dataset.

    method : List[str]
        The feature scaling methods.

    method_idx : int
        The index of the method to apply within ``method``.

    Returns
    -------
    feature_scaling_config : dict
        The feature scaling configuration, keyed by the scaler class name.

    X_scaled : np.ndarray
        The dataset after scaling. If the scaler rejects the dataset with a
        ValueError, a message is printed and the unscaled data is returned.

    Raises
    ------
    ValueError
        If the selected method name is not a supported feature scaling method.
    """
    scaler_classes = {
        "Min-max Scaling": MinMaxScaler,
        "Standardization": StandardScaler,
        "Mean Normalization": MeanNormalScaler,
    }
    method_name = method[method_idx]
    if method_name not in scaler_classes:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Unsupported feature scaling method: {method_name!r}. " f"Expected one of {list(scaler_classes)}.")
    scaler = scaler_classes[method_name]()

    try:
        X_scaled = scaler.fit_transform(X)
    except ValueError:
        print("The selected feature scaling method is not applicable to the dataset!")
        print("Please check the dataset to find the reason.")
        # Keep the function usable after reporting the problem: hand back the
        # data unscaled rather than crashing on an unbound X_scaled.
        X_scaled = np.asarray(X)

    feature_scaling_config = {type(scaler).__name__: scaler.get_params()}
    return feature_scaling_config, X_scaled
def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: int, method: List[str], method_idx: int) -> tuple[dict, pd.DataFrame]:
    """Apply feature selection methods.

    Parameters
    ----------
    X : pd.DataFrame
        The feature dataset.

    y : pd.DataFrame
        The label dataset.

    feature_selection_task : int
        Feature selection for regression (1) or classification (2) tasks.

    method : List[str]
        The feature selection methods.

    method_idx : int
        The index of the method to apply within ``method``.

    Returns
    -------
    feature_selection_config : dict
        The feature selection configuration, keyed by the selector class name.

    X_selected : pd.DataFrame
        The feature dataset after selecting. If the selector rejects the dataset
        with a ValueError, a message is printed and X is returned unchanged.

    Raises
    ------
    ValueError
        If ``feature_selection_task`` or the selected method name is not supported,
        or if the number entered at the prompt is not an integer.
    """
    score_funcs = {1: f_regression, 2: f_classif}
    supported_methods = ("Generic Univariate Select", "Select K Best")
    method_name = method[method_idx]

    # Validate the configuration before prompting the user, so that a bad task or
    # method surfaces as a clear error instead of an UnboundLocalError later on.
    if feature_selection_task not in score_funcs:
        raise ValueError(f"Unsupported feature selection task: {feature_selection_task!r}. " "Expected 1 (regression) or 2 (classification).")
    if method_name not in supported_methods:
        raise ValueError(f"Unsupported feature selection method: {method_name!r}. " f"Expected one of {list(supported_methods)}.")
    score_func = score_funcs[feature_selection_task]

    print("-- Original Features --")
    show_data_columns(X.columns)

    features_num = len(X.columns)
    print(f"The original number of features is {features_num}, and your input must be less than {features_num}.")
    features_retain_num = int(input("Please enter the number of features to retain.\n" "@input: "))

    if method_name == "Generic Univariate Select":
        selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
    else:
        selector = SelectKBest(score_func=score_func, k=features_retain_num)

    try:
        selector.fit(X, y)
        features_selected = selector.get_feature_names_out()
        X = X[features_selected]
    except ValueError:
        # X is left untouched here, so the caller receives the original features.
        print("The selected feature selection method is not applicable to the dataset!")
        print("Please check the dataset to find the reason.")

    feature_selection_config = {type(selector).__name__: selector.get_params()}
    return feature_selection_config, X