Source code for geochemistrypi.data_mining.process.regress

# -*- coding: utf-8 -*-
import os

import pandas as pd
from multipledispatch import dispatch
from rich import print

from ..constants import MLFLOW_ARTIFACT_DATA_PATH, SECTION
from ..data.data_readiness import num_input
from ..model.regression import (
    BayesianRidgeRegression,
    ClassicalLinearRegression,
    DecisionTreeRegression,
    ElasticNetRegression,
    ExtraTreesRegression,
    GradientBoostingRegression,
    KNNRegression,
    LassoRegression,
    MLPRegression,
    PolynomialRegression,
    RandomForestRegression,
    RegressionWorkflowBase,
    RidgeRegression,
    SGDRegression,
    SVMRegression,
    XGBoostRegression,
)
from ._base import ModelSelectionBase


class RegressionModelSelection(ModelSelectionBase):
    """Simulate the normal way of training regression algorithms.

    The class picks a regression workflow from ``model_name`` and drives it
    through fit, predict, hyper-parameter persistence, the workflow's
    common/special components, prediction export and model export.

    ``activate`` has two overloads selected by ``multipledispatch``:

    * nine positional arguments -> manual hyper-parameter mode;
    * the same nine plus a ``bool`` -> AutoML mode.
    """

    # Supported model name -> (workflow class, names of the hyper-parameters that are
    # forwarded to the workflow constructor in manual mode). Keeping both pieces in one
    # table guarantees that the manual and AutoML overloads support the same models.
    _MODEL_SPECS = {
        "Polynomial Regression": (PolynomialRegression, ("degree", "interaction_only", "include_bias")),
        "XGBoost": (
            XGBoostRegression,
            ("n_estimators", "learning_rate", "max_depth", "subsample", "colsample_bytree", "alpha", "lambd"),
        ),
        "Decision Tree": (
            DecisionTreeRegression,
            ("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"),
        ),
        "Extra-Trees": (
            ExtraTreesRegression,
            (
                "n_estimators",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "max_features",
                "bootstrap",
                "oob_score",
                "max_samples",
            ),
        ),
        "Random Forest": (
            RandomForestRegression,
            (
                "n_estimators",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "max_features",
                "bootstrap",
                "oob_score",
                "max_samples",
            ),
        ),
        "Support Vector Machine": (SVMRegression, ("kernel", "degree", "gamma", "C", "shrinking")),
        "Multi-layer Perceptron": (
            MLPRegression,
            ("hidden_layer_sizes", "activation", "solver", "alpha", "learning_rate", "max_iter"),
        ),
        "Linear Regression": (ClassicalLinearRegression, ("fit_intercept",)),
        "K-Nearest Neighbors": (
            KNNRegression,
            ("n_neighbors", "weights", "algorithm", "leaf_size", "p", "metric"),
        ),
        "Gradient Boosting": (
            GradientBoostingRegression,
            (
                "n_estimators",
                "learning_rate",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "max_features",
                "subsample",
                "loss",
            ),
        ),
        "Lasso Regression": (LassoRegression, ("alpha", "fit_intercept", "max_iter", "tol", "selection")),
        "Elastic Net": (
            ElasticNetRegression,
            ("alpha", "l1_ratio", "fit_intercept", "max_iter", "tol", "selection"),
        ),
        "SGD Regression": (
            SGDRegression,
            (
                "loss",
                "penalty",
                "alpha",
                "l1_ratio",
                "fit_intercept",
                "max_iter",
                "tol",
                "shuffle",
                "learning_rate",
                "eta0",
                "power_t",
            ),
        ),
        "BayesianRidge Regression": (
            BayesianRidgeRegression,
            (
                "tol",
                "alpha_1",
                "alpha_2",
                "lambda_1",
                "lambda_2",
                "alpha_init",
                "lambda_init",
                "compute_score",
                "fit_intercept",
                "copy_X",
                "verbose",
            ),
        ),
        "Ridge Regression": (RidgeRegression, ("alpha", "fit_intercept", "max_iter", "tol")),
    }

    def __init__(self, model_name: str) -> None:
        """Remember the requested model and start from a generic regression workflow.

        Parameters
        ----------
        model_name : str
            Display name of the regression algorithm, e.g. ``"XGBoost"``.
            It is validated when ``activate`` is called, not here.
        """
        self.model_name = model_name
        # Placeholder workflow; ``activate`` replaces it with the model-specific one.
        self.reg_workflow = RegressionWorkflowBase()
        # Configuration of transformers applied inside this class (filled by the polynomial expansion).
        self.transformer_config = {}

    def _resolve_model_spec(self) -> tuple:
        """Return ``(workflow class, manual hyper-parameter names)`` for ``self.model_name``.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not one of the supported model names.
        """
        try:
            return self._MODEL_SPECS[self.model_name]
        except KeyError:
            supported = ", ".join(self._MODEL_SPECS)
            raise ValueError(f"Unsupported regression model '{self.model_name}'. Supported models: {supported}.") from None

    def _expand_polynomial_features(self, X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple:
        """Apply the polynomial workflow's feature expansion and return the new ``(X_train, X_test)``.

        The transformer configuration returned by ``poly`` is merged into
        ``self.transformer_config`` and the expanded feature sets are uploaded
        to the workflow in place of the raw ones.
        """
        poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
        self.transformer_config.update(poly_config)
        self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
        return X_train, X_test

    def _predict_and_upload(
        self,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        *predict_args,
    ) -> tuple:
        """Predict on the train and test sets and upload both results to the workflow.

        ``predict_args`` is forwarded positionally to ``predict`` (empty in
        manual mode, ``(is_automl,)`` in AutoML mode). Returns
        ``(y_train_predict, y_test_predict)``.
        """
        # NOTE(review): dropna() may shorten the predictions relative to name_train/name_test,
        # which are later passed to data_save - confirm data_save tolerates a length mismatch.
        y_train_predict = self.reg_workflow.predict(X_train, *predict_args)
        y_train_predict = self.reg_workflow.np2pd(y_train_predict, y_train.columns)
        y_train_predict = y_train_predict.dropna().reset_index(drop=True)
        self.reg_workflow.data_upload(y_train_predict=y_train_predict)

        y_test_predict = self.reg_workflow.predict(X_test, *predict_args)
        y_test_predict = self.reg_workflow.np2pd(y_test_predict, y_test.columns)
        y_test_predict = y_test_predict.dropna().reset_index(drop=True)
        self.reg_workflow.data_upload(y_test_predict=y_test_predict)

        return y_train_predict, y_test_predict

    def _save_predictions(
        self,
        y_train_predict: pd.DataFrame,
        y_test_predict: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
    ) -> None:
        """Save the train and test predictions through the workflow's ``data_save``."""
        artifacts_path = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        self.reg_workflow.data_save(y_train_predict, name_train, "Y Train Predict", artifacts_path, MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction")
        self.reg_workflow.data_save(y_test_predict, name_test, "Y Test Predict", artifacts_path, MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction")

    @dispatch(object, object, object, object, object, object, object, object, object)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
    ) -> None:
        """Train by Scikit-learn framework.

        Hyper-parameters are obtained from the selected workflow's
        ``manual_hyper_parameters()`` and forwarded to its constructor.

        Parameters
        ----------
        X, y : pd.DataFrame
            Full feature and target sets, uploaded to the workflow.
        X_train, X_test, y_train, y_test : pd.DataFrame
            Train/test split used for fitting and prediction.
        name_train, name_test : pd.Series
            Names passed along when the predictions are saved.
        name_all : pd.Series
            Accepted for interface compatibility; not used by this method.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model name. Raised before
            any data is uploaded or any model is trained.
        """
        # Validate first so an unknown name fails fast instead of surfacing later as an
        # unbound ``hyper_parameters`` after work has already been done.
        workflow_cls, param_names = self._resolve_model_spec()

        # NOTE(review): the data is uploaded through the workflow that is replaced just below;
        # presumably data_upload stores the data at class level - confirm.
        self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test)

        # Model option
        hyper_parameters = workflow_cls.manual_hyper_parameters()
        self.reg_workflow = workflow_cls(**{name: hyper_parameters[name] for name in param_names})
        if self.model_name == "Polynomial Regression":
            X_train, X_test = self._expand_polynomial_features(X_train, X_test)

        self.reg_workflow.show_info()

        # Use Scikit-learn style API to process input data
        self.reg_workflow.fit(X_train, y_train)
        y_train_predict, y_test_predict = self._predict_and_upload(X_train, X_test, y_train, y_test)

        # Save the model hyper-parameters
        self.reg_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every regression algorithm
        self.reg_workflow.common_components()

        # Special components of different algorithms
        self.reg_workflow.special_components()

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.reg_workflow.model_save()

    @dispatch(object, object, object, object, object, object, object, object, object, bool)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
        is_automl: bool,
    ) -> None:
        """Train by FLAML framework + RAY framework.

        The selected workflow is built with its default constructor arguments
        (only the polynomial degree is asked from the user) and ``is_automl``
        is forwarded to ``fit``, ``predict``, ``common_components``,
        ``special_components`` and ``model_save``.

        Parameters
        ----------
        X, y : pd.DataFrame
            Full feature and target sets, uploaded to the workflow.
        X_train, X_test, y_train, y_test : pd.DataFrame
            Train/test split used for fitting and prediction.
        name_train, name_test : pd.Series
            Names passed along when the predictions are saved.
        name_all : pd.Series
            Accepted for interface compatibility; not used by this method.
        is_automl : bool
            Flag forwarded to the workflow; its presence selects this overload.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model name. Raised before
            any data is uploaded or any model is trained.
        """
        # Validate first so an unknown name fails fast with a clear message.
        workflow_cls, _ = self._resolve_model_spec()

        # NOTE(review): the data is uploaded through the workflow that is replaced just below;
        # presumably data_upload stores the data at class level - confirm.
        self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test)

        # Model option
        if self.model_name == "Polynomial Regression":
            print("Please specify the maximal degree of the polynomial features.")
            poly_degree = num_input(SECTION[2], "@Degree:")
            self.reg_workflow = workflow_cls(degree=poly_degree)
            X_train, X_test = self._expand_polynomial_features(X_train, X_test)
        else:
            self.reg_workflow = workflow_cls()

        self.reg_workflow.show_info()

        # Use Scikit-learn style API to process input data
        self.reg_workflow.fit(X_train, y_train, is_automl)
        y_train_predict, y_test_predict = self._predict_and_upload(X_train, X_test, y_train, y_test, is_automl)

        # Save the model hyper-parameters: prefer the Ray-tuned model when there is one,
        # otherwise fall back to the best configuration found by the AutoML run.
        if self.reg_workflow.ray_best_model is not None:
            best_parameters = self.reg_workflow.ray_best_model.get_params()
        else:
            best_parameters = self.reg_workflow.automl.best_config
        self.reg_workflow.save_hyper_parameters(best_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every regression algorithm
        self.reg_workflow.common_components(is_automl)

        # Special components of different algorithms
        self.reg_workflow.special_components(is_automl)

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.reg_workflow.model_save(is_automl)