Source code for geochemistrypi.data_mining.process.classify

# -*- coding: utf-8 -*-
import os

import pandas as pd
from multipledispatch import dispatch

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..model.classification import (
    AdaBoostClassification,
    ClassificationWorkflowBase,
    DecisionTreeClassification,
    ExtraTreesClassification,
    GradientBoostingClassification,
    KNNClassification,
    LogisticRegressionClassification,
    MLPClassification,
    RandomForestClassification,
    SGDClassification,
    SVMClassification,
    XGBoostClassification,
)
from ._base import ModelSelectionBase



[docs]
class ClassificationModelSelection(ModelSelectionBase):
    """Simulate the normal way of training classification algorithms.

    The class exposes two ``activate`` overloads (resolved by ``multipledispatch``
    on the number/type of positional arguments):

    * 9 arguments: manual hyper-parameter training.
    * 9 arguments + ``bool``: automated training, where the extra flag is
      forwarded to the workflow's ``fit``/``predict``/component methods.
    """

    def __init__(self, model_name: str) -> None:
        """Remember the requested model name and start from the base workflow.

        Parameters
        ----------
        model_name : str
            Display name of the classification algorithm, e.g. ``"Random Forest"``.
            It is validated when ``activate`` is called.
        """
        self.model_name = model_name
        self.clf_workflow = ClassificationWorkflowBase()
        self.transformer_config = {}

    def _resolve_model(self) -> tuple:
        """Return ``(workflow class, manual hyper-parameter names)`` for ``self.model_name``.

        The second item lists, in order, the keys of the manually entered
        hyper-parameter dictionary that are passed to the workflow constructor
        as keyword arguments of the same name.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not one of the supported model names.
        """
        # Built at call time so the workflow classes are looked up lazily,
        # exactly as the original if/elif chain did.
        forest_params = (
            "n_estimators",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "max_features",
            "bootstrap",
            "oob_score",
            "max_samples",
        )
        registry = {
            "Support Vector Machine": (
                SVMClassification,
                ("kernel", "degree", "gamma", "C", "shrinking"),
            ),
            "Decision Tree": (
                DecisionTreeClassification,
                ("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"),
            ),
            "Random Forest": (RandomForestClassification, forest_params),
            "XGBoost": (
                XGBoostClassification,
                ("n_estimators", "learning_rate", "max_depth", "subsample", "colsample_bytree", "alpha", "lambd"),
            ),
            "Logistic Regression": (
                LogisticRegressionClassification,
                ("penalty", "C", "solver", "max_iter", "class_weight", "l1_ratio"),
            ),
            "Multi-layer Perceptron": (
                MLPClassification,
                ("hidden_layer_sizes", "activation", "solver", "alpha", "learning_rate", "max_iter"),
            ),
            "Extra-Trees": (ExtraTreesClassification, forest_params),
            "Gradient Boosting": (
                GradientBoostingClassification,
                (
                    "n_estimators",
                    "learning_rate",
                    "max_depth",
                    "min_samples_split",
                    "min_samples_leaf",
                    "max_features",
                    "subsample",
                    "loss",
                ),
            ),
            "AdaBoost": (
                AdaBoostClassification,
                ("n_estimators", "learning_rate", "max_depth"),
            ),
            "K-Nearest Neighbors": (
                KNNClassification,
                ("n_neighbors", "weights", "algorithm", "leaf_size", "p", "metric"),
            ),
            "Stochastic Gradient Descent": (
                SGDClassification,
                (
                    "loss",
                    "penalty",
                    "alpha",
                    "l1_ratio",
                    "fit_intercept",
                    "max_iter",
                    "tol",
                    "shuffle",
                    "learning_rate",
                    "eta0",
                    "power_t",
                    "early_stopping",
                    "validation_fraction",
                    "n_iter_no_change",
                ),
            ),
        }
        if self.model_name not in registry:
            raise ValueError(
                "Unsupported classification model {!r}; expected one of: {}".format(
                    self.model_name, ", ".join(registry)
                )
            )
        return registry[self.model_name]

    def _prepare_data(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
    ) -> tuple:
        """Upload the data, customize the labels and balance the training samples.

        Returns
        -------
        tuple
            ``(y, X_train, y_train, y_test)`` as returned by the workflow's
            ``customize_label`` and ``sample_balance`` steps.
        """
        # Load the required data into the base class's attributes
        self.clf_workflow.data_upload(
            X=X,
            y=y,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            name_train=name_train,
            name_test=name_test,
            name_all=name_all,
        )

        # Customize label
        y, y_train, y_test = self.clf_workflow.customize_label(
            y,
            y_train,
            y_test,
            name_all,
            name_train,
            name_test,
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
        )

        # Sample balance (the returned configuration is not used here)
        _, X_train, y_train = self.clf_workflow.sample_balance(
            X_train,
            y_train,
            name_train,
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
        )
        return y, X_train, y_train, y_test

    def _to_prediction_frame(self, y_predict, columns) -> pd.DataFrame:
        """Convert raw predictions to a frame, drop NaN rows and reset the index."""
        frame = self.clf_workflow.np2pd(y_predict, columns)
        return frame.dropna().reset_index(drop=True)

    def _fit_and_predict(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        *mode_args,
    ) -> tuple:
        """Fit the selected workflow and predict on the train and test sets.

        ``mode_args`` is appended positionally to every ``fit``/``predict``
        call: empty for manual training, ``(is_automl,)`` for automated
        training. The predictions are uploaded to the workflow as they are
        produced.

        Returns
        -------
        tuple
            ``(y_train_predict, y_test_predict)`` as data frames.
        """
        # Use Scikit-learn style API to process input data
        self.clf_workflow.fit(X_train, y_train, *mode_args)

        y_train_predict = self._to_prediction_frame(
            self.clf_workflow.predict(X_train, *mode_args), y_train.columns
        )
        self.clf_workflow.data_upload(y_train_predict=y_train_predict)

        y_test_predict = self._to_prediction_frame(
            self.clf_workflow.predict(X_test, *mode_args), y_test.columns
        )
        self.clf_workflow.data_upload(
            X=X,
            y=y,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            y_test_predict=y_test_predict,
        )
        return y_train_predict, y_test_predict

    def _save_predictions(
        self,
        y_train_predict: pd.DataFrame,
        y_test_predict: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
    ) -> None:
        """Persist the train and test predictions through the workflow's ``data_save``."""
        self.clf_workflow.data_save(
            y_train_predict,
            name_train,
            "Y Train Predict",
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
            "Model Train Prediction",
        )
        self.clf_workflow.data_save(
            y_test_predict,
            name_test,
            "Y Test Predict",
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
            "Model Test Prediction",
        )

    @dispatch(object, object, object, object, object, object, object, object, object)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
    ) -> None:
        """Train by Scikit-learn framework.

        All data arguments are forwarded to the workflow's ``data_upload``;
        the ``name_*`` series are additionally passed to the label
        customization, sample balancing and prediction saving steps.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model name. The check
            happens before any data is uploaded or any prompt is shown.
        """
        # Fail fast on an unknown model instead of silently training nothing
        workflow_cls, param_names = self._resolve_model()

        y, X_train, y_train, y_test = self._prepare_data(
            X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all
        )

        # Model option: ask for the hyper-parameters, then build the workflow
        hyper_parameters = workflow_cls.manual_hyper_parameters()
        self.clf_workflow = workflow_cls(**{name: hyper_parameters[name] for name in param_names})

        # Display what application functions the algorithm will provide
        self.clf_workflow.show_info()

        y_train_predict, y_test_predict = self._fit_and_predict(X, y, X_train, X_test, y_train, y_test)

        # Save the model hyper-parameters
        self.clf_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every classification algorithm
        self.clf_workflow.common_components()

        # Special components of different algorithms
        self.clf_workflow.special_components()

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.clf_workflow.model_save()

    @dispatch(object, object, object, object, object, object, object, object, object, bool)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
        is_automl: bool,
    ) -> None:
        """Train by FLAML framework + RAY framework.

        Same data arguments as the manual overload; ``is_automl`` is passed
        on to the workflow's ``fit``, ``predict``, ``common_components``,
        ``special_components`` and ``model_save``.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model name. The check
            happens before any data is uploaded or any prompt is shown.
        """
        # Fail fast on an unknown model instead of continuing with the base workflow
        workflow_cls, _ = self._resolve_model()

        y, X_train, y_train, y_test = self._prepare_data(
            X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all
        )

        # Model option: the workflow is built with its default arguments
        self.clf_workflow = workflow_cls()

        self.clf_workflow.show_info()

        y_train_predict, y_test_predict = self._fit_and_predict(
            X, y, X_train, X_test, y_train, y_test, is_automl
        )

        # Save the model hyper-parameters: prefer the RAY result when one exists
        if self.clf_workflow.ray_best_model is not None:
            best_parameters = self.clf_workflow.ray_best_model.get_params()
        else:
            best_parameters = self.clf_workflow.automl.best_config
        self.clf_workflow.save_hyper_parameters(best_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every classification algorithm
        self.clf_workflow.common_components(is_automl)

        # Special components of different algorithms
        self.clf_workflow.special_components(is_automl)

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.clf_workflow.model_save(is_automl)