Source code for geochemistrypi.data_mining.process.classify

# -*- coding: utf-8 -*-
import os

import pandas as pd
from multipledispatch import dispatch

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..model.classification import (
    AdaBoostClassification,
    ClassificationWorkflowBase,
    DecisionTreeClassification,
    ExtraTreesClassification,
    GradientBoostingClassification,
    KNNClassification,
    LogisticRegressionClassification,
    MLPClassification,
    RandomForestClassification,
    SGDClassification,
    SVMClassification,
    XGBoostClassification,
)
from ._base import ModelSelectionBase


class ClassificationModelSelection(ModelSelectionBase):
    """Simulate the normal way of training classification algorithms.

    ``activate`` is overloaded with ``multipledispatch``:

    * nine positional arguments -> manual hyper-parameter training
      ("Train by Scikit-learn framework");
    * the same nine arguments plus a ``bool`` -> automated training
      ("Train by FLAML framework + RAY framework").

    Both overloads share the same pipeline: upload the data to the workflow,
    customize the label, balance the samples, instantiate the workflow class
    matching ``model_name``, fit, predict on the train and test sets, then
    save hyper-parameters, run the common/special components, save the
    predictions and save the model.
    """

    def __init__(self, model_name: str) -> None:
        """Store the model name and create the initial generic workflow.

        Parameters
        ----------
        model_name : str
            Display name of the algorithm, e.g. ``"Random Forest"``. It is
            validated when ``activate`` is called, not here.
        """
        self.model_name = model_name
        # Generic workflow used for the pre-processing steps; it is replaced by
        # the model-specific workflow inside ``activate``.
        self.clf_workflow = ClassificationWorkflowBase()
        self.transformer_config = {}

    @staticmethod
    def _supported_models() -> dict:
        """Map each model display name to ``(workflow class, constructor keywords)``.

        The keyword tuple lists the entries of ``manual_hyper_parameters()`` that are
        forwarded to the workflow constructor in manual mode. The table is built at
        call time so the workflow classes are looked up when ``activate`` runs.
        """
        tree_ensemble_keys = (
            "n_estimators",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "max_features",
            "bootstrap",
            "oob_score",
            "max_samples",
        )
        return {
            "Support Vector Machine": (
                SVMClassification,
                ("kernel", "degree", "gamma", "C", "shrinking"),
            ),
            "Decision Tree": (
                DecisionTreeClassification,
                ("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"),
            ),
            "Random Forest": (RandomForestClassification, tree_ensemble_keys),
            "XGBoost": (
                XGBoostClassification,
                ("n_estimators", "learning_rate", "max_depth", "subsample", "colsample_bytree", "alpha", "lambd"),
            ),
            "Logistic Regression": (
                LogisticRegressionClassification,
                ("penalty", "C", "solver", "max_iter", "class_weight", "l1_ratio"),
            ),
            "Multi-layer Perceptron": (
                MLPClassification,
                ("hidden_layer_sizes", "activation", "solver", "alpha", "learning_rate", "max_iter"),
            ),
            "Extra-Trees": (ExtraTreesClassification, tree_ensemble_keys),
            "Gradient Boosting": (
                GradientBoostingClassification,
                (
                    "n_estimators",
                    "learning_rate",
                    "max_depth",
                    "min_samples_split",
                    "min_samples_leaf",
                    "max_features",
                    "subsample",
                    "loss",
                ),
            ),
            "AdaBoost": (
                AdaBoostClassification,
                ("n_estimators", "learning_rate", "max_depth"),
            ),
            "K-Nearest Neighbors": (
                KNNClassification,
                ("n_neighbors", "weights", "algorithm", "leaf_size", "p", "metric"),
            ),
            "Stochastic Gradient Descent": (
                SGDClassification,
                (
                    "loss",
                    "penalty",
                    "alpha",
                    "l1_ratio",
                    "fit_intercept",
                    "max_iter",
                    "tol",
                    "shuffle",
                    "learning_rate",
                    "eta0",
                    "power_t",
                    "early_stopping",
                    "validation_fraction",
                    "n_iter_no_change",
                ),
            ),
        }

    def _resolve_model(self) -> tuple:
        """Return ``(workflow class, constructor keywords)`` for ``self.model_name``.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not one of the supported model names.
        """
        models = self._supported_models()
        try:
            return models[self.model_name]
        except KeyError:
            supported = ", ".join(repr(name) for name in models)
            raise ValueError(f"Unsupported classification model {self.model_name!r}; expected one of: {supported}") from None

    def _prepare_data(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
    ) -> tuple:
        """Upload the data, customize the label and balance the training samples.

        Returns
        -------
        tuple
            ``(y, X_train, y_train, y_test)`` as returned by the workflow's
            ``customize_label`` and ``sample_balance`` steps.
        """
        # Load the required data into the base class's attributes
        self.clf_workflow.data_upload(
            X=X,
            y=y,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            name_train=name_train,
            name_test=name_test,
            name_all=name_all,
        )

        # Customize label
        y, y_train, y_test = self.clf_workflow.customize_label(
            y,
            y_train,
            y_test,
            name_all,
            name_train,
            name_test,
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
        )

        # Sample balance (the returned configuration is not used by this class)
        _, X_train, y_train = self.clf_workflow.sample_balance(
            X_train,
            y_train,
            name_train,
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
        )
        return y, X_train, y_train, y_test

    def _predict_frame(self, X_part: pd.DataFrame, columns, *mode_args) -> pd.DataFrame:
        """Predict on ``X_part`` and return the result as a cleaned DataFrame.

        The raw prediction is converted with the workflow's ``np2pd`` using
        ``columns``, rows containing NaN are dropped and the index is reset.
        """
        raw_prediction = self.clf_workflow.predict(X_part, *mode_args)
        frame = self.clf_workflow.np2pd(raw_prediction, columns)
        return frame.dropna().reset_index(drop=True)

    def _fit_and_predict(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        *mode_args,
    ) -> tuple:
        """Fit the current workflow, predict on both splits and upload the results.

        ``mode_args`` is forwarded unchanged to ``fit`` and ``predict``: it is empty
        for manual training and ``(is_automl,)`` for the AutoML overload.

        Returns
        -------
        tuple
            ``(y_train_predict, y_test_predict)``.
        """
        # Use Scikit-learn style API to process input data
        self.clf_workflow.fit(X_train, y_train, *mode_args)

        y_train_predict = self._predict_frame(X_train, y_train.columns, *mode_args)
        self.clf_workflow.data_upload(y_train_predict=y_train_predict)

        y_test_predict = self._predict_frame(X_test, y_test.columns, *mode_args)
        self.clf_workflow.data_upload(
            X=X,
            y=y,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            y_test_predict=y_test_predict,
        )
        return y_train_predict, y_test_predict

    def _save_predictions(
        self,
        y_train_predict: pd.DataFrame,
        y_test_predict: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
    ) -> None:
        """Save the train and test predictions through the workflow's ``data_save``."""
        self.clf_workflow.data_save(
            y_train_predict,
            name_train,
            "Y Train Predict",
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
            "Model Train Prediction",
        )
        self.clf_workflow.data_save(
            y_test_predict,
            name_test,
            "Y Test Predict",
            os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"),
            MLFLOW_ARTIFACT_DATA_PATH,
            "Model Test Prediction",
        )

    @dispatch(object, object, object, object, object, object, object, object, object)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
    ) -> None:
        """Train by Scikit-learn framework.

        Hyper-parameters come from the selected workflow class's
        ``manual_hyper_parameters()``.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model. Previously this case
            fell through the ``if/elif`` chain and failed later with an unbound
            ``hyper_parameters`` variable.
        """
        # Fail fast, before any pre-processing is run for a model that cannot be built.
        workflow_cls, param_names = self._resolve_model()

        y, X_train, y_train, y_test = self._prepare_data(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all)

        # Model option: forward exactly the listed hyper-parameters as keywords.
        hyper_parameters = workflow_cls.manual_hyper_parameters()
        self.clf_workflow = workflow_cls(**{key: hyper_parameters[key] for key in param_names})

        # Display what application functions the algorithm will provide
        self.clf_workflow.show_info()

        y_train_predict, y_test_predict = self._fit_and_predict(X, y, X_train, X_test, y_train, y_test)

        # Save the model hyper-parameters
        self.clf_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every classification algorithm
        self.clf_workflow.common_components()

        # Special components of different algorithms
        self.clf_workflow.special_components()

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.clf_workflow.model_save()

    @dispatch(object, object, object, object, object, object, object, object, object, bool)
    def activate(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        X_train: pd.DataFrame,
        X_test: pd.DataFrame,
        y_train: pd.DataFrame,
        y_test: pd.DataFrame,
        name_train: pd.Series,
        name_test: pd.Series,
        name_all: pd.Series,
        is_automl: bool,
    ) -> None:
        """Train by FLAML framework + RAY framework.

        The workflow class is instantiated with its default arguments and
        ``is_automl`` is forwarded to ``fit``, ``predict``, ``common_components``,
        ``special_components`` and ``model_save``.

        Raises
        ------
        ValueError
            If ``self.model_name`` is not a supported model.
        """
        # Fail fast, before any pre-processing is run for a model that cannot be built.
        workflow_cls, _ = self._resolve_model()

        y, X_train, y_train, y_test = self._prepare_data(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all)

        # Model option
        self.clf_workflow = workflow_cls()

        self.clf_workflow.show_info()

        y_train_predict, y_test_predict = self._fit_and_predict(X, y, X_train, X_test, y_train, y_test, is_automl)

        # Save the model hyper-parameters: use the RAY best model when one is
        # set on the workflow, otherwise the AutoML best configuration.
        if self.clf_workflow.ray_best_model is not None:
            best_parameters = self.clf_workflow.ray_best_model.get_params()
        else:
            best_parameters = self.clf_workflow.automl.best_config
        self.clf_workflow.save_hyper_parameters(best_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

        # Common components for every classification algorithm
        self.clf_workflow.common_components(is_automl)

        # Special components of different algorithms
        self.clf_workflow.special_components(is_automl)

        # Save the prediction result
        self._save_predictions(y_train_predict, y_test_predict, name_train, name_test)

        # Save the trained model
        self.clf_workflow.model_save(is_automl)