import copy
import json
import os
from typing import Dict, Optional, Tuple
import mlflow
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..utils.base import save_data, save_model, save_text
from .data_readiness import np2pd
from .preprocessing import MeanNormalScaler
[docs]
class PipelineConstrutor:
    """Construct a sklearn pipeline from a dictionary of transformers."""

    @property
    def transformer_dict(self) -> Dict:
        """Registry mapping transformer names to transformer classes.

        This registry must be extended whenever a new transformer is added to
        the customized automated ML pipeline; ``chain`` can only resolve names
        that are listed here.

        Returns
        -------
        Dict
            Mapping from the transformer name (str) to its class.
        """
        # NOTE(review): the two imblearn samplers are registered alongside
        # sklearn transformers - confirm they are accepted as steps by
        # sklearn's ``make_pipeline`` before relying on them in ``chain``.
        return {
            "SimpleImputer": SimpleImputer,
            "MinMaxScaler": MinMaxScaler,
            "StandardScaler": StandardScaler,
            "MeanNormalScaler": MeanNormalScaler,
            "PolynomialFeatures": PolynomialFeatures,
            "RandomOverSampler": RandomOverSampler,
            "RandomUnderSampler": RandomUnderSampler,
            "GenericUnivariateSelect": GenericUnivariateSelect,
            "SelectKBest": SelectKBest,
        }

    def chain(self, transformer_config: Dict) -> object:
        """Chain transformers together into a sklearn pipeline.

        Parameters
        ----------
        transformer_config : Dict
            Mapping from a transformer name (a key of ``transformer_dict``) to
            the keyword arguments used to instantiate it. ``None`` or an empty
            dict instantiates the transformer with its default parameters.
            Steps are added in the iteration order of this mapping.

        Returns
        -------
        object
            A sklearn pipeline.

        Raises
        ------
        KeyError
            If a transformer name is not present in ``transformer_dict``.
        """
        # Read the property once: it rebuilds the whole dict on every access.
        registry = self.transformer_dict
        transformers = []
        for transformer_name, transformer_params in transformer_config.items():
            try:
                transformer_cls = registry[transformer_name]
            except KeyError:
                # Same exception type as a plain lookup, but with an actionable message.
                raise KeyError(
                    f"Unknown transformer '{transformer_name}'. "
                    f"Supported transformers: {sorted(registry)}"
                ) from None
            # ``or {}`` lets a config entry carry ``None`` to mean "use defaults".
            transformers.append(transformer_cls(**(transformer_params or {})))
        return make_pipeline(*transformers)
[docs]
def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None) -> None:
    """Run the model inference and save the predictions.

    When inference is enabled, the data is optionally passed through
    ``transform_pipeline``, the model logged under the active MLflow run is
    loaded and used for prediction, and the predictions are stored through
    ``save_data`` under the label "Application Data Predicted".

    Parameters
    ----------
    inference_data : pd.DataFrame
        The inference data.
    inference_name_column : str
        The name information of the inference data; forwarded unchanged to
        ``save_data``.
    is_inference : bool
        Whether to run the model inference. Only the literal ``True`` enables
        it; any other value makes this function a no-op.
    run : object
        The model selection object. Its ``model_name`` attribute identifies
        the model artifact to load from the active MLflow run.
    transformer_config : Dict
        The transformer configuration. When non-empty, ``inference_data`` is
        transformed with ``transform_pipeline`` before prediction.
    transform_pipeline : Optional[object], optional
        The transform pipeline object. Required when ``transformer_config`` is
        non-empty. The default is None.

    Raises
    ------
    ValueError
        If ``transformer_config`` is non-empty but ``transform_pipeline`` is
        None.
    """
    # Guard clause: identity check kept so that only the literal True enables inference.
    if is_inference is not True:
        return

    # Fail fast with a clear message instead of an AttributeError on None
    # further down, and before any model is loaded.
    if transformer_config and transform_pipeline is None:
        raise ValueError("transform_pipeline must be provided when transformer_config is not empty.")

    print("Use the trained model to make predictions on the application data.")

    # Apply the same transformations as configured, or use the data as-is.
    if transformer_config:
        inference_data_transformed = transform_pipeline.transform(inference_data)
    else:
        inference_data_transformed = inference_data

    # NOTE(review): assumes an MLflow run is currently active - confirm at call sites.
    loaded_model = mlflow.sklearn.load_model(f"runs:/{mlflow.active_run().info.run_id}/{run.model_name}")
    inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
    inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])

    # Local output directory comes from the environment.
    # NOTE(review): os.getenv returns None when the variable is unset - verify save_data copes with that.
    output_data_path = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
    save_data(inference_data_predicted, inference_name_column, "Application Data Predicted", output_data_path, MLFLOW_ARTIFACT_DATA_PATH)