Source code for geochemistrypi.data_mining.utils.base

# -*- coding: utf-8 -*-
import logging
import os
import pickle
import platform
import shutil
from typing import Optional

import joblib
import mlflow
import pandas as pd
from matplotlib import pyplot as plt
from rich import print


[docs] def create_geopi_output_dir(output_path: str, experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None: """Create the output directory for the current run and store the related pathes as environment variable. Parameters ---------- output_path : str The root path to store the output. experiment_name : str The name of the experiment. run_name : str The name of the run. sub_run_name : str, default=None The name of the sub run. """ # Set the output path for the current run # timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M") if sub_run_name: geopi_output_path = os.path.join(output_path, experiment_name, f"{run_name}", sub_run_name) else: geopi_output_path = os.path.join(output_path, experiment_name, f"{run_name}") os.environ["GEOPI_OUTPUT_PATH"] = geopi_output_path os.makedirs(geopi_output_path, exist_ok=True) # Set the output artifacts path for the current run geopi_output_artifacts_path = os.path.join(geopi_output_path, "artifacts") os.environ["GEOPI_OUTPUT_ARTIFACTS_PATH"] = geopi_output_artifacts_path os.makedirs(geopi_output_artifacts_path, exist_ok=True) # Set the output artifacts data path for the current run geopi_output_artifacts_data_path = os.path.join(geopi_output_artifacts_path, "data") os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = geopi_output_artifacts_data_path os.makedirs(geopi_output_artifacts_data_path, exist_ok=True) # Set the output artifacts model path for the current run geopi_output_artifacts_model_path = os.path.join(geopi_output_artifacts_path, "model") os.environ["GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH"] = geopi_output_artifacts_model_path os.makedirs(geopi_output_artifacts_model_path, exist_ok=True) # Set the output artifacts image path for the current run geopi_output_artifacts_image_path = os.path.join(geopi_output_artifacts_path, "image") os.environ["GEOPI_OUTPUT_ARTIFACTS_IMAGE_PATH"] = geopi_output_artifacts_image_path os.makedirs(geopi_output_artifacts_image_path, exist_ok=True) # Set the output artifacts image model output path for the current run geopi_output_artifacts_image_model_output_path = os.path.join(geopi_output_artifacts_image_path, "model_output") os.environ["GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH"] = geopi_output_artifacts_image_model_output_path os.makedirs(geopi_output_artifacts_image_model_output_path, exist_ok=True) # Set the output artifacts image statistic path for the current run geopi_output_artifacts_image_statistic_path = os.path.join(geopi_output_artifacts_image_path, "statistic") os.environ["GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"] = geopi_output_artifacts_image_statistic_path os.makedirs(geopi_output_artifacts_image_statistic_path, exist_ok=True) # Set the output artifacts image map path for the current run geopi_output_artifacts_image_map_path = os.path.join(geopi_output_artifacts_image_path, "map") os.environ["GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"] = geopi_output_artifacts_image_map_path os.makedirs(geopi_output_artifacts_image_map_path, exist_ok=True) # Set the output parameters path for the current run geopi_output_parameters_path = os.path.join(geopi_output_path, "parameters") os.environ["GEOPI_OUTPUT_PARAMETERS_PATH"] = geopi_output_parameters_path os.makedirs(geopi_output_parameters_path, exist_ok=True) # Set the outout metrics path for the current run geopi_output_metrics_path = os.path.join(geopi_output_path, "metrics") os.environ["GEOPI_OUTPUT_METRICS_PATH"] = geopi_output_metrics_path os.makedirs(geopi_output_metrics_path, exist_ok=True) # Set the summary outout path for the current run geopi_output_summary_path = os.path.join(geopi_output_path, "summary") os.environ["GEOPI_OUTPUT_SUMMARY_PATH"] = geopi_output_summary_path os.makedirs(geopi_output_summary_path, exist_ok=True)
[docs] def get_os() -> str: """Get the operating system. Returns ------- str The operating system. """ my_os = platform.system() if my_os == "Windows": return "Windows" elif my_os == "Linux": return "Linux" elif my_os == "Darwin": return "macOS" else: return "Unknown"
[docs] def check_package(package_name: str) -> bool: """Check whether the package is installed. Parameters ---------- package_name : str The name of the package. Returns ------- bool Whether the package is installed. """ import importlib try: importlib.import_module(package_name) return True except ImportError: return False
[docs] def install_package(package_name: str) -> None: """Install the package. Parameters ---------- package_name : str The name of the package. """ import subprocess subprocess.check_call(["python", "-m", "pip", "install", "--quiet", package_name])
[docs] def clear_output(text: str = None) -> None: """Clear the console output.""" if text: flag = input(text) else: flag = input("(Press Enter key to move forward)") my_os = platform.system() if flag == "": if my_os == "Windows": os.system("cls") # for Windows else: os.system("clear") # for Linux and macOS print("")
[docs] def save_fig(fig_name: str, local_image_path: str, mlflow_artifact_image_path: str = None, tight_layout: bool = True) -> None: """Save the figure in the local directory and in mlflow specialized directory. Parameters ---------- fig_name : str Figure name. local_image_path : str The path to store the image. mlflow_artifact_image_path : str, default=None The path to store the image in mlflow. tight_layout : bool, default=True Automatically adjust subplot parameters to give specified padding. """ full_path = os.path.join(local_image_path, fig_name + ".png") print(f"Save figure '{fig_name}' in {local_image_path}.") if tight_layout: plt.tight_layout() # Check that the original file exists, # and if it does, add a number after the filename to distinguish i = 1 dir = full_path[:-4] while os.path.isfile(full_path): full_path = dir + str(i) + ".png" i = i + 1 plt.savefig(full_path, format="png", dpi=300) plt.close() if mlflow_artifact_image_path: mlflow.log_artifact(full_path, artifact_path=mlflow_artifact_image_path) else: mlflow.log_artifact(full_path)
[docs] def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: """Save the dataset in the local directory and in mlflow specialized directory. Parameters ---------- df : pd.DataFrame The dataset to store. name_column: The name of the data. df_name : str The name of the data sheet. local_data_path : str The path to store the data sheet mlflow_artifact_data_path : str, default=None The path to store the data sheet in mlflow. index : bool, default=False Whether to write the index. """ if name_column is not None and len(df) == len(name_column): if not df.index.empty and len(name_column.index) == len(df.index) and set(df.index) == set(name_column.index): name_column = name_column.reindex(df.index) df.reset_index(drop=True, inplace=True) name_column.reset_index(drop=True, inplace=True) df = pd.concat([name_column, df], axis=1) try: # drop the index in case that the dimensions change full_path = os.path.join(local_data_path, "{}.xlsx".format(df_name)) df.to_excel(full_path, index=index) if mlflow_artifact_data_path: mlflow.log_artifact(full_path, artifact_path=mlflow_artifact_data_path) else: mlflow.log_artifact(full_path) print(f"Successfully store '{df_name}' in '{df_name}.xlsx' in {local_data_path}.") except ModuleNotFoundError: print("** Please download openpyxl by pip3 **") print("** The data will be stored in .csv file **") full_path = os.path.join(local_data_path, "{}.csv".format(df_name)) df.to_csv(full_path, index=index) print(f"Successfully store '{df_name}' in '{df_name}.csv' in {local_data_path}.")
[docs] def save_data_without_data_identifier(df: pd.DataFrame, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: """Save the dataset in the local directory and in mlflow specialized directory. Parameters ---------- df : pd.DataFrame The dataset to store. df_name : str The name of the data sheet. local_data_path : str The path to store the data sheet mlflow_artifact_data_path : str, default=None The path to store the data sheet in mlflow. index : bool, default=False Whether to write the index. """ try: # drop the index in case that the dimensions change full_path = os.path.join(local_data_path, "{}.xlsx".format(df_name)) df.to_excel(full_path, index=index) if mlflow_artifact_data_path: mlflow.log_artifact(full_path, artifact_path=mlflow_artifact_data_path) else: mlflow.log_artifact(full_path) print(f"Successfully store '{df_name}' in '{df_name}.xlsx' in {local_data_path}.") except ModuleNotFoundError: print("** Please download openpyxl by pip3 **") print("** The data will be stored in .csv file **") full_path = os.path.join(local_data_path, "{}.csv".format(df_name)) df.to_csv(full_path, index=index) print(f"Successfully store '{df_name}' in '{df_name}.csv' in {local_data_path}.")
[docs] def save_text(string: str, text_name: str, local_text_path: str, mlflow_artifact_text_path: str = None) -> None: """Save the text. Parameters ---------- string : str The text to store. text_name : str The name of the text. local_text_path : str The path to store the text. mlflow_artifact_text_path : str, default=None The path to store the text in mlflow. """ full_path = os.path.join(local_text_path, text_name + ".txt") with open(full_path, "w") as f: f.write(string) print(f"Successfully store '{text_name}' in '{text_name}.txt' in {local_text_path}.") if not mlflow_artifact_text_path: pass elif mlflow_artifact_text_path == "root": # If artifact_path is "root", the artifact will be placed in the root artifacts directory. mlflow.log_artifact(full_path) else: mlflow.log_artifact(full_path, artifact_path=mlflow_artifact_text_path)
[docs] def save_model(model: object, model_name: str, data_sample: pd.DataFrame, local_model_path: str, mlflow_artifact_model_path: str = None) -> None: """Save the model in the local directory and in mlflow specialized directory. Parameters ---------- model : object The model to store. model_name : str The name of the model. data_sample : pd.DataFrame The sample of the dataset. local_model_path : str The path to store the model. mlflow_artifact_model_path : str, default=None The path to store the model in mlflow. """ pickle_filename = model_name + ".pkl" pickle_path = os.path.join(local_model_path, pickle_filename) with open(pickle_path, "wb") as f: pickle.dump(model, f) print(f"Successfully store '{model_name}' in '{pickle_filename}' in {local_model_path}.") joblib_filename = model_name + ".joblib" joblib_path = os.path.join(local_model_path, joblib_filename) with open(joblib_path, "wb") as f: joblib.dump(model, f) print(f"Successfully store '{model_name}' in '{joblib_filename}' in {local_model_path}.") if not mlflow_artifact_model_path: mlflow.sklearn.log_model(model, model_name, input_example=data_sample) else: mlflow.sklearn.log_model(model, mlflow_artifact_model_path, input_example=data_sample)
[docs] def log(log_path, log_name): # Create and configure logger # LOG_FORMAT = "%(levelname)s %(asctime)s - %(message)s" LOG_FORMAT = "%(asctime)s %(name)s %(levelname)s %(pathname)s %(message)s" DATE_FORMAT = "%Y-%m-%d %H:%M:%S %a " logging.basicConfig(filename=os.path.join(log_path, log_name), level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT, filemode="w") logger = logging.getLogger() return logger
[docs] def show_warning(is_show: bool = True) -> None: """Overriding Python's default filter to control whether to display warning information.""" import sys if not is_show: if not sys.warnoptions: import os os.environ["PYTHONWARNINGS"] = "ignore"
# os.environ["PYTHONWARNINGS"] = "default"
[docs] def copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH: str, GEOPI_OUTPUT_METRICS_PATH: str, GEOPI_OUTPUT_PARAMETERS_PATH: str, GEOPI_OUTPUT_SUMMARY_PATH: str) -> None: """Copy all files from the source folder to the destination folder. Parameters ---------- GEOPI_OUTPUT_ARTIFACTS_PATH: str Source folder path. GEOPI_OUTPUT_METRICS_PATH: str Source folder path. GEOPI_OUTPUT_PARAMETERS_PATH: str Source folder path. GEOPI_OUTPUT_SUMMARY_PATH: str Destination folder path """ source_paths = [GEOPI_OUTPUT_ARTIFACTS_PATH, GEOPI_OUTPUT_METRICS_PATH, GEOPI_OUTPUT_PARAMETERS_PATH] for source_path in source_paths: for root, dirs, files in os.walk(source_path): for file in files: source_file_path = os.path.join(root, file) shutil.copy2(source_file_path, GEOPI_OUTPUT_SUMMARY_PATH)
[docs] def copy_files_from_source_dir_to_dest_dir(source_dir: str, dest_dir: str) -> None: """Copy all files from the source folder to the destination folder. Parameters ---------- source_dir: str Source folder path. dest_dir: str Destination folder path """ for root, dirs, files in os.walk(source_dir): for file in files: source_file_path = os.path.join(root, file) shutil.copy2(source_file_path, dest_dir)
[docs] def list_excel_files(directory: str) -> list: """Recursively lists all Excel files (including .xlsx, .xls, and .csv) in the specified directory and its subdirectories. Parameters ---------- directory : str The path to the directory to search for Excel files. Returns ------- excel_files : list A list of file paths for all Excel files found. Notes ----- (1) The function uses `os.walk` to traverse the directory and its subdirectories. (2) Only files with extensions .xlsx, .xls, and .csv are considered as Excel files. """ excel_files = [] for root, dirs, files in os.walk(directory): for file in files: if file.endswith(".xlsx") or file.endswith(".xls") or file.endswith(".csv"): excel_files.append(os.path.join(root, file)) return excel_files