# Source code for geochemistrypi.data_mining.data.statistic

# -*- coding: utf-8 -*-
import random

import numpy as np
import pandas as pd
from rich import print
from scipy.stats import kruskal, wilcoxon



# [docs]
def test_once(df_orig: pd.DataFrame, df_impute: pd.DataFrame, test: str) -> np.ndarray:
    """Do hypothesis testing on each pair-wise column once, non-parametric test.
    Null hypothesis: the distributions of the data set before and after imputing remain the same.

    Parameters
    ----------
    df_orig : pd.DataFrame (n_samples, n_components)
        The original dataset with missing value.

    df_impute : pd.DataFrame (n_samples, n_components)
        The dataset after imputation.

    test : str
        The statistics test method used.

    Returns
    -------
    pvals : np.ndarray
        A numpy array containing the p-values of the tests on each column in the column order
    """
    cols = df_orig.columns
    pvals = np.array([])

    if test == "wilcoxon":
        for c in cols:
            try:
                df_new_orig = df_orig[c].dropna()
                stat, pval = wilcoxon(df_new_orig, df_impute[c])
                pvals = np.append(pvals, pval)
            except Exception:
                pvals = np.append(pvals, 0)

    if test == "kruskal":
        for c in cols:
            df_new_orig = df_orig[c].dropna()
            stat, pval = kruskal(df_new_orig, df_impute[c], nan_policy="omit")
            pvals = np.append(pvals, pval)

    return pvals




# [docs]
def monte_carlo_simulator(
    df_orig: pd.DataFrame,
    df_impute: pd.DataFrame,
    sample_size: int,
    iteration: int,
    test: str,
    confidence: float = 0.05,
) -> None:
    """Check which column rejects hypothesis testing, p value < significance level, to find whether
    the imputation change the distribution of the original data set.

    In every iteration the same randomly drawn row positions (without replacement) are taken
    from both data sets, each column pair is tested with ``test_once``, and the per-column
    p-values are averaged over all iterations. The summary is printed; nothing is returned.

    Parameters
    ----------
    df_orig : pd.DataFrame (n_samples, n_components)
        The original dataset with missing value.

    df_impute : pd.DataFrame (n_samples, n_components)
        The dataset after imputation.

    test : str
        The statistics test method used.

    sample_size : int
        The size of the sample for each iteration.

    iteration : int
        The number of iterations of Monte Carlo simulation. Must be at least 1.

    confidence : float
        Significance level (despite the parameter name), default to be 0.05

    Raises
    ------
    ValueError
        If ``iteration`` is smaller than 1, since no average p-value can be computed then.
    """
    if iteration < 1:
        raise ValueError(f"iteration must be a positive integer, got {iteration!r}")

    n_rows, n_cols = df_orig.shape
    # A private generator with the fixed seed reproduces the same draws on every call
    # without reseeding the process-wide `random` state as a side effect.
    rng = random.Random(2)
    simu_pvals = np.zeros(n_cols)
    for _ in range(iteration):
        # monte carlo sampling: identical row positions are drawn from both data sets
        sample_idx = rng.sample(range(n_rows), sample_size)
        sample_orig = df_orig.iloc[sample_idx]
        sample_impute = df_impute.iloc[sample_idx]

        # hypothesis testing, non-parametric test
        one_pval = test_once(df_orig=sample_orig, df_impute=sample_impute, test=test)
        simu_pvals = simu_pvals + one_pval

    # average p value
    col_res = simu_pvals / iteration
    # check which column rejects hypothesis testing, 0 < p value < significance level;
    # an average of exactly 0 is excluded because 0 marks columns whose test never produced a p-value
    rejected_col = df_orig.columns[(col_res > 0) & (col_res < confidence)]

    print("Significance Level: ", confidence)
    print("The number of iterations of Monte Carlo simulation: ", iteration)
    print("The size of the sample for each iteration (half of the whole data set): ", sample_size)
    print("Average p-value: ")
    print("\n".join(f"{name} {pval}" for name, pval in zip(df_orig.columns, col_res)))
    # Report the threshold actually in use rather than a hard-coded 0.05.
    print(f"Note: 'p-value < {confidence}' means imputation method doesn't apply to that column.")
    print("The columns which rejects null hypothesis: ", end="")
    if rejected_col.size:
        print(*list(rejected_col))
    else:
        print("None")