Source code for pyco2stats.propagate_errors

import numpy as np
import warnings
import torch # Constrained GMM uses PyTorch
from sklearn.exceptions import ConvergenceWarning # sklearn GMM might raise this
from pyco2stats.gaussian_mixtures import GMM
from tqdm import tqdm

"""
The Propagate_Errors class is aimed to perform the Monte Carlo error propagation in order to quantify the uncertainty
of fitted Gaussian Mixture Model (GMM) parameters. The class permits to estimate the effect of "analytical" uncertainties
 on single observations on GMM results and enables to evaluate the effect of parameters (i.e. n° of observations)
 on the final estimates. The Propagate_Errors assumes input data are normally-distributed, therefore log-transformation of 
 raw data is required.
 
"""


[docs]
class Propagate_Errors:
    """
    A class to perform Monte Carlo error propagation for Gaussian Mixture Models (GMMs)
    fitted using different methods. Assumes input data is log-transformed.
    Propagates errors by adding fixed-standard-deviation additive noise to log-transformed data,
    corresponding to relative error on the original scale. Includes parameter alignment.
    """

    @staticmethod
    def _generate_perturbed_sample(data_log_scale, percentage_relative_error):
        """
        Generates a single perturbed dataset by adding random noise to the
        log-transformed data, where the noise standard deviation is a fixed
        value determined by the percentage relative error on the original scale.

        Parameters
        ----------
        data_log_scale : array or list
            The original 1D log-transformed input data. Will be converted to a NumPy array.
        percentage_relative_error : float
            The relative uncertainty in percent (e.g., 5 for 5%).
            This directly determines the standard deviation of the additive noise on the log scale.

        Returns
        -------
        Result : array
            A new array with perturbed data points on the log scale.
        """
        # Ensure data is a numpy array
        data_log_scale = np.asarray(data_log_scale)

        # Calculate the standard deviation of the additive noise on the log scale
        # This is the percentage relative error (as a fraction)
        std_dev_additive_noise = percentage_relative_error / 100.0

        # Generate random noise from a normal distribution with the calculated std dev
        additive_noise = np.random.normal(loc=0, scale=std_dev_additive_noise, size=data_log_scale.shape)

        # Add the noise to the log-transformed data
        perturbed_data_log_scale = data_log_scale + additive_noise

        return perturbed_data_log_scale


[docs]
    @staticmethod
    def propagate_em_error(
        original_log_data, # Renamed to clarify it's the log data, removed type hint for flexibility
        percentage_relative_error: float, # Renamed parameter for clarity
        n_simulations: int,
        n_components: int,
        random_state: None,
        max_iter: int = 100,
        tol: float = 1e-6,
        show_progress: bool = False
    ) -> dict:
        """
        Propagates error through the EM-based GMM fitting by simulating
        additive noise on the log-transformed sample data using Monte Carlo.
        Noise std dev on log scale is fixed, equal to percentage_relative_error / 100.
        Aligns components by sorting means before storing results.
        Returns perturbed data statistics for diagnostics.

        Parameters
        ----------
        original_log_data : array
            The original 1D log-transformed data.
        percentage_relative_error : float
            The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale.
        n_simulations : int
            The number of Monte Carlo simulations to run.
        n_components : int
            The number of Gaussian components in the mixture.
        max_iter : int
            Max iterations for the EM algorithm.
        tol : float
            Tolerance for EM convergence.

        Returns
        -------
        Result : dist
            A dictionary containing lists of results from each simulation:
            {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray],
            'perturbed_data_means': list[float], 'perturbed_data_stds': list[float]}
            Lists of GMM parameters have shape (n_components,).
            Lists of data statistics have shape (n_simulations,).
        """
        # Ensure input data is a numpy array
        original_log_data = np.asarray(original_log_data)

        simulated_means = []
        simulated_std_devs = []
        simulated_weights = []
        perturbed_data_means = [] # To store mean of each perturbed sample
        perturbed_data_stds = []  # To store std dev of each perturbed sample


        # original_log_data_std and original_log_data_mean are not needed for perturbation std dev calculation now


       
        iterator = tqdm(range(n_simulations), desc="gaussian_mixture_em Monte Carlo") if show_progress else range(n_simulations)
        for i in iterator:
            # Generate perturbed log-transformed data using additive noise
            # Pass original_log_data and percentage_relative_error directly
            perturbed_log_data = Propagate_Errors._generate_perturbed_sample(
                original_log_data, percentage_relative_error # Pass percentage directly
            )

            # Store stats of the perturbed sample BEFORE fitting GMM
            perturbed_data_means.append(np.mean(perturbed_log_data))
            perturbed_data_stds.append(np.std(perturbed_log_data))

            # Catch potential warnings or errors from GMM fitting on perturbed data
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                try:
                    # Call the EM-based GMM fitting function
                    means, std_devs, weights, _ = GMM.gaussian_mixture_em(
                        perturbed_log_data, n_components, max_iter=max_iter, random_state=random_state, tol=tol
                    )

                    # --- Parameter Alignment Step ---
                    means = np.asarray(means)
                    std_devs = np.asarray(std_devs)
                    weights = np.asarray(weights)

                    sort_indices = np.argsort(means)
                    aligned_means = means[sort_indices]
                    aligned_std_devs = std_devs[sort_indices]
                    aligned_weights = weights[sort_indices]
                    # --- End Alignment Step ---

                    simulated_means.append(aligned_means)
                    simulated_std_devs.append(aligned_std_devs)
                    simulated_weights.append(aligned_weights)
                except Exception as e:
                    warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.gaussian_mixture_em: {e}", RuntimeWarning)
                    perturbed_data_means.pop() # Remove the last added mean for failed sim
                    perturbed_data_stds.pop()  # Remove the last added std dev for failed sim


        return {
            'means': simulated_means,
            'std_devs': simulated_std_devs,
            'weights': simulated_weights,
            'perturbed_data_means': perturbed_data_means, # Return the list of perturbed data means for successful fits
            'perturbed_data_stds': perturbed_data_stds   # Return the list of perturbed data stds for successful fits
        }



[docs]
    @staticmethod
    def propagate_sklearn_error(
        original_log_data, # Renamed, removed type hint
        percentage_relative_error: float, # Renamed parameter
        n_simulations: int,
        n_components: int,
        max_iter: int = 10,
        tol: float = 1e-10,
        n_init: int = 20,
        suppress_warnings: bool = True,
        covariance_type: str = 'spherical',
        show_progress: bool = False
    ) -> dict:
        """
        Propagates error through the sklearn-based GMM fitting by simulating
        additive noise on the log-transformed sample data using Monte Carlo.
        Noise std dev on log scale is fixed, equal to percentage_relative_error / 100.
        Aligns components by sorting means before storing results.

        Parameters
        ----------
        original_log_data : array
            The original 1D log-transformed data.
        percentage_relative_error : float
            The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale.
        n_simulations : int
            The number of Monte Carlo simulations to run.
        n_components : int
            The number of Gaussian components in the mixture.
        max_iter : int
            Max iterations for the sklearn EM algorithm.
        tol : float
            Tolerance for sklearn EM convergence.
        n_init : int
            Number of initializations for sklearn GMM.
        suppress_warnings : bool
            Whether to suppress sklearn warnings.
        covariance_type : string
            The type of covariance ('spherical', 'diag', 'full', 'tied').

        Returns
        -------
        result : dict
            A dictionary containing lists of results from each simulation:
            {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray]}
            Each np.ndarray in the lists has shape (n_components,).
        """
        # Ensure input data is a numpy array
        original_log_data = np.asarray(original_log_data)


        simulated_means = []
        simulated_std_devs = []
        simulated_weights = []

        # original_log_data_std is not needed for perturbation std dev calculation now


        iterator = tqdm(range(n_simulations), desc="gaussian_mixture_sklearn Monte Carlo") if show_progress else range(n_simulations)
        for i in iterator:
            # Generate perturbed log-transformed data using additive noise
            # Pass original_log_data and percentage_relative_error directly
            perturbed_log_data = Propagate_Errors._generate_perturbed_sample(
                original_log_data, percentage_relative_error # Pass percentage directly
            )

            # sklearn GMM expects data in a 2D array, even for univariate data
            perturbed_data_2d = perturbed_log_data.reshape(-1, 1)

            # Call the sklearn-based GMM fitting function
            try:
                means, std_devs, weights, _ = GMM.gaussian_mixture_sklearn(
                    perturbed_data_2d, n_components=n_components, max_iter=max_iter,
                    tol=tol, n_init=n_init, suppress_warnings=suppress_warnings,
                    covariance_type=covariance_type
                )

                # --- Parameter Alignment Step ---
                means = np.asarray(means)
                std_devs = np.asarray(std_devs)
                weights = np.asarray(weights)

                sort_indices = np.argsort(means)
                aligned_means = means[sort_indices]
                aligned_std_devs = std_devs[sort_indices]
                aligned_weights = weights[sort_indices]
                # --- End Alignment Step ---

                simulated_means.append(aligned_means)
                simulated_std_devs.append(aligned_std_devs)
                simulated_weights.append(aligned_weights)

            except Exception as e:
                 warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.gaussian_mixture_sklearn: {e}", RuntimeWarning)
                 # Skip failed simulations


        return {
            'means': simulated_means,
            'std_devs': simulated_std_devs, 
            'weights': simulated_weights
        }




[docs]
    @staticmethod
    def propagate_constrained_error(
        original_log_data, # Renamed, removed type hint
        percentage_relative_error: float, # Renamed parameter
        n_simulations: int,
        mean_constraints: list,
        std_constraints: list,
        n_components: int,
        n_epochs: int = 5000,
        lr: float = 0.001,
        verbose: bool = False, # Suppress verbose output during MC simulations
        show_progress: bool = False
    ) -> dict:
        """
        Propagates error through the constrained PyTorch-based GMM fitting by simulating
        additive noise on the log-transformed sample data using Monte Carlo.
        Noise std dev on log scale is fixed, equal to percentage_relative_error / 100.
        Aligns components by sorting means before storing results.

        Parameters
        ----------
        original_log_data : array
            The original 1D log-transformed data.
        percentage_relative_error : float
            The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale.
        n_simulations : int
            The number of Monte Carlo simulations to run.
        mean_constraints : list
            List of tuples specifying (min, max) constraints for each component's mean on the log scale.
        std_constraints : list
            List of tuples specifying (min, max) constraints for each component's std dev on the log scale.
        n_components : int
            Number of Gaussian components.
        n_epochs : int
            Number of optimization epochs for constrained GMM.
        lr : float
            Learning rate for optimization.
        verbose : bool
            Whether to print progress during constrained GMM fitting (False recommended for MC).

        Returns
        -------
        result: dict
            A dictionary containing lists of results from each simulation:
            {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray]}
            Each np.ndarray in the lists has shape (n_components,).
        """
        # Ensure input data is a numpy array
        original_log_data = np.asarray(original_log_data)


        simulated_means = []
        simulated_std_devs = []
        simulated_weights = []

        # original_log_data_std is not needed for perturbation std dev calculation now


        iterator = tqdm(range(n_simulations), desc="constrained_gaussian_mixture Monte Carlo") if show_progress else range(n_simulations)
        for i in iterator:
            # Generate perturbed log-transformed data using additive noise
            # Pass original_log_data and percentage_relative_error directly
            perturbed_log_data = Propagate_Errors._generate_perturbed_sample(
                original_log_data, percentage_relative_error # Pass percentage directly
            )

            # Convert perturbed data (NumPy array) to PyTorch tensor using torch.from_numpy
            perturbed_data_tensor = torch.from_numpy(perturbed_log_data).float()


            # Call the constrained PyTorch GMM fitting function
            try:
                means, std_devs, weights = GMM.constrained_gaussian_mixture(
                    perturbed_data_tensor, mean_constraints, std_constraints,
                    n_components, n_epochs=n_epochs, lr=lr, verbose=False # Force verbose off for MC
                )

                # --- Parameter Alignment Step ---
                # Note: PyTorch tensors need detachment before sorting with numpy
                means_np = means.detach().numpy()
                sort_indices = np.argsort(means_np)

                aligned_means = means_np[sort_indices]
                # Need to align std_devs and weights (which are already numpy from GMM method)
                aligned_std_devs = std_devs[sort_indices]
                aligned_weights = weights[sort_indices]
                # --- End Alignment Step ---

                simulated_means.append(aligned_means)
                simulated_std_devs.append(aligned_std_devs)
                simulated_weights.append(aligned_weights)

            except Exception as e:
                warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.constrained_gaussian_mixture: {e}", RuntimeWarning)
                # Skip failed simulations


        return {
            'means': simulated_means,
            'std_devs': simulated_std_devs,
            'weights': simulated_weights
        }



[docs]
    @staticmethod
    def elaborate_results(
        propagation_results: dict,
        single_fit_means: np.ndarray = None,
        single_fit_std_devs: np.ndarray = None,
        single_fit_weights: np.ndarray = None,
        original_data_mean: float = None,
        original_data_std: float = None,
        method_name: str = "GMM" # e.g., "EM", "Sklearn", "Constrained"
    ):
        """
        Elaborates and prints the results from a Monte Carlo error propagation run.

        Parameters
        ----------
        propagation_results : dict
            The dictionary returned by a propagate_*_error method. Expected keys: 'means', 'std_devs', 'weights'.
            May also contain 'perturbed_data_means', 'perturbed_data_stds' for certain methods (e.g., EM).
        single_fit_means : array, optional
            Means from a single fit on original data.
        single_fit_std_devs : array, optional
            Std devs from a single fit on original data.
        single_fit_weights : array, optional
            Weights from a single fit on original data.
        original_data_mean : float, optional
            Mean of the original log-transformed data.
        original_data_std : float, optional
            Std dev of the original log-transformed data.
        method_name : string
            The name of the GMM method used for reporting.

        Returns
        -------
        None
        """
        print(f"\n--- {method_name} Propagation Results (Elaborated) ---")

        if not propagation_results['means']:
            print(f"No successful {method_name} GMM simulations.")
            return

        simulated_means = np.array(propagation_results['means'])
        simulated_std_devs = np.array(propagation_results['std_devs'])
        simulated_weights = np.array(propagation_results['weights'])

        n_simulations = len(simulated_means) # Get actual number of successful simulations

        # Calculate central tendency (mean and median) and confidence intervals (2.5% - 97.5%)
        mean_means = np.mean(simulated_means, axis=0)
        median_means = np.median(simulated_means, axis=0)
        ci_means = np.percentile(simulated_means, [2.5, 97.5], axis=0)

        mean_std_devs = np.mean(simulated_std_devs, axis=0)
        median_std_devs = np.median(simulated_std_devs, axis=0)
        ci_std_devs = np.percentile(simulated_std_devs, [2.5, 97.5], axis=0)

        mean_weights = np.mean(simulated_weights, axis=0)
        median_weights = np.median(simulated_weights, axis=0)
        ci_weights = np.percentile(simulated_weights, [2.5, 97.5], axis=0)


        print(f"  Successful Simulations: {n_simulations}")
        print(f"  Mean Means (log scale): {mean_means}")
        print(f"  Median Means (log scale): {median_means}")
        print(f"  CI (2.5% - 97.5%) Means (log scale): {ci_means}")
        print(f"  Mean Std Devs (log scale): {mean_std_devs}")
        print(f"  Median Std Devs (log scale): {median_std_devs}")
        print(f"  CI (2.5% - 97.5%) Std Devs (log scale): {ci_std_devs}")
        print(f"  Mean Weights: {mean_weights}")
        print(f"  Median Weights: {median_weights}")
        print(f"  CI (2.5% - 97.5%) Weights: {ci_weights}")