Source code for pyco2stats.propagate_errors

import numpy as np
import warnings
import torch # Constrained GMM uses PyTorch
from sklearn.exceptions import ConvergenceWarning # sklearn GMM might raise this
from pyco2stats.gaussian_mixtures import GMM
from tqdm import tqdm

"""
The Propagate_Errors class is aimed to perform the Monte Carlo error propagation in order to quantify the uncertainty
of fitted Gaussian Mixture Model (GMM) parameters. The class permits to estimate the effect of "analytical" uncertainties
 on single observations on GMM results and enables to evaluate the effect of parameters (i.e. n° of observations)
 on the final estimates. The Propagate_Errors assumes input data are normally-distributed, therefore log-transformation of 
 raw data is required.
 
"""

[docs] class Propagate_Errors: """ A class to perform Monte Carlo error propagation for Gaussian Mixture Models (GMMs) fitted using different methods. Assumes input data is log-transformed. Propagates errors by adding fixed-standard-deviation additive noise to log-transformed data, corresponding to relative error on the original scale. Includes parameter alignment. """ @staticmethod def _generate_perturbed_sample(data_log_scale, percentage_relative_error): """ Generates a single perturbed dataset by adding random noise to the log-transformed data, where the noise standard deviation is a fixed value determined by the percentage relative error on the original scale. Parameters ---------- data_log_scale : array or list The original 1D log-transformed input data. Will be converted to a NumPy array. percentage_relative_error : float The relative uncertainty in percent (e.g., 5 for 5%). This directly determines the standard deviation of the additive noise on the log scale. Returns ------- Result : array A new array with perturbed data points on the log scale. """ # Ensure data is a numpy array data_log_scale = np.asarray(data_log_scale) # Calculate the standard deviation of the additive noise on the log scale # This is the percentage relative error (as a fraction) std_dev_additive_noise = percentage_relative_error / 100.0 # Generate random noise from a normal distribution with the calculated std dev additive_noise = np.random.normal(loc=0, scale=std_dev_additive_noise, size=data_log_scale.shape) # Add the noise to the log-transformed data perturbed_data_log_scale = data_log_scale + additive_noise return perturbed_data_log_scale
[docs] @staticmethod def propagate_em_error( original_log_data, # Renamed to clarify it's the log data, removed type hint for flexibility percentage_relative_error: float, # Renamed parameter for clarity n_simulations: int, n_components: int, random_state: None, max_iter: int = 100, tol: float = 1e-6, show_progress: bool = False ) -> dict: """ Propagates error through the EM-based GMM fitting by simulating additive noise on the log-transformed sample data using Monte Carlo. Noise std dev on log scale is fixed, equal to percentage_relative_error / 100. Aligns components by sorting means before storing results. Returns perturbed data statistics for diagnostics. Parameters ---------- original_log_data : array The original 1D log-transformed data. percentage_relative_error : float The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale. n_simulations : int The number of Monte Carlo simulations to run. n_components : int The number of Gaussian components in the mixture. max_iter : int Max iterations for the EM algorithm. tol : float Tolerance for EM convergence. Returns ------- Result : dist A dictionary containing lists of results from each simulation: {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray], 'perturbed_data_means': list[float], 'perturbed_data_stds': list[float]} Lists of GMM parameters have shape (n_components,). Lists of data statistics have shape (n_simulations,). """ # Ensure input data is a numpy array original_log_data = np.asarray(original_log_data) simulated_means = [] simulated_std_devs = [] simulated_weights = [] perturbed_data_means = [] # To store mean of each perturbed sample perturbed_data_stds = [] # To store std dev of each perturbed sample # original_log_data_std and original_log_data_mean are not needed for perturbation std dev calculation now iterator = tqdm(range(n_simulations), desc="gaussian_mixture_em Monte Carlo") if show_progress else range(n_simulations) for i in iterator: # Generate perturbed log-transformed data using additive noise # Pass original_log_data and percentage_relative_error directly perturbed_log_data = Propagate_Errors._generate_perturbed_sample( original_log_data, percentage_relative_error # Pass percentage directly ) # Store stats of the perturbed sample BEFORE fitting GMM perturbed_data_means.append(np.mean(perturbed_log_data)) perturbed_data_stds.append(np.std(perturbed_log_data)) # Catch potential warnings or errors from GMM fitting on perturbed data with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=ConvergenceWarning) try: # Call the EM-based GMM fitting function means, std_devs, weights, _ = GMM.gaussian_mixture_em( perturbed_log_data, n_components, max_iter=max_iter, random_state=random_state, tol=tol ) # --- Parameter Alignment Step --- means = np.asarray(means) std_devs = np.asarray(std_devs) weights = np.asarray(weights) sort_indices = np.argsort(means) aligned_means = means[sort_indices] aligned_std_devs = std_devs[sort_indices] aligned_weights = weights[sort_indices] # --- End Alignment Step --- simulated_means.append(aligned_means) simulated_std_devs.append(aligned_std_devs) simulated_weights.append(aligned_weights) except Exception as e: warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.gaussian_mixture_em: {e}", RuntimeWarning) perturbed_data_means.pop() # Remove the last added mean for failed sim perturbed_data_stds.pop() # Remove the last added std dev for failed sim return { 'means': simulated_means, 'std_devs': simulated_std_devs, 'weights': simulated_weights, 'perturbed_data_means': perturbed_data_means, # Return the list of perturbed data means for successful fits 'perturbed_data_stds': perturbed_data_stds # Return the list of perturbed data stds for successful fits }
[docs] @staticmethod def propagate_sklearn_error( original_log_data, # Renamed, removed type hint percentage_relative_error: float, # Renamed parameter n_simulations: int, n_components: int, max_iter: int = 10, tol: float = 1e-10, n_init: int = 20, suppress_warnings: bool = True, covariance_type: str = 'spherical', show_progress: bool = False ) -> dict: """ Propagates error through the sklearn-based GMM fitting by simulating additive noise on the log-transformed sample data using Monte Carlo. Noise std dev on log scale is fixed, equal to percentage_relative_error / 100. Aligns components by sorting means before storing results. Parameters ---------- original_log_data : array The original 1D log-transformed data. percentage_relative_error : float The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale. n_simulations : int The number of Monte Carlo simulations to run. n_components : int The number of Gaussian components in the mixture. max_iter : int Max iterations for the sklearn EM algorithm. tol : float Tolerance for sklearn EM convergence. n_init : int Number of initializations for sklearn GMM. suppress_warnings : bool Whether to suppress sklearn warnings. covariance_type : string The type of covariance ('spherical', 'diag', 'full', 'tied'). Returns ------- result : dict A dictionary containing lists of results from each simulation: {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray]} Each np.ndarray in the lists has shape (n_components,). """ # Ensure input data is a numpy array original_log_data = np.asarray(original_log_data) simulated_means = [] simulated_std_devs = [] simulated_weights = [] # original_log_data_std is not needed for perturbation std dev calculation now iterator = tqdm(range(n_simulations), desc="gaussian_mixture_sklearn Monte Carlo") if show_progress else range(n_simulations) for i in iterator: # Generate perturbed log-transformed data using additive noise # Pass original_log_data and percentage_relative_error directly perturbed_log_data = Propagate_Errors._generate_perturbed_sample( original_log_data, percentage_relative_error # Pass percentage directly ) # sklearn GMM expects data in a 2D array, even for univariate data perturbed_data_2d = perturbed_log_data.reshape(-1, 1) # Call the sklearn-based GMM fitting function try: means, std_devs, weights, _ = GMM.gaussian_mixture_sklearn( perturbed_data_2d, n_components=n_components, max_iter=max_iter, tol=tol, n_init=n_init, suppress_warnings=suppress_warnings, covariance_type=covariance_type ) # --- Parameter Alignment Step --- means = np.asarray(means) std_devs = np.asarray(std_devs) weights = np.asarray(weights) sort_indices = np.argsort(means) aligned_means = means[sort_indices] aligned_std_devs = std_devs[sort_indices] aligned_weights = weights[sort_indices] # --- End Alignment Step --- simulated_means.append(aligned_means) simulated_std_devs.append(aligned_std_devs) simulated_weights.append(aligned_weights) except Exception as e: warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.gaussian_mixture_sklearn: {e}", RuntimeWarning) # Skip failed simulations return { 'means': simulated_means, 'std_devs': simulated_std_devs, 'weights': simulated_weights }
[docs] @staticmethod def propagate_constrained_error( original_log_data, # Renamed, removed type hint percentage_relative_error: float, # Renamed parameter n_simulations: int, mean_constraints: list, std_constraints: list, n_components: int, n_epochs: int = 5000, lr: float = 0.001, verbose: bool = False, # Suppress verbose output during MC simulations show_progress: bool = False ) -> dict: """ Propagates error through the constrained PyTorch-based GMM fitting by simulating additive noise on the log-transformed sample data using Monte Carlo. Noise std dev on log scale is fixed, equal to percentage_relative_error / 100. Aligns components by sorting means before storing results. Parameters ---------- original_log_data : array The original 1D log-transformed data. percentage_relative_error : float The relative uncertainty in percent (e.g., 5 for 5%). This sets the std dev of additive noise on log scale. n_simulations : int The number of Monte Carlo simulations to run. mean_constraints : list List of tuples specifying (min, max) constraints for each component's mean on the log scale. std_constraints : list List of tuples specifying (min, max) constraints for each component's std dev on the log scale. n_components : int Number of Gaussian components. n_epochs : int Number of optimization epochs for constrained GMM. lr : float Learning rate for optimization. verbose : bool Whether to print progress during constrained GMM fitting (False recommended for MC). Returns ------- result: dict A dictionary containing lists of results from each simulation: {'means': list[np.ndarray], 'std_devs': list[np.ndarray], 'weights': list[np.ndarray]} Each np.ndarray in the lists has shape (n_components,). """ # Ensure input data is a numpy array original_log_data = np.asarray(original_log_data) simulated_means = [] simulated_std_devs = [] simulated_weights = [] # original_log_data_std is not needed for perturbation std dev calculation now iterator = tqdm(range(n_simulations), desc="constrained_gaussian_mixture Monte Carlo") if show_progress else range(n_simulations) for i in iterator: # Generate perturbed log-transformed data using additive noise # Pass original_log_data and percentage_relative_error directly perturbed_log_data = Propagate_Errors._generate_perturbed_sample( original_log_data, percentage_relative_error # Pass percentage directly ) # Convert perturbed data (NumPy array) to PyTorch tensor using torch.from_numpy perturbed_data_tensor = torch.from_numpy(perturbed_log_data).float() # Call the constrained PyTorch GMM fitting function try: means, std_devs, weights = GMM.constrained_gaussian_mixture( perturbed_data_tensor, mean_constraints, std_constraints, n_components, n_epochs=n_epochs, lr=lr, verbose=False # Force verbose off for MC ) # --- Parameter Alignment Step --- # Note: PyTorch tensors need detachment before sorting with numpy means_np = means.detach().numpy() sort_indices = np.argsort(means_np) aligned_means = means_np[sort_indices] # Need to align std_devs and weights (which are already numpy from GMM method) aligned_std_devs = std_devs[sort_indices] aligned_weights = weights[sort_indices] # --- End Alignment Step --- simulated_means.append(aligned_means) simulated_std_devs.append(aligned_std_devs) simulated_weights.append(aligned_weights) except Exception as e: warnings.warn(f"Simulation {i+1}/{n_simulations} failed during GMM.constrained_gaussian_mixture: {e}", RuntimeWarning) # Skip failed simulations return { 'means': simulated_means, 'std_devs': simulated_std_devs, 'weights': simulated_weights }
[docs] @staticmethod def elaborate_results( propagation_results: dict, single_fit_means: np.ndarray = None, single_fit_std_devs: np.ndarray = None, single_fit_weights: np.ndarray = None, original_data_mean: float = None, original_data_std: float = None, method_name: str = "GMM" # e.g., "EM", "Sklearn", "Constrained" ): """ Elaborates and prints the results from a Monte Carlo error propagation run. Parameters ---------- propagation_results : dict The dictionary returned by a propagate_*_error method. Expected keys: 'means', 'std_devs', 'weights'. May also contain 'perturbed_data_means', 'perturbed_data_stds' for certain methods (e.g., EM). single_fit_means : array, optional Means from a single fit on original data. single_fit_std_devs : array, optional Std devs from a single fit on original data. single_fit_weights : array, optional Weights from a single fit on original data. original_data_mean : float, optional Mean of the original log-transformed data. original_data_std : float, optional Std dev of the original log-transformed data. method_name : string The name of the GMM method used for reporting. Returns ------- None """ print(f"\n--- {method_name} Propagation Results (Elaborated) ---") if not propagation_results['means']: print(f"No successful {method_name} GMM simulations.") return simulated_means = np.array(propagation_results['means']) simulated_std_devs = np.array(propagation_results['std_devs']) simulated_weights = np.array(propagation_results['weights']) n_simulations = len(simulated_means) # Get actual number of successful simulations # Calculate central tendency (mean and median) and confidence intervals (2.5% - 97.5%) mean_means = np.mean(simulated_means, axis=0) median_means = np.median(simulated_means, axis=0) ci_means = np.percentile(simulated_means, [2.5, 97.5], axis=0) mean_std_devs = np.mean(simulated_std_devs, axis=0) median_std_devs = np.median(simulated_std_devs, axis=0) ci_std_devs = np.percentile(simulated_std_devs, [2.5, 97.5], axis=0) mean_weights = np.mean(simulated_weights, axis=0) median_weights = np.median(simulated_weights, axis=0) ci_weights = np.percentile(simulated_weights, [2.5, 97.5], axis=0) print(f" Successful Simulations: {n_simulations}") print(f" Mean Means (log scale): {mean_means}") print(f" Median Means (log scale): {median_means}") print(f" CI (2.5% - 97.5%) Means (log scale): {ci_means}") print(f" Mean Std Devs (log scale): {mean_std_devs}") print(f" Median Std Devs (log scale): {median_std_devs}") print(f" CI (2.5% - 97.5%) Std Devs (log scale): {ci_std_devs}") print(f" Mean Weights: {mean_weights}") print(f" Median Weights: {median_weights}") print(f" CI (2.5% - 97.5%) Weights: {ci_weights}")