# Source code for OCDocker.OCScore.Analysis.Plotting.Stats

#!/usr/bin/env python3

# Description
###############################################################################
'''
Plotting helpers for statistical summaries (scatter/box/bar, diagnostics, PCA
importance). These utilities are used by Analysis workflows and StatTests.

Usage:

import OCDocker.OCScore.Analysis.Plotting.Stats as ocstatplot
'''

# Imports
###############################################################################
import warnings

import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sstats
import seaborn as sns
import OCDocker.Error as ocerror

from typing import Optional

# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics

This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.

Contact: Artur Duque Rossi - arturossi10@gmail.com
'''

# Classes
###############################################################################

# Functions
###############################################################################
## Private ##

## Public ##

def plot_bar_with_significance(
        gh_df: pd.DataFrame,
        metric: str,
        y_col: str = 'diff',
        colour_mapping: Optional[dict[str, tuple[float, float, float]]] = None,
        output_dir: str = 'plots',
        top_n: Optional[int] = 30
    ) -> None:
    ''' Plot Games-Howell pairwise differences as a horizontal bar chart.

    Parameters
    ----------
    gh_df : pd.DataFrame
        Output of pingouin.pairwise_gameshowell (expects columns 'A','B','diff','pval').
    metric : str
        Metric label for titling ('AUC' or 'RMSE').
    y_col : str
        Which column from gh_df to plot as bar length (default 'diff').
    colour_mapping : dict | None, optional
        Unused here, accepted for API compatibility. Default: None.
    output_dir : str
        Where to save the plot image. Default: 'plots'.
    top_n : int | None, optional
        If given, keep the top-N pairs by smallest p-value. Default: 30.
    '''

    data = gh_df.copy()

    if 'pval' not in data.columns:
        # pingouin sometimes returns 'pval'/'pval_corr'; tolerate variants
        pcol = next((c for c in data.columns if c.startswith('pval')), None)
        if pcol is None:
            # User-facing error: missing required data in DataFrame
            ocerror.Error.data_not_found("Games-Howell dataframe must contain a p-value column (pval, pval_corr, etc.)")
            raise ValueError('Games-Howell dataframe must contain a p-value column.')
        data['pval'] = data[pcol]

    # Build pair labels, then rank: most significant first, ties broken by larger effect
    data['pair'] = data['A'].astype(str) + ' vs ' + data['B'].astype(str)
    data = data.sort_values(by=['pval', y_col], ascending=[True, False])

    if top_n is not None:
        data = data.head(top_n)

    # Color positive diffs blue, negative red for quick read
    bar_colors = data[y_col].map(lambda v: 'tab:blue' if v >= 0 else 'tab:red')

    plt.figure(figsize=(max(8, 0.25 * len(data)), max(6, 0.35 * len(data))))
    ax = sns.barplot(data=data, x=y_col, y='pair', palette=bar_colors, orient='h')

    def stars(p: float) -> str:
        '''Convert a p-value to significance stars.

        Parameters
        ----------
        p : float
            The p-value to convert.

        Returns
        -------
        str
            '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05, '' otherwise.
        '''

        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    # Annotate each bar with its value, p-value and significance stars
    annotated = data.reset_index(drop = True)
    diffs = annotated[y_col].to_numpy()
    pvals = annotated['pval'].to_numpy()

    for row_idx, (y_val, p_val) in enumerate(zip(diffs, pvals)):
        ax.text(
            y_val + (0.01 if y_val >= 0 else -0.01),
            row_idx,
            f"{y_val:.3f} (p={p_val:.2e}) {stars(p_val)}",
            ha='left' if y_val >= 0 else 'right',
            va='center',
            fontsize=8,
        )

    ax.set_title(f'Games-Howell pairwise differences — {metric}')
    ax.set_xlabel(f'Difference in {metric}')
    ax.set_ylabel('Pair (A vs B)')
    plt.grid(True, axis='x', linestyle=':', linewidth=0.5)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/games_howell_bar_{metric}.png", dpi=300)
    plt.close()
def plot_barplots(df: pd.DataFrame, n_trials: int, colour_mapping: dict[str, tuple[float, float, float]], output_dir: str) -> None:
    ''' Generate sorted barplots of mean RMSE and AUC across methodologies with annotations.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing 'RMSE', 'AUC', and 'Methodology'.
    n_trials : int
        Trial number for title and output naming.
    colour_mapping : dict[str, tuple[float, float, float]]
        Dictionary mapping methodologies to colors.
    output_dir : str
        Directory to save the barplot images.
    '''

    # Per-methodology means for both metrics
    mean_table = df.groupby('Methodology')[['RMSE', 'AUC']].mean().reset_index()

    plt.figure(figsize = (16, 6))

    for panel_idx, metric in enumerate(['RMSE', 'AUC']):
        plt.subplot(1, 2, panel_idx + 1)

        # Sort bars by the metric so the ranking is visible at a glance
        ordered = mean_table.sort_values(by = metric)
        ordered_palette = {name: colour_mapping[name] for name in ordered['Methodology']}

        sns.barplot(
            data = ordered,
            x = 'Methodology',
            y = metric,
            hue = 'Methodology',
            palette = ordered_palette,
            legend = False
        )

        # Annotate each bar with its mean value
        for bar_idx, mean_val in enumerate(ordered[metric]):
            plt.text(bar_idx, mean_val + 0.01, f"{mean_val:.2f}", ha = 'center', va = 'bottom', fontsize = 9)

        plt.xticks(rotation = 90)
        plt.title(f'{metric} Mean per Method ({n_trials} Trials)')
        plt.grid(True)
        plt.minorticks_on()
        plt.grid(which = 'minor', linestyle = ':', linewidth = 0.5)

    plt.tight_layout()
    plt.savefig(f'{output_dir}/barplot_rmse_auc_{n_trials}.png')
    plt.close()
def plot_boxplots(df: pd.DataFrame, n_trials: int, colour_mapping: dict[str, tuple[float, float, float]], output_dir: str, show_simple_consensus: bool = False) -> None:
    ''' Generate enhanced boxplots of RMSE and AUC across methodologies, with group shading and mean lines.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing 'RMSE', 'AUC', and 'Methodology'.
    n_trials : int
        Number of trials used for title and filenames.
    colour_mapping : dict[str, tuple[float, float, float]]
        Dictionary mapping methodologies to colors.
    output_dir : str
        Directory to save the boxplot images.
    show_simple_consensus : bool
        Whether to include consensus methodologies (any label ending with "consensus").
    '''

    plot_df = df.copy()

    if not show_simple_consensus:
        # Drop consensus rows unless explicitly requested
        plot_df = plot_df[~plot_df['Methodology'].str.endswith('consensus', na = False)]

    plt.figure(figsize = (16, 12))

    mean_line_rmse, mean_line_auc = None, None

    for panel_idx, metric in enumerate(['RMSE', 'AUC']):
        plt.subplot(2, 1, panel_idx + 1)

        with warnings.catch_warnings():
            # Seaborn currently forwards a deprecated Matplotlib `vert` kwarg
            # internally in some versions; silence this third-party warning.
            warnings.filterwarnings(
                "ignore",
                message = "vert: bool will be deprecated in a future version.*",
                category = PendingDeprecationWarning,
            )
            ax = sns.boxplot(
                data = plot_df,
                x = 'Methodology',
                y = metric,
                hue = 'Methodology',
                palette = colour_mapping,
                showfliers = False,
                legend = False
            )

        # Distinct line color for each metric
        metric_mean = plot_df[metric].mean()
        mean_line = ax.axhline(
            metric_mean,
            color = 'red' if metric == 'RMSE' else 'blue',
            linestyle = '--',
            label = f'Mean {metric}'
        )
        if panel_idx == 0:
            mean_line_rmse = mean_line
        else:
            mean_line_auc = mean_line

        plt.xticks(rotation = 90)
        plt.title(f'{metric} Distribution ({n_trials} Trials)')
        plt.grid(True, linestyle = ':', linewidth = 0.5)
        plt.minorticks_on()

        # Highlight NN, XGB, Transformer groups with translucent background bands
        method_order = list(plot_df['Methodology'].unique())
        for prefix, band_color in [('NN', 'lightblue'), ('XGB', 'lightgreen'), ('Transformer', 'lightcoral')]:
            for position, method in enumerate(method_order):
                if method.startswith(prefix):
                    plt.axvspan(position - 0.5, position + 0.5, color = band_color, alpha = 0.2)

    # Add figure-level legend at the bottom
    plt.figlegend(
        handles = [mean_line_rmse, mean_line_auc],
        labels = ['Mean RMSE', 'Mean AUC'],
        loc = 'lower center',
        bbox_to_anchor = (0.5, 0.02),
        ncol = 2,
        frameon = False
    )

    # Adjust layout to avoid overlap
    plt.tight_layout(rect = (0, 0.08, 1, 1))
    plt.savefig(f'{output_dir}/boxplots_rmse_auc_{n_trials}.png', dpi = 300)
    plt.close()
def plot_combined_metric_scatter(df: pd.DataFrame, n_trials: int, colour_mapping: dict[str, tuple[float, float, float]], output_dir: str, alpha: float = 0.9) -> None:
    ''' Generate a detailed scatter plot showing RMSE vs AUC across methods with shading and symbol cues.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with RMSE, AUC, and Methodology columns.
    n_trials : int
        Number of top trials considered.
    colour_mapping : dict[str, tuple[float, float, float]]
        Dictionary mapping methodologies to colors.
    output_dir : str
        Directory to save the scatter plot image.
    alpha : float, optional
        Transparency for the markers. Default is 0.9.
    '''

    df = df.copy()

    # Reflect AUC values below 0.5 around 0.5 (1 - x) so every point reads on the
    # same "better is higher" scale; tag each row so the marker shape can differ.
    below_half = df['AUC'] < 0.5
    df['AUC_adj'] = np.where(below_half, 1 - df['AUC'], df['AUC'])
    df['AUC_category'] = np.where(below_half, '< 0.5', '>= 0.5')
    df.loc[df['AUC_category'] == '< 0.5', 'AUC'] = df['AUC_adj']

    plt.figure(figsize = (10, 8))

    # Scatter for AUC ≥ 0.5 (circles)
    sns.scatterplot(
        data = df[df['AUC_category'] == '>= 0.5'],
        x = 'RMSE', y = 'AUC',
        hue = 'Methodology', palette = colour_mapping,
        alpha = alpha, marker = 'o', s = 100, legend = False
    )

    # Scatter for AUC < 0.5 (stars mark adjusted points)
    sns.scatterplot(
        data = df[df['AUC_category'] == '< 0.5'],
        x = 'RMSE', y = 'AUC',
        hue = 'Methodology', palette = colour_mapping,
        alpha = alpha, marker = '*', s = 130, legend = False
    )

    plt.xlabel('RMSE')
    plt.ylabel('AUC (adjusted)')
    plt.title(f'Combined Metric Comparison ({n_trials} Trials)')
    plt.grid(True)
    plt.minorticks_on()
    plt.grid(which = 'minor', linestyle = ':', linewidth = 0.3)

    # Figure-level legends: one for methodology colours, one for marker shapes
    method_labels = df['Methodology'].unique().tolist()
    method_handles = [mlines.Line2D([0], [0], color = colour_mapping[m], lw = 4.1) for m in method_labels]
    shape_handles = [
        mlines.Line2D([0], [0], marker = 'o', color = 'w', label = 'AUC ≥ 0.5', markerfacecolor = 'gray', markersize = 10),
        mlines.Line2D([0], [0], marker = '*', color = 'w', label = 'AUC < 0.5 (adjusted)', markerfacecolor = 'gray', markersize = 12)
    ]

    plt.figlegend(method_handles, method_labels, title = 'Methodology', loc = 'lower center', bbox_to_anchor = (0.5, 0.07), ncol = 5)
    plt.figlegend(shape_handles, ['AUC ≥ 0.5', 'AUC < 0.5 (adjusted)'], title = 'Marker Type', loc = 'lower center', bbox_to_anchor = (0.5, 0.01), ncol = 2)

    plt.tight_layout(rect = (0, 0.22, 1, 1))
    plt.savefig(f'{output_dir}/scatter_combined_metric_{n_trials}.png', bbox_inches = 'tight', dpi = 300)
    plt.close()
def plot_heatmap(
        gh_df: pd.DataFrame,
        title: str,
        metric: str,
        output_dir: str = 'plots'
    ) -> None:
    '''Heatmap of Games-Howell p-values across methodology pairs.

    Parameters
    ----------
    gh_df : pd.DataFrame
        Output of pingouin.pairwise_gameshowell (expects columns 'A', 'B', 'diff', 'pval').
    title : str
        Title for the heatmap.
    metric : str
        Metric label for titling ('AUC' or 'RMSE').
    output_dir : str
        Where to save the plot image. Default: 'plots'.

    Raises
    ------
    ValueError
        If no p-value column (pval, pval_corr, ...) is present in gh_df.
    '''

    df = gh_df.copy()

    # pingouin sometimes returns 'pval'/'pval_corr'; tolerate variants
    pcol = 'pval' if 'pval' in df.columns else next((c for c in df.columns if c.startswith('pval')), None)
    if pcol is None:
        # User-facing error: missing required data in DataFrame
        ocerror.Error.data_not_found("Games-Howell dataframe must contain a p-value column (pval, pval_corr, etc.)")
        raise ValueError('Games-Howell dataframe must contain a p-value column.')

    mat = df.pivot(index='A', columns='B', values=pcol)

    # BUGFIX: pairwise output lists each pair only once, so a method can appear
    # only as 'A' or only as 'B', leaving the pivot non-square / misaligned.
    # Reindex both axes onto the same sorted label set so the diagonal truly
    # corresponds to self-pairs before blanking it.
    labels = sorted(set(mat.index) | set(mat.columns))
    mat = mat.reindex(index=labels, columns=labels)

    # Mirror to make a symmetric matrix, leaving diagonal as NaN
    mat_full = mat.combine_first(mat.T)
    np.fill_diagonal(mat_full.values, np.nan)

    plt.figure(figsize=(max(8, 0.6 * mat_full.shape[1]), max(6, 0.35 * mat_full.shape[0])))
    ax = sns.heatmap(-np.log10(mat_full), cmap='mako', annot=False, cbar_kws={'label': '-log10(p)'})
    ax.set_title(title)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/games_howell_heatmap_{metric}.png", dpi=300)
    plt.close()
def plot_normality_and_variance_diagnostics(
        df: pd.DataFrame,
        metric: str,
        n_trials: int,
        output_dir: str = 'plots'
    ) -> None:
    ''' Perform and plot normality and variance diagnostics across methodologies.

    Quick diagnostics across groups:
    - Shapiro-Wilk p-values per methodology (bar of -log10 p)
    - Group variances (bar) and Levene's p-value annotated

    Parameters
    ----------
    df : pd.DataFrame
        Data containing 'Methodology' and the specified metric.
    metric : str
        Metric column to analyze (e.g., 'AUC' or 'RMSE').
    n_trials : int
        Number of trials for title and output naming.
    output_dir : str
        Directory to save the diagnostics plot. Default: 'plots'.
    '''

    # Compute Shapiro p-values and variances per group
    records = []
    samples = []

    for method, sub in df.groupby('Methodology'):
        values = pd.to_numeric(sub[metric], errors='coerce').dropna().to_numpy()

        # Shapiro-Wilk needs at least 3 observations
        shapiro_p = np.nan
        if values.size >= 3:
            try:
                shapiro_p = sstats.shapiro(values).pvalue
            except (ValueError, TypeError, AttributeError):
                # Fallback to NaN if statistical test fails
                shapiro_p = np.nan

        # Sample variance needs at least 2 observations
        group_var = float(np.var(values, ddof=1)) if values.size >= 2 else np.nan

        records.append({'Methodology': method, 'p_shapiro': shapiro_p, 'variance': group_var})
        samples.append(values)

    diag = pd.DataFrame(records).sort_values(by='p_shapiro', ascending=True)

    # Levene across all groups with enough data
    try:
        usable = [g for g in samples if g.size >= 2]
        p_levene = sstats.levene(*usable).pvalue if len(usable) >= 2 else np.nan
    except (ValueError, TypeError, AttributeError):
        # Fallback to NaN if statistical test fails
        p_levene = np.nan

    # Plot two panels
    plt.figure(figsize=(16, 6))

    plt.subplot(1, 2, 1)
    sns.barplot(data=diag, x='Methodology', y=-np.log10(diag['p_shapiro']), color='steelblue')
    plt.xticks(rotation=90)
    plt.ylabel('-log10 Shapiro p-value')
    plt.title(f'Normality (Shapiro) — {metric}')
    plt.grid(True, axis='y', linestyle=':', linewidth=0.5)

    plt.subplot(1, 2, 2)
    sns.barplot(data=diag, x='Methodology', y='variance', color='tab:orange')
    plt.xticks(rotation=90)
    plt.ylabel('Group variance')
    lev_txt = f"Levene p={p_levene:.2e}" if isinstance(p_levene, float) and np.isfinite(p_levene) else "Levene p=N/A"
    plt.title(f'Variance across groups — {metric} ({lev_txt})')
    plt.grid(True, axis='y', linestyle=':', linewidth=0.5)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/diagnostics_{metric}_{n_trials}.png", dpi=300)
    plt.close()
def plot_pca_importance_barplot(
        importance_df: pd.DataFrame,
        pca_type: str,
        n_features: int,
        n_trials: int,
        output_dir: str = 'plots'
    ) -> None:
    '''Barplot of top-N PCA feature importances.

    Parameters
    ----------
    importance_df : pd.DataFrame
        DataFrame with 'Feature' and 'Importance' columns.
    pca_type : str
        PCA type label for titling (e.g., '1', '2').
    n_features : int
        Number of top features to display.
    n_trials : int
        Number of trials for title and output naming.
    output_dir : str
        Directory to save the barplot image. Default: 'plots'.
    '''

    # Keep only the leading n_features rows (assumed pre-sorted by importance)
    subset = importance_df.head(n_features)
    n_shown = len(subset)

    # Grow the figure height with the number of bars, with a sane minimum
    plt.figure(figsize=(10, max(5, 0.35 * n_shown)))
    sns.barplot(data=subset, x='Importance', y='Feature', orient='h', color='steelblue')

    plt.title(f'PCA{pca_type}: Top {n_shown} feature importances')
    plt.xlabel('Importance (variance-weighted loadings)')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/pca{pca_type}_importance_top{n_shown}_{n_trials}.png", dpi=300)
    plt.close()
def plot_pca_importance_histogram(
        importance_df: pd.DataFrame,
        pca_type: str,
        n_trials: int,
        output_dir: str = 'plots'
    ) -> None:
    '''Histogram of PCA feature importances.

    Parameters
    ----------
    importance_df : pd.DataFrame
        DataFrame with 'Feature' and 'Importance' columns.
    pca_type : str
        PCA type label for titling (e.g., '1', '2').
    n_trials : int
        Number of trials for title and output naming.
    output_dir : str
        Directory to save the histogram image. Default: 'plots'.
    '''

    plt.figure(figsize=(8, 5))

    # 30 bins gives a reasonable resolution for typical feature counts
    sns.histplot(importance_df['Importance'], bins=30, color='tab:purple')

    plt.title(f'PCA{pca_type}: Distribution of feature importances')
    plt.xlabel('Importance')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/pca{pca_type}_importance_hist_{n_trials}.png", dpi=300)
    plt.close()
def plot_scatterplot(
        df_rmse: pd.DataFrame,
        df_auc: pd.DataFrame,
        df_all: pd.DataFrame,
        n_trials: int,
        colour_mapping: dict[str, tuple[float, float, float]],
        output_dir: str,
        orientation: str = 'horizontal',
        alpha: float = 0.9
    ) -> None:
    '''Create scatter plots of RMSE vs AUC for all methods and filtered subsets.

    Create a 1x3 (or 3x1) panel of scatter plots (RMSE vs AUC):
    - RMSE-filtered subset
    - AUC-filtered subset
    - All filtered points

    Parameters
    ----------
    df_rmse : pd.DataFrame
        DataFrame filtered by RMSE threshold.
    df_auc : pd.DataFrame
        DataFrame filtered by AUC threshold.
    df_all : pd.DataFrame
        DataFrame with all filtered points.
    n_trials : int
        Number of top trials considered.
    colour_mapping : dict[str, tuple[float, float, float]]
        Dictionary mapping methodologies to colors.
    output_dir : str
        Directory to save the scatter plot image.
    orientation : str, optional
        Orientation of the scatter plot. Default is 'horizontal'. Options: 'horizontal', 'vertical'.
    alpha : float, optional
        Transparency for the markers. Default is 0.9.

    Raises
    ------
    ValueError
        If the orientation parameter is not 'horizontal' or 'vertical'.
    '''

    # Make orientation case-insensitive
    orientation = orientation.lower()

    if orientation == 'vertical':
        plt.figure(figsize=(8, 14))
    elif orientation == 'horizontal':
        plt.figure(figsize=(18, 8))
    else:
        # User-facing error: invalid orientation
        ocerror.Error.value_error(f"Invalid orientation: '{orientation}'. Must be 'horizontal' or 'vertical'.")
        raise ValueError(f"Orientation must be 'horizontal' or 'vertical', got {orientation}.")

    panels = [
        (df_rmse, 'Error vs. AUC (Smallest Error)'),
        (df_auc, 'Error vs. AUC (Biggest AUC)'),
        (df_all, 'Error vs. AUC (Smallest Error - AUC)')
    ]

    for i, (df, title) in enumerate(panels, start=1):
        df = df.copy()

        # Reflect AUC values below 0.5 (1 - x) so all points read on the same
        # "better is higher" scale; the category drives the marker shape.
        df['AUC_adj'] = df['AUC'].apply(lambda x: 1 - x if x < 0.5 else x)
        df['AUC_category'] = df['AUC'].apply(lambda x: '>= 0.5' if x >= 0.5 else '< 0.5')
        df.loc[df['AUC_category'] == '< 0.5', 'AUC'] = df['AUC_adj']

        if orientation == 'vertical':
            plt.subplot(3, 1, i)
        else:
            plt.subplot(1, 3, i)

        # Scatter for AUC ≥ 0.5 (circles)
        df_auc_ge = df[df['AUC_category'] == '>= 0.5']
        if not df_auc_ge.empty:
            sns.scatterplot(
                data = df_auc_ge,
                x = 'RMSE',
                y = 'AUC',
                hue = 'Methodology',
                palette = colour_mapping,
                alpha = alpha,
                s = 30,
                legend = False,
            )

        # Scatter for AUC < 0.5 (stars mark adjusted points)
        df_auc_lt = df[df['AUC_category'] == '< 0.5']
        if not df_auc_lt.empty:
            sns.scatterplot(
                data = df_auc_lt,
                x = 'RMSE',
                y = 'AUC',
                hue = 'Methodology',
                palette = colour_mapping,
                alpha = alpha,
                s = 50,
                marker = '*',
                legend = False,
            )

        plt.title(title)
        plt.grid(True, linestyle=':', linewidth=0.5)
        plt.xlabel('RMSE')
        plt.ylabel('AUC')

    # Shared figure-level legend handles (methodology colours + marker shapes)
    method_labels = df_all['Methodology'].unique().tolist()
    method_handles = [mlines.Line2D([0], [0], color = colour_mapping[m], lw = 4.1) for m in method_labels]
    shape_handles = [
        mlines.Line2D([0], [0], marker = 'o', color = 'w', label = 'AUC ≥ 0.5', markerfacecolor = 'gray', markersize = 10),
        mlines.Line2D([0], [0], marker = '*', color = 'w', label = 'AUC < 0.5 (adjusted)', markerfacecolor = 'gray', markersize = 12)
    ]

    # BUGFIX: the legends were previously drawn twice — once inside each
    # orientation branch and once more unconditionally — stacking duplicate
    # legends on the figure (and a tight_layout rect that was immediately
    # overridden). Draw each legend exactly once per orientation.
    if orientation == 'vertical':
        # Methodology legend
        plt.figlegend(method_handles, method_labels, title = 'Methodology', loc = 'lower center', bbox_to_anchor = (0.5, 0.09), ncol = 5)
        # Shape legend
        plt.figlegend(shape_handles, ['AUC ≥ 0.5', 'AUC < 0.5 (adjusted)'], title = 'Marker Type', loc = 'lower center', bbox_to_anchor = (0.5, 0.03), ncol = 2)
        plt.subplots_adjust(bottom = 0.28)
    else:
        # Methodology legend
        plt.figlegend(method_handles, method_labels, title = 'Methodology', loc = 'lower center', bbox_to_anchor = (0.5, 0.09), ncol = 5)
        # Shape legend
        plt.figlegend(shape_handles, ['AUC ≥ 0.5', 'AUC < 0.5 (adjusted)'], title = 'Marker Type', loc = 'lower center', bbox_to_anchor = (0.5, 0.02), ncol = 2)

    plt.tight_layout(rect = (0, 0.25, 1, 1))
    plt.savefig(f'{output_dir}/scatter_rmse_auc_panels_{n_trials}.png', dpi=300)
    plt.close()
def save_pca_importance_bins(
        importance_df: pd.DataFrame,
        pca_type: str,
        n_trials: int,
        output_dir: str = 'plots',
        n_bins: int = 10
    ) -> None:
    '''Assign quantile bins (qcut) and save as CSV.

    Parameters
    ----------
    importance_df : pd.DataFrame
        DataFrame with 'Feature' and 'Importance' columns.
    pca_type : str
        PCA type label for titling (e.g., '1', '2').
    n_trials : int
        Number of trials for title and output naming.
    output_dir : str
        Directory to save the plot image. Default: 'plots'.
    n_bins : int
        Number of quantile bins to create. Default: 10.
    '''

    binned = importance_df.copy()

    try:
        binned['bin'] = pd.qcut(binned['Importance'], q=n_bins, labels=False, duplicates='drop')
    except ValueError:
        # Not enough unique values; fallback to rank-based bins
        pct_ranks = binned['Importance'].rank(method='average', pct=True)
        binned['bin'] = (pct_ranks * (n_bins - 1)).astype(int)

    binned.to_csv(f"{output_dir}/pca{pca_type}_importance_bins_{n_trials}.csv", index=False)
def save_pca_importance_groups(
        importance_df: pd.DataFrame,
        pca_type: str,
        n_trials: int,
        output_dir: str = 'plots'
    ) -> None:
    '''Assign coarse groups by quantiles and save as CSV.

    Parameters
    ----------
    importance_df : pd.DataFrame
        DataFrame with 'Feature' and 'Importance' columns.
    pca_type : str
        PCA type label for titling (e.g., '1', '2').
    n_trials : int
        Number of trials for title and output naming.
    output_dir : str
        Directory to save the plot image. Default: 'plots'.

    Notes
    -----
    Duplicate quantile edges (common with low-cardinality importances) are
    removed before binning and the label list trimmed to match; the previous
    `pd.cut(..., duplicates='drop')` call raised ValueError in that case
    because the fixed five labels no longer matched the surviving bins.
    '''

    q = importance_df['Importance'].quantile
    all_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

    # BUGFIX: dedupe edges ourselves so the label count always matches the
    # bin count; pd.cut's duplicates='drop' drops edges but not labels.
    # NOTE(review): assumes importances are non-negative (lower edge 0.0,
    # as in the original) — negative values would fall outside all bins.
    raw_edges = [0.0, q(0.2), q(0.4), q(0.6), q(0.8), q(1.0)]
    edges = sorted(set(raw_edges))

    df = importance_df.copy()

    if len(edges) >= 2:
        # Bins are ordered low -> high; keep the matching leading labels.
        df['Group'] = pd.cut(df['Importance'], bins=edges, labels=all_labels[:len(edges) - 1], include_lowest=True)
    else:
        # Degenerate case: every importance is identical (single edge) — one group.
        df['Group'] = all_labels[0]

    df.to_csv(f"{output_dir}/pca{pca_type}_importance_groups_{n_trials}.csv", index=False)