Source code for OCDocker.OCScore.Analysis.StudyProcessing

#!/usr/bin/env python3

# Description
###############################################################################
'''
Parse and structure Optuna study results into best-RMSE, best-AUC, and
best-combined views with consensus metrics.

Usage:

from OCDocker.OCScore.Analysis.StudyProcessing import get_study_data
'''

# Imports
###############################################################################

import pandas as pd

from typing import Union

import OCDocker.Error as ocerror
import OCDocker.OCScore.Utils.StudyParser as ocstudy

# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics

This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.

Contact: Artur Duque Rossi - arturossi10@gmail.com
'''

# Classes
###############################################################################


# Functions
###############################################################################
## Private ##

## Public ##
def _label_study_range(
    results_df: pd.DataFrame,
    start: Union[int, None],
    end: Union[int, None],
    label: str,
) -> None:
    '''
    Write ``label`` into the "study_type" column for rows ``start`` .. ``end - 1``.

    The frame is modified in place. Nothing is done unless both bounds are given.

    Parameters
    ----------
    results_df : pd.DataFrame
        Study results frame to annotate.
    start : int | None
        First index label of the range (inclusive).
    end : int | None
        End of the range (exclusive).
    label : str
        Methodology label to store in "study_type".

    Raises
    ------
    ValueError
        If both bounds are given and ``start >= end``.
    '''

    if start is None or end is None:
        return

    if start >= end:
        # User-facing error: invalid index range
        ocerror.Error.value_error(f"Invalid index range for '{label}': start ({start}) must be less than end ({end})")
        raise ValueError(f"The start index for '{label}' must be less than the end index.")

    # .loc slices by index label and includes both endpoints, hence ``end - 1``
    results_df.loc[start:end - 1, 'study_type'] = label


def _fold_auc(auc: pd.Series) -> pd.Series:
    '''
    Reflect AUC values below 0.5 to ``1 - value``; other values (and NaN) are kept.
    '''

    return auc.mask(auc < 0.5, 1 - auc)


def get_study_data(
    snames : list[str],
    storage : str,
    final_metrics : pd.DataFrame,
    n_trials : int,
    error_threshold : float = 1.5,
    nn_ae_start: Union[int, None] = None,
    nn_ae_end: Union[int, None] = None,
    xgb_ga_start: Union[int, None] = None,
    xgb_ga_end: Union[int, None] = None
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, float, float, float, float, float, float]:
    '''
    Retrieve Optuna study data and structure it by best RMSE, AUC, and combined metrics.

    Three views (best RMSE, best AUC, best combined) are extracted from the
    study results, given common column names ("Methodology", "Experiment",
    "RMSE", "AUC", "combined_metric"), extended with the rows of
    ``final_metrics`` and with an "AUC New" column in which AUC values below
    0.5 are replaced by ``1 - AUC``.

    Parameters
    ----------
    snames : list[str]
        List of study names.
    storage : str
        SQLAlchemy storage string.
    final_metrics : pd.DataFrame
        Consensus and raw metric dataframe; its rows are appended to every view.
    n_trials : int
        Number of trials per study.
    error_threshold : float
        Rows with RMSE above this value are dropped from the returned views.
    nn_ae_start : int | None, optional
        Start index (inclusive) for "NN + AE" labeling.
    nn_ae_end : int | None, optional
        End index (exclusive) for "NN + AE" labeling.
    xgb_ga_start : int | None, optional
        Start index (inclusive) for "XGB + GA" labeling.
    xgb_ga_end : int | None, optional
        End index (exclusive) for "XGB + GA" labeling.

    Returns
    -------
    tuple
        In order: filtered best-RMSE view, filtered best-AUC view, filtered
        best-combined view, the full ``results_df``, minimum "AUC New",
        maximum "AUC New", minimum RMSE, maximum RMSE, RMSE range
        (max - min) and "AUC New" range (max - min). The minima, maxima and
        ranges are computed over all three views before filtering.

    Raises
    ------
    ValueError
        If a labeling range is given with a start that is not less than its end.

    Notes
    -----
    A labeling range is applied only when both of its bounds are provided.
    "XGB + GA" is applied after "NN + AE", so it wins where the ranges overlap.
    '''

    results_df = ocstudy.analyze_studies_old(snames, storage=storage, n_trials=n_trials)

    _label_study_range(results_df, nn_ae_start, nn_ae_end, 'NN + AE')
    _label_study_range(results_df, xgb_ga_start, xgb_ga_end, 'XGB + GA')

    # Extract views and rename their columns (rename returns a new frame)
    best_rmse_df = results_df[
        ["study_name", "study_type", "best_rmse_number", "best_rmse_value", "best_rmse_auc"]
    ].rename(columns={
        "study_type": "Methodology",
        "best_rmse_number": "Experiment",
        "best_rmse_value": "RMSE",
        "best_rmse_auc": "AUC",
    })
    best_auc_df = results_df[
        ["study_name", "study_type", "best_auc_number", "best_auc_value", "best_auc"]
    ].rename(columns={
        "study_type": "Methodology",
        "best_auc_number": "Experiment",
        "best_auc_value": "RMSE",
        "best_auc": "AUC",
    })
    best_combined_df = results_df[
        ["study_name", "study_type", "best_combined_number", "best_combined_metric", "best_combined_value", "best_combined_auc"]
    ].rename(columns={
        "study_type": "Methodology",
        "best_combined_number": "Experiment",
        "best_combined_metric": "combined_metric",
        "best_combined_value": "RMSE",
        "best_combined_auc": "AUC",
    })

    # Add combined metric to RMSE and AUC views (the combined view already has it)
    for view in (best_rmse_df, best_auc_df):
        view["combined_metric"] = view["RMSE"] - view["AUC"]

    # Merge in final metrics, renumbering the rows from zero
    views = [
        pd.concat([view, final_metrics], axis=0, ignore_index=True)
        for view in (best_rmse_df, best_auc_df, best_combined_df)
    ]

    # AUC normalization
    for view in views:
        view['AUC New'] = _fold_auc(view['AUC'])

    # Global ranges over all views; pooling the columns lets pandas skip NaN
    # consistently instead of depending on the order of per-view results.
    all_rmse = pd.concat([view['RMSE'] for view in views], ignore_index=True)
    all_auc = pd.concat([view['AUC New'] for view in views], ignore_index=True)
    min_error, max_error = all_rmse.min(), all_rmse.max()
    min_auc, max_auc = all_auc.min(), all_auc.max()

    # Filter; copy so callers can modify the returned frames freely
    best_rmse_df_filtered, best_auc_df_filtered, best_combined_df_filtered = (
        view[view['RMSE'] <= error_threshold].copy() for view in views
    )

    return (
        best_rmse_df_filtered,
        best_auc_df_filtered,
        best_combined_df_filtered,
        results_df,
        min_auc,
        max_auc,
        min_error,
        max_error,
        max_error - min_error,
        max_auc - min_auc
    )