#!/usr/bin/env python3
# Description
###############################################################################
'''
Parse and structure Optuna study results into best-RMSE, best-AUC, and
best-combined views with consensus metrics.
Usage:
from OCDocker.OCScore.Analysis.StudyProcessing import get_study_data
'''
# Imports
###############################################################################
import pandas as pd
from typing import Union
import OCDocker.Error as ocerror
import OCDocker.OCScore.Utils.StudyParser as ocstudy
# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics
This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.
Contact: Artur Duque Rossi - arturossi10@gmail.com
'''
# Classes
###############################################################################
# Functions
###############################################################################
## Private ##
## Public ##
def _label_study_range(
    results_df : pd.DataFrame,
    label : str,
    start : Union[int, None],
    end : Union[int, None]
) -> None:
    '''
    Tag a half-open row range [start, end) of `results_df` with a methodology label.

    No-op when either bound is None. Validates that start < end, reporting the
    problem through the project error channel before raising.

    Parameters
    ----------
    results_df : pd.DataFrame
        Study results frame; its 'study_type' column is written in place.
    label : str
        Methodology label to assign (e.g. 'NN + AE', 'XGB + GA').
    start : int | None
        First positional index to label (inclusive), or None to skip.
    end : int | None
        End positional index (exclusive), or None to skip.

    Raises
    ------
    ValueError
        If start >= end.
    '''

    if start is None or end is None:
        return

    if start >= end:
        # User-facing error: invalid index range
        ocerror.Error.value_error(f"Invalid index range for '{label}': start ({start}) must be less than end ({end})")
        raise ValueError(f"The start index for '{label}' must be less than the end index.")

    # .loc slicing is end-inclusive, hence `end - 1` to keep [start, end) semantics
    results_df.loc[start:end - 1, 'study_type'] = label


def get_study_data(
        snames : list[str],
        storage : str,
        final_metrics : pd.DataFrame,
        n_trials : int,
        error_threshold : float = 1.5,
        nn_ae_start: Union[int, None] = None,
        nn_ae_end: Union[int, None] = None,
        xgb_ga_start: Union[int, None] = None,
        xgb_ga_end: Union[int, None] = None
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, float, float, float, float, float, float]:
    '''
    Retrieve Optuna study data and structure it by best RMSE, AUC, and combined metrics.

    Parameters
    ----------
    snames : list[str]
        List of study names.
    storage : str
        SQLAlchemy storage string.
    final_metrics : pd.DataFrame
        Consensus and raw metric dataframe (concatenated onto each view).
    n_trials : int
        Number of trials per study.
    error_threshold : float
        Threshold to filter maximum RMSE.
    nn_ae_start : int | None, optional
        Start index for "NN + AE" labeling.
    nn_ae_end : int | None, optional
        End index for "NN + AE" labeling (exclusive).
    xgb_ga_start : int | None, optional
        Start index for "XGB + GA" labeling.
    xgb_ga_end : int | None, optional
        End index for "XGB + GA" labeling (exclusive).

    Returns
    -------
    tuple
        (best_rmse_df_filtered, best_auc_df_filtered, best_combined_df_filtered,
        results_df, min_auc, max_auc, min_error, max_error,
        error range, AUC range). Ranges are computed BEFORE the RMSE filter
        is applied, over all three views combined.

    Raises
    ------
    ValueError
        If either labeling index range has start >= end.
    '''

    results_df = ocstudy.analyze_studies_old(snames, storage=storage, n_trials=n_trials)

    # Tag methodology ranges (each validates its own indices)
    _label_study_range(results_df, 'NN + AE', nn_ae_start, nn_ae_end)
    _label_study_range(results_df, 'XGB + GA', xgb_ga_start, xgb_ga_end)

    # Extract views
    best_rmse_df = results_df[["study_name", "study_type", "best_rmse_number", "best_rmse_value", "best_rmse_auc"]].copy()
    best_auc_df = results_df[["study_name", "study_type", "best_auc_number", "best_auc_value", "best_auc"]].copy()
    best_combined_df = results_df[["study_name", "study_type", "best_combined_number", "best_combined_metric", "best_combined_value", "best_combined_auc"]].copy()

    # Rename columns to the shared presentation schema
    best_rmse_df.rename(columns={"study_type": "Methodology", "best_rmse_number": "Experiment", "best_rmse_value": "RMSE", "best_rmse_auc": "AUC"}, inplace=True)
    best_auc_df.rename(columns={"study_type": "Methodology", "best_auc_number": "Experiment", "best_auc_value": "RMSE", "best_auc": "AUC"}, inplace=True)
    best_combined_df.rename(columns={"study_type": "Methodology", "best_combined_number": "Experiment", "best_combined_metric": "combined_metric", "best_combined_value": "RMSE", "best_combined_auc": "AUC"}, inplace=True)

    # Add combined metric to RMSE and AUC views (lower RMSE and higher AUC are better)
    best_rmse_df["combined_metric"] = best_rmse_df["RMSE"] - best_rmse_df["AUC"]
    best_auc_df["combined_metric"] = best_auc_df["RMSE"] - best_auc_df["AUC"]

    # Merge in final metrics
    best_rmse_df = pd.concat([best_rmse_df, final_metrics], axis=0)
    best_auc_df = pd.concat([best_auc_df, final_metrics], axis=0)
    best_combined_df = pd.concat([best_combined_df, final_metrics], axis=0)

    views = [best_rmse_df, best_auc_df, best_combined_df]

    # AUC normalization: fold sub-0.5 AUCs onto [0.5, 1] (score inversion)
    for df in views:
        df['AUC New'] = df['AUC'].where(df['AUC'] >= 0.5, 1 - df['AUC'])
        df.reset_index(drop=True, inplace=True)

    # Global ranges across all three views, computed BEFORE threshold filtering
    min_error = min(df['RMSE'].min() for df in views)
    max_error = max(df['RMSE'].max() for df in views)
    min_auc = min(df['AUC New'].min() for df in views)
    max_auc = max(df['AUC New'].max() for df in views)

    # Filter out rows above the RMSE threshold
    best_rmse_df_filtered = best_rmse_df[best_rmse_df['RMSE'] <= error_threshold]
    best_auc_df_filtered = best_auc_df[best_auc_df['RMSE'] <= error_threshold]
    best_combined_df_filtered = best_combined_df[best_combined_df['RMSE'] <= error_threshold]

    return (
        best_rmse_df_filtered,
        best_auc_df_filtered,
        best_combined_df_filtered,
        results_df,
        min_auc, max_auc,
        min_error, max_error,
        max_error - min_error,
        max_auc - min_auc
    )