# Source code for OCDocker.OCScore.Analysis.Impact

#!/usr/bin/env python3

# Description
###############################################################################
'''
Summarize and visualize feature impact using Net Benefit Score (NBS) and related statistics.

Usage:

import OCDocker.OCScore.Analysis.Impact as ocimpact

It exposes high-level functions:

- build_impact_overview: build a table with NBS, direction, strength and stats
- plot_impact_arrows_inline_labels: render arrow plot with inline labels
- get_neutral_features: list neutral features by |NBS| < tau or Direction == 'neutral'

'''

# Imports
###############################################################################
from __future__ import annotations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from typing import Iterable, Optional, Sequence, Union


# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics

This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.

Contact: Artur Duque Rossi - arturossi10@gmail.com
'''

# Classes
###############################################################################


# Functions
###############################################################################
## Private ##

def _beneficial_categories(metric: str, categories: Iterable[str], custom: Optional[Iterable[str]] = None) -> set[str]:
    '''
    Decide which ordered categories are beneficial for the given metric.

    Parameters
    ----------
    metric : str
        Metric name ('AUC', 'RMSE', or other).
    categories : Iterable[str]
        Ordered category labels from the contingency table columns.
    custom : Optional[Iterable[str]]
        Explicit set of beneficial categories. If provided, overrides defaults.

    Returns
    -------
    set[str]
        Set of category labels considered beneficial.
    '''

    cats = [str(c) for c in categories]
    if custom is not None:
        return set(map(str, custom))

    m = metric.strip().upper()
    if m == "AUC":
        good = {c for c in cats if "high" in c.lower()}
    elif m == "RMSE":
        good = {c for c in cats if "low" in c.lower()}
    else:
        k = len(cats)
        good = set(cats[k // 2 :])
    return good if good else set(cats[-max(1, len(cats) // 2) :])


def _net_benefit(delta: pd.Series, beneficial: set[str]) -> float:
    '''
    Compute Net Benefit Score in [-1, 1] from Δp and beneficial categories.

    Parameters
    ----------
    delta : pd.Series
        Δp per category (output of _proportion_delta).
    beneficial : set[str]
        Set of categories deemed beneficial.

    Returns
    -------
    float
        Net Benefit Score (NBS).
    '''

    idx = delta.index.astype(str)
    good = [c for c in idx if c in beneficial]
    bad = [c for c in idx if c not in beneficial]
    return float(delta[good].sum() - delta[bad].sum())


def _proportion_delta(contingency: pd.DataFrame, presence_level: Union[int, str] = 1) -> pd.Series:
    '''
    Compute Δp(c) = p(c | feature=1) - p(c | feature=0) for each category c.

    Parameters
    ----------
    contingency : pd.DataFrame
        Contingency table with rows as feature presence (0/1) and columns as categories.
    presence_level : Union[int, str]
        Row key representing presence (defaults to 1). Falls back if not present.

    Returns
    -------
    pd.Series
        Series of deltas indexed by category label.
    '''

    cont = contingency.copy()
    props = cont.div(cont.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)

    k1: int | str
    if presence_level in cont.index:
        k1 = presence_level
    elif str(presence_level) in cont.index:
        k1 = str(presence_level)
    else:
        k1 = cont.index[-1]

    k0: int | str
    if 0 in cont.index:
        k0 = 0
    elif "0" in cont.index:
        k0 = "0"
    else:
        k0 = cont.index[0]

    return (props.loc[k1] - props.loc[k0]).astype(float)


def _strength_from_nbs_norm(nbs_norm: float, thresholds: Sequence[float] = (0.10, 0.20, 0.35)) -> str:
    '''
    Bucketize |NBS_norm| into qualitative strength classes.

    Parameters
    ----------
    nbs_norm : float
        Net Benefit Score normalized to [-1, 1].
    thresholds : Sequence[float]
        Cutoffs applied to |NBS_norm| to define 'weak', 'moderate', 'strong'.

    Returns
    -------
    str
        One of: 'none', 'weak', 'moderate', 'strong', 'very strong'.
    '''

    a = abs(float(nbs_norm))
    if a == 0:
        return "none"
    if a < thresholds[0]:
        return "weak"
    if a < thresholds[1]:
        return "moderate"
    if a < thresholds[2]:
        return "strong"
    return "very strong"


def _strength_from_v(v: float) -> str:
    '''
    Map Cramér's V value to a qualitative strength label.

    Parameters
    ----------
    v : float
        Cramér's V statistic value.

    Returns
    -------
    str
        One of: 'unknown', 'none', 'weak', 'moderate', 'strong', 'very strong'.
    '''

    if pd.isna(v):
        return "unknown"
    if v < 0.10:
        return "none"
    if v < 0.20:
        return "weak"
    if v < 0.30:
        return "moderate"
    if v < 0.50:
        return "strong"
    return "very strong"


## Public ##

def build_impact_overview(
    chi_df: pd.DataFrame,
    contingency_dict: dict[str, pd.DataFrame],
    metric: str,
    presence_level: Union[int, str] = 1,
    beneficial_custom: Optional[Iterable[str]] = None,
    tau: float = 0.05,
) -> pd.DataFrame:
    '''
    Build a clear impact table with NBS, direction, strength and stats.

    Parameters
    ----------
    chi_df : pd.DataFrame
        DataFrame with chi-square outcomes, requires at least columns
        ['Feature', "Cramér's V", 'Chi2 Statistic', 'p-value'].
    contingency_dict : dict[str, pd.DataFrame]
        Mapping from feature -> contingency table with rows as presence (0/1)
        and columns as ordered categories (strings).
    metric : str
        Metric name used to identify beneficial categories ('AUC' or 'RMSE').
    presence_level : Union[int, str], optional
        Row key considered as presence (default: 1). If not found, falls back.
    beneficial_custom : Optional[Iterable[str]], optional
        Explicit set of beneficial categories to use instead of defaults.
    tau : float, optional
        Tolerance to classify neutral direction by |NBS| < tau (default: 0.05).

    Returns
    -------
    pd.DataFrame
        Sorted DataFrame with columns: 'Feature', 'NBS', 'Direction',
        'Strength', 'Chi2', 'p-value', 'CramersV', 'FavoredCategory',
        'HurtCategory', '|NBS|', 'NegLog10P'.
    '''

    # Use any contingency table to discover the ordered category labels, then
    # decide which of them count as beneficial for this metric.
    any_cont = next(iter(contingency_dict.values()))
    categories = any_cont.columns.astype(str).tolist()
    beneficial = _beneficial_categories(metric, categories, custom=beneficial_custom)

    # Resolve column positions once so itertuples can be used without names
    # (labels such as "Cramér's V" are not valid Python identifiers).
    feature_idx = chi_df.columns.get_loc('Feature')
    chi2_idx = chi_df.columns.get_loc('Chi2 Statistic') if 'Chi2 Statistic' in chi_df.columns else None
    pvalue_idx = chi_df.columns.get_loc('p-value') if 'p-value' in chi_df.columns else None
    cramers_v_idx = chi_df.columns.get_loc("Cramér's V") if "Cramér's V" in chi_df.columns else None

    rows = []
    for row in chi_df.itertuples(index = False, name = None):
        feat = row[feature_idx]
        chi2_val = row[chi2_idx] if chi2_idx is not None else np.nan
        pvalue_val = row[pvalue_idx] if pvalue_idx is not None else np.nan
        cramers_v_val = row[cramers_v_idx] if cramers_v_idx is not None else np.nan

        cont = contingency_dict.get(feat)
        if cont is None or cont.empty:
            # No contingency data for this feature: emit a neutral placeholder
            # so the feature still appears in the overview.
            rows.append({
                'Feature': feat,
                'NBS': np.nan,
                'Direction': 'neutral',
                'Strength': 'unknown',
                'Chi2': chi2_val,
                'p-value': pvalue_val,
                'CramersV': cramers_v_val,
                'FavoredCategory': None,
                'HurtCategory': None,
            })
            continue

        delta = _proportion_delta(cont, presence_level=presence_level)
        nbs = _net_benefit(delta, beneficial)

        # |NBS| below the tolerance tau is treated as noise -> neutral.
        if abs(nbs) < tau:
            direction = 'neutral'
        else:
            direction = 'positive' if nbs > 0 else 'negative'

        rows.append({
            'Feature': feat,
            'NBS': nbs,
            'Direction': direction,
            'Strength': _strength_from_v(cramers_v_val),
            'Chi2': chi2_val,
            'p-value': pvalue_val,
            'CramersV': cramers_v_val,
            'FavoredCategory': delta.idxmax(),
            'HurtCategory': delta.idxmin(),
        })

    out = pd.DataFrame(rows)
    out['|NBS|'] = out['NBS'].abs()
    # Clip p-values away from zero so log10 never receives 0 (would give inf).
    out['NegLog10P'] = -np.log10(np.clip(out['p-value'].astype(float), 1e-300, 1.0))
    return out.sort_values(['Direction', '|NBS|', 'NegLog10P'], ascending=[True, False, False])
def get_neutral_features(impact_df: pd.DataFrame, tau: float = 0.05) -> list[str]:
    '''
    Return a sorted list of neutral features by Direction or |NBS| < tau.

    Parameters
    ----------
    impact_df : pd.DataFrame
        DataFrame returned by build_impact_overview, with 'NBS' and 'Direction'.
    tau : float, optional
        Neutrality threshold on original NBS scale (default: 0.05).

    Returns
    -------
    list[str]
        Sorted list of neutral feature names.
    '''

    # Prefer the precomputed 'Direction' label when available; otherwise fall
    # back to thresholding |NBS| directly with tau.
    if 'Direction' in impact_df.columns:
        mask = impact_df['Direction'] == 'neutral'
    else:
        mask = impact_df['NBS'].abs() < tau

    series = impact_df.loc[mask, 'Feature'].astype(str).sort_values()
    return [str(x) for x in series]
def plot_impact_arrows_inline_labels(
    impact_df: pd.DataFrame,
    title: str,
    outpath: Optional[str] = None,
    tau: float = 0.05,
    thresholds: Sequence[float] = (0.10, 0.20, 0.35),
    xpad: float = 0.025,
    height_per_feature: float = 0.42,
    max_height: float = 28.0,
    font_size: int = 10,
) -> None:
    '''
    Render an arrow plot with inline feature labels based on NBS.

    Parameters
    ----------
    impact_df : pd.DataFrame
        DataFrame with columns ['Feature','NBS'] and optionally 'Direction'.
    title : str
        Plot title.
    outpath : Optional[str], optional
        Output image path. If None, the figure is not saved to disk.
    tau : float, optional
        Neutrality threshold on original NBS scale (default: 0.05).
    thresholds : Sequence[float], optional
        Thresholds for marker strength derived from |NBS_norm|
        (default: 0.10, 0.20, 0.35).
    xpad : float, optional
        Horizontal text offset relative to marker (default: 0.025).
    height_per_feature : float, optional
        Figure height contribution per feature (default: 0.42).
    max_height : float, optional
        Maximum figure height (default: 28.0).
    font_size : int, optional
        Font size for labels (default: 10).

    Returns
    -------
    None
    '''

    df = impact_df.copy()

    # Normalize to [-1, 1] for visualization
    df['NBS_norm'] = (df['NBS'] / 2.0).clip(-1.0, 1.0)
    tau_norm = tau / 2.0

    # Derive a direction label from NBS when the caller did not provide one.
    if 'Direction' not in df.columns:
        df['Direction'] = np.where(
            df['NBS'] > +tau, 'positive',
            np.where(df['NBS'] < -tau, 'negative', 'neutral')
        )

    df['Strength'] = df['NBS_norm'].apply(lambda v: _strength_from_nbs_norm(v, thresholds))

    # Visual encodings: color = direction; marker shape/size/alpha = strength.
    color_map = {'positive': '#2ca02c', 'negative': '#d62728', 'neutral': '#7f7f7f'}
    marker_map = {'none': 'o', 'weak': 'o', 'moderate': 's', 'strong': 'D', 'very strong': 'P'}
    size_map = {'none': 40, 'weak': 60, 'moderate': 90, 'strong': 120, 'very strong': 150}
    alpha_map = {'none': 0.35, 'weak': 0.5, 'moderate': 0.7, 'strong': 0.9, 'very strong': 0.95}

    df['Color'] = df['Direction'].map(color_map).fillna('#7f7f7f')
    df['Marker'] = df['Strength'].map(marker_map).fillna('o')
    df['Size'] = df['Strength'].map(size_map).fillna(60)
    df['Alpha'] = df['Strength'].map(alpha_map).fillna(0.6)

    df = df.sort_values('NBS_norm').reset_index(drop=True)

    fig_h = min(max(6.0, height_per_feature * len(df)), max_height)
    plt.figure(figsize=(12, fig_h))

    # Draw one horizontal "arrow" (stem line + strength marker) per feature.
    for i, row in enumerate(df.itertuples(index = False)):
        plt.plot([0, row.NBS_norm], [i, i], color=row.Color, linewidth=2, alpha=row.Alpha)
        plt.scatter(
            row.NBS_norm, i,
            s=row.Size, c=row.Color, marker=row.Marker,
            edgecolor='k', linewidth=0.4, zorder=3,
        )

    # Inline feature labels just beyond each marker, on the pointing side.
    for i, row in enumerate(df.itertuples(index = False)):
        x = row.NBS_norm + (xpad if row.NBS_norm >= 0 else -xpad)
        ha = 'left' if row.NBS_norm >= 0 else 'right'
        plt.text(x, i, str(row.Feature), va='center', ha=ha, fontsize=font_size)

    plt.yticks([], [])
    plt.axvline(0, color='k', linestyle='--', linewidth=1)
    plt.axvline(+tau_norm, color='k', linestyle=':', linewidth=1)
    plt.axvline(-tau_norm, color='k', linestyle=':', linewidth=1)
    plt.xlabel("Net Benefit Score (normalized: −1 worse ← 0 → better +1)")
    plt.xlim(-1.05, 1.05)
    plt.title(title)

    import matplotlib.lines as mlines
    import matplotlib.patches as mpatches

    # Two independent legends: direction (colors) and strength (symbols).
    leg_dir = [
        mpatches.Patch(color=color, label=lbl)
        for lbl, color in [('positive', '#2ca02c'), ('negative', '#d62728'), ('neutral', '#7f7f7f')]
    ]
    leg_str = [
        mlines.Line2D([], [], color='k', marker='o', linestyle='None', markersize=8, label='none/weak'),
        mlines.Line2D([], [], color='k', marker='s', linestyle='None', markersize=8, label='moderate'),
        mlines.Line2D([], [], color='k', marker='D', linestyle='None', markersize=8, label='strong'),
        mlines.Line2D([], [], color='k', marker='P', linestyle='None', markersize=8, label='very strong'),
    ]
    lg1 = plt.legend(handles=leg_dir, title='direction', loc='upper left', frameon=False, fontsize=9)
    # add_artist keeps the first legend alive when the second one is created.
    plt.gca().add_artist(lg1)
    plt.legend(handles=leg_str, title='strength (symbols)', loc='upper right', frameon=False, fontsize=9)

    plt.tight_layout()
    if outpath:
        plt.savefig(outpath, dpi=300)
    # NOTE(review): in the extracted source the placement of plt.close()
    # relative to the `if outpath:` guard is ambiguous; closing unconditionally
    # to avoid leaking figures — confirm against the original layout.
    plt.close()
# Public API ###############################################################################