#!/usr/bin/env python3
# Description
###############################################################################
'''
Summarize and visualize feature impact using Net Benefit Score (NBS) and related statistics.
Usage:
import OCDocker.OCScore.Analysis.Impact as ocimpact
It exposes high-level functions:
- build_impact_overview: build a table with NBS, direction, strength and stats
- plot_impact_arrows_inline_labels: render arrow plot with inline labels
- get_neutral_features: list neutral features by |NBS| < tau or Direction == 'neutral'
'''
# Imports
###############################################################################
from __future__ import annotations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from typing import Iterable, Optional, Sequence, Union
# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics
This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.
Contact: Artur Duque Rossi - arturossi10@gmail.com
'''
# Classes
###############################################################################
# Functions
###############################################################################
## Private ##
def _beneficial_categories(metric: str, categories: Iterable[str], custom: Optional[Iterable[str]] = None) -> set[str]:
'''
Decide which ordered categories are beneficial for the given metric.
Parameters
----------
metric : str
Metric name ('AUC', 'RMSE', or other).
categories : Iterable[str]
Ordered category labels from the contingency table columns.
custom : Optional[Iterable[str]]
Explicit set of beneficial categories. If provided, overrides defaults.
Returns
-------
set[str]
Set of category labels considered beneficial.
'''
cats = [str(c) for c in categories]
if custom is not None:
return set(map(str, custom))
m = metric.strip().upper()
if m == "AUC":
good = {c for c in cats if "high" in c.lower()}
elif m == "RMSE":
good = {c for c in cats if "low" in c.lower()}
else:
k = len(cats)
good = set(cats[k // 2 :])
return good if good else set(cats[-max(1, len(cats) // 2) :])
def _net_benefit(delta: pd.Series, beneficial: set[str]) -> float:
'''
Compute Net Benefit Score in [-1, 1] from Δp and beneficial categories.
Parameters
----------
delta : pd.Series
Δp per category (output of _proportion_delta).
beneficial : set[str]
Set of categories deemed beneficial.
Returns
-------
float
Net Benefit Score (NBS).
'''
idx = delta.index.astype(str)
good = [c for c in idx if c in beneficial]
bad = [c for c in idx if c not in beneficial]
return float(delta[good].sum() - delta[bad].sum())
def _proportion_delta(contingency: pd.DataFrame, presence_level: Union[int, str] = 1) -> pd.Series:
'''
Compute Δp(c) = p(c | feature=1) - p(c | feature=0) for each category c.
Parameters
----------
contingency : pd.DataFrame
Contingency table with rows as feature presence (0/1) and columns as categories.
presence_level : Union[int, str]
Row key representing presence (defaults to 1). Falls back if not present.
Returns
-------
pd.Series
Series of deltas indexed by category label.
'''
cont = contingency.copy()
props = cont.div(cont.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
k1: int | str
if presence_level in cont.index:
k1 = presence_level
elif str(presence_level) in cont.index:
k1 = str(presence_level)
else:
k1 = cont.index[-1]
k0: int | str
if 0 in cont.index:
k0 = 0
elif "0" in cont.index:
k0 = "0"
else:
k0 = cont.index[0]
return (props.loc[k1] - props.loc[k0]).astype(float)
def _strength_from_nbs_norm(nbs_norm: float, thresholds: Sequence[float] = (0.10, 0.20, 0.35)) -> str:
'''
Bucketize |NBS_norm| into qualitative strength classes.
Parameters
----------
nbs_norm : float
Net Benefit Score normalized to [-1, 1].
thresholds : Sequence[float]
Cutoffs applied to |NBS_norm| to define 'weak', 'moderate', 'strong'.
Returns
-------
str
One of: 'none', 'weak', 'moderate', 'strong', 'very strong'.
'''
a = abs(float(nbs_norm))
if a == 0:
return "none"
if a < thresholds[0]:
return "weak"
if a < thresholds[1]:
return "moderate"
if a < thresholds[2]:
return "strong"
return "very strong"
def _strength_from_v(v: float) -> str:
'''
Map Cramér's V value to a qualitative strength label.
Parameters
----------
v : float
Cramér's V statistic value.
Returns
-------
str
One of: 'unknown', 'none', 'weak', 'moderate', 'strong', 'very strong'.
'''
if pd.isna(v):
return "unknown"
if v < 0.10:
return "none"
if v < 0.20:
return "weak"
if v < 0.30:
return "moderate"
if v < 0.50:
return "strong"
return "very strong"
## Public ##
[docs]
def build_impact_overview(
chi_df: pd.DataFrame,
contingency_dict: dict[str, pd.DataFrame],
metric: str,
presence_level: Union[int, str] = 1,
beneficial_custom: Optional[Iterable[str]] = None,
tau: float = 0.05,
) -> pd.DataFrame:
'''
Build a clear impact table with NBS, direction, strength and stats.
Parameters
----------
chi_df : pd.DataFrame
DataFrame with chi-square outcomes, requires at least columns
['Feature', "Cramér's V", 'Chi2 Statistic', 'p-value'].
contingency_dict : dict[str, pd.DataFrame]
Mapping from feature -> contingency table with rows as presence (0/1)
and columns as ordered categories (strings).
metric : str
Metric name used to identify beneficial categories ('AUC' or 'RMSE').
presence_level : Union[int, str], optional
Row key considered as presence (default: 1). If not found, falls back.
beneficial_custom : Optional[Iterable[str]], optional
Explicit set of beneficial categories to use instead of defaults.
tau : float, optional
Tolerance to classify neutral direction by |NBS| < tau (default: 0.05).
Returns
-------
pd.DataFrame
Sorted DataFrame with columns: 'Feature', 'NBS', 'Direction', 'Strength',
'Chi2', 'p-value', 'CramersV', 'FavoredCategory', 'HurtCategory',
'|NBS|', 'NegLog10P'.
'''
any_cont = next(iter(contingency_dict.values()))
categories = any_cont.columns.astype(str).tolist()
beneficial = _beneficial_categories(metric, categories, custom=beneficial_custom)
feature_idx = chi_df.columns.get_loc('Feature')
chi2_idx = chi_df.columns.get_loc('Chi2 Statistic') if 'Chi2 Statistic' in chi_df.columns else None
pvalue_idx = chi_df.columns.get_loc('p-value') if 'p-value' in chi_df.columns else None
cramers_v_idx = chi_df.columns.get_loc("Cramér's V") if "Cramér's V" in chi_df.columns else None
rows = []
for row in chi_df.itertuples(index = False, name = None):
feat = row[feature_idx]
chi2_val = row[chi2_idx] if chi2_idx is not None else np.nan
pvalue_val = row[pvalue_idx] if pvalue_idx is not None else np.nan
cramers_v_val = row[cramers_v_idx] if cramers_v_idx is not None else np.nan
cont = contingency_dict.get(feat)
if cont is None or cont.empty:
rows.append({
'Feature': feat,
'NBS': np.nan,
'Direction': 'neutral',
'Strength': 'unknown',
'Chi2': chi2_val,
'p-value': pvalue_val,
'CramersV': cramers_v_val,
'FavoredCategory': None,
'HurtCategory': None,
})
continue
delta = _proportion_delta(cont, presence_level=presence_level)
nbs = _net_benefit(delta, beneficial)
if abs(nbs) < tau:
direction = 'neutral'
else:
direction = 'positive' if nbs > 0 else 'negative'
favored = delta.idxmax()
hurt = delta.idxmin()
rows.append({
'Feature': feat,
'NBS': nbs,
'Direction': direction,
'Strength': _strength_from_v(cramers_v_val),
'Chi2': chi2_val,
'p-value': pvalue_val,
'CramersV': cramers_v_val,
'FavoredCategory': favored,
'HurtCategory': hurt,
})
out = pd.DataFrame(rows)
out['|NBS|'] = out['NBS'].abs()
out['NegLog10P'] = -np.log10(np.clip(out['p-value'].astype(float), 1e-300, 1.0))
return out.sort_values(['Direction', '|NBS|', 'NegLog10P'], ascending=[True, False, False])
[docs]
def get_neutral_features(impact_df: pd.DataFrame, tau: float = 0.05) -> list[str]:
'''
Return a sorted list of neutral features by Direction or |NBS| < tau.
Parameters
----------
impact_df : pd.DataFrame
DataFrame returned by build_impact_overview, with 'NBS' and 'Direction'.
tau : float, optional
Neutrality threshold on original NBS scale (default: 0.05).
Returns
-------
list[str]
Sorted list of neutral feature names.
'''
if 'Direction' in impact_df.columns:
series = (
impact_df.loc[impact_df['Direction'] == 'neutral', 'Feature']
.astype(str)
.sort_values()
)
return [str(x) for x in series]
series = (
impact_df.loc[impact_df['NBS'].abs() < tau, 'Feature']
.astype(str)
.sort_values()
)
return [str(x) for x in series]
[docs]
def plot_impact_arrows_inline_labels(
impact_df: pd.DataFrame,
title: str,
outpath: Optional[str] = None,
tau: float = 0.05,
thresholds: Sequence[float] = (0.10, 0.20, 0.35),
xpad: float = 0.025,
height_per_feature: float = 0.42,
max_height: float = 28.0,
font_size: int = 10,
) -> None:
'''
Render an arrow plot with inline feature labels based on NBS.
Parameters
----------
impact_df : pd.DataFrame
DataFrame with columns ['Feature','NBS'] and optionally 'Direction'.
title : str
Plot title.
outpath : Optional[str], optional
Output image path. If None, the figure is not saved to disk.
tau : float, optional
Neutrality threshold on original NBS scale (default: 0.05).
thresholds : Sequence[float], optional
Thresholds for marker strength derived from |NBS_norm| (default: 0.10, 0.20, 0.35).
xpad : float, optional
Horizontal text offset relative to marker (default: 0.025).
height_per_feature : float, optional
Figure height contribution per feature (default: 0.42).
max_height : float, optional
Maximum figure height (default: 28.0).
font_size : int, optional
Font size for labels (default: 10).
'''
df = impact_df.copy()
# Normalize to [-1, 1] for visualization
df['NBS_norm'] = (df['NBS'] / 2.0).clip(-1.0, 1.0)
tau_norm = tau / 2.0
if 'Direction' not in df.columns:
df['Direction'] = np.where(
df['NBS'] > +tau, 'positive', np.where(df['NBS'] < -tau, 'negative', 'neutral')
)
df['Strength'] = df['NBS_norm'].apply(lambda v: _strength_from_nbs_norm(v, thresholds))
color_map = {'positive': '#2ca02c', 'negative': '#d62728', 'neutral': '#7f7f7f'}
marker_map = {'none': 'o', 'weak': 'o', 'moderate': 's', 'strong': 'D', 'very strong': 'P'}
size_map = {'none': 40, 'weak': 60, 'moderate': 90, 'strong': 120, 'very strong': 150}
alpha_map = {'none': 0.35, 'weak': 0.5, 'moderate': 0.7, 'strong': 0.9, 'very strong': 0.95}
df['Color'] = df['Direction'].map(color_map).fillna('#7f7f7f')
df['Marker'] = df['Strength'].map(marker_map).fillna('o')
df['Size'] = df['Strength'].map(size_map).fillna(60)
df['Alpha'] = df['Strength'].map(alpha_map).fillna(0.6)
df = df.sort_values('NBS_norm').reset_index(drop=True)
fig_h = min(max(6.0, height_per_feature * len(df)), max_height)
plt.figure(figsize=(12, fig_h))
for i, row in enumerate(df.itertuples(index = False)):
plt.plot([0, row.NBS_norm], [i, i], color=row.Color, linewidth=2, alpha=row.Alpha)
plt.scatter(
row.NBS_norm,
i,
s=row.Size,
c=row.Color,
marker=row.Marker,
edgecolor='k',
linewidth=0.4,
zorder=3,
)
for i, row in enumerate(df.itertuples(index = False)):
x = row.NBS_norm + (xpad if row.NBS_norm >= 0 else -xpad)
ha = 'left' if row.NBS_norm >= 0 else 'right'
plt.text(x, i, str(row.Feature), va='center', ha=ha, fontsize=font_size)
plt.yticks([], [])
plt.axvline(0, color='k', linestyle='--', linewidth=1)
plt.axvline(+tau_norm, color='k', linestyle=':', linewidth=1)
plt.axvline(-tau_norm, color='k', linestyle=':', linewidth=1)
plt.xlabel("Net Benefit Score (normalized: −1 worse ← 0 → better +1)")
plt.xlim(-1.05, 1.05)
plt.title(title)
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
leg_dir = [
mpatches.Patch(color=color, label=lbl)
for lbl, color in [('positive', '#2ca02c'), ('negative', '#d62728'), ('neutral', '#7f7f7f')]
]
leg_str = [
mlines.Line2D([], [], color='k', marker='o', linestyle='None', markersize=8, label='none/weak'),
mlines.Line2D([], [], color='k', marker='s', linestyle='None', markersize=8, label='moderate'),
mlines.Line2D([], [], color='k', marker='D', linestyle='None', markersize=8, label='strong'),
mlines.Line2D([], [], color='k', marker='P', linestyle='None', markersize=8, label='very strong'),
]
lg1 = plt.legend(handles=leg_dir, title='direction', loc='upper left', frameon=False, fontsize=9)
plt.gca().add_artist(lg1)
plt.legend(handles=leg_str, title='strength (symbols)', loc='upper right', frameon=False, fontsize=9)
plt.tight_layout()
if outpath:
plt.savefig(outpath, dpi=300)
plt.close()
# Public API
###############################################################################