#!/usr/bin/env python3

# Description
###############################################################################
'''
SHAP utilities for reproducible computation (no I/O, no plots).

Usage:

from OCDocker.OCScore.Analysis.FeatureImportance import compute_shap_values

Public API:
- build_stratified_background
- make_explainer
- compute_shap_values
- shap_importance_table
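
Example (illustrative sketch; `model`, `X_df`, and `meta_df` are placeholder
names for a fitted classifier, a feature DataFrame, and a row-aligned metadata
DataFrame holding the stratification columns):

    bg = build_stratified_background(X_df, meta_df, ["target", "active"])
    explainer, class_idx = make_explainer(model, bg, method="auto")
    out = compute_shap_values(explainer, X_df, class_index=class_idx)
    table = shap_importance_table(out["shap_values"], feature_names=list(X_df.columns))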
'''

# Imports
###############################################################################
from __future__ import annotations

import numpy as np
import pandas as pd

from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

import OCDocker.Error as ocerror

# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics

This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.

Contact: Artur Duque Rossi - arturossi10@gmail.com
'''

__all__ = [
    "build_stratified_background",
    "compute_shap_values",
    "make_explainer",
    "shap_importance_table",
]

# Classes
###############################################################################


# Functions
###############################################################################
## Private ##

try:
    import shap
except Exception:  # pragma: no cover
    shap = None  # Deferred error: raised when functions are called

def _ensure_2d(X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    '''Guarantee 2D float64 array without copying unnecessarily.

    Parameters
    ----------
    X : Union[np.ndarray, pd.DataFrame]
        Feature matrix.

    Returns
    -------
    np.ndarray
        Feature matrix as a 2D float64 array.
    '''

    if isinstance(X, pd.DataFrame):
        return X.to_numpy(dtype=float, copy=False)
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    return X.astype(float, copy=False)


def _require_shap() -> None:
    '''Require shap to be installed.'''

    if shap is None:
        raise ImportError("shap is not installed. Please install `shap` to use these SHAP utilities.")


## Public ##

def build_stratified_background(
    X: Union[np.ndarray, pd.DataFrame],
    meta: pd.DataFrame,
    strata_cols: Sequence[str],
    per_stratum: int = 50,
    seed: int = 0,
) -> np.ndarray:
    '''Build a stratified background set by sampling up to `per_stratum` rows
    from each combination of `strata_cols` (e.g., ["target", "active"]).

    Preserves class/target balance in the background while bounding its size.

    Parameters
    ----------
    X : Union[np.ndarray, pd.DataFrame]
        Feature matrix.
    meta : pd.DataFrame
        Metadata DataFrame with stratification columns.
    strata_cols : Sequence[str]
        Column names to stratify by.
    per_stratum : int, optional
        Number of samples per stratum. Default is 50.
    seed : int, optional
        Random seed. Default is 0.

    Returns
    -------
    np.ndarray
        Background array with shape (n_bg, n_features).
    '''

    rng = np.random.default_rng(seed)
    X_arr = _ensure_2d(X)

    if len(meta) != X_arr.shape[0]:
        raise ValueError("meta rows must align with X rows")

    # Group on a positional index so the collected indices address rows of X_arr
    # directly, even when meta carries a non-default index.
    meta_pos = meta.reset_index(drop=True)

    idxs: List[int] = []
    for _combo, g in meta_pos.groupby(list(strata_cols), dropna=False):
        g_idx = g.index.to_numpy()
        if g_idx.size <= per_stratum:
            idxs.extend(g_idx.tolist())
        else:
            take = rng.choice(g_idx, size=per_stratum, replace=False)
            idxs.extend(take.tolist())

    idxs_arr = np.array(sorted(set(idxs)), dtype=int)
    return cast(np.ndarray, X_arr[idxs_arr])
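
# Illustrative usage (sketch; `X_df` and `meta_df` are hypothetical, row-aligned
# feature and metadata DataFrames):
#
#   bg = build_stratified_background(X_df, meta_df, ["target", "active"], per_stratum=50, seed=0)
#   # bg has shape (n_bg, n_features), with at most 50 rows per (target, active) combination.
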
def compute_shap_values(
    explainer: Any,
    X_eval: Union[np.ndarray, pd.DataFrame],
    task: str = "binary",
    nsamples: Optional[Union[int, str]] = "auto",
    class_index: int = 1,
) -> Dict[str, np.ndarray]:
    '''Compute SHAP values for the evaluation set.

    Parameters
    ----------
    explainer : Any
        SHAP explainer object.
    X_eval : Union[np.ndarray, pd.DataFrame]
        Evaluation dataset.
    task : str, optional
        Task type. Default is "binary".
    nsamples : Optional[Union[int, str]], optional
        Number of samples for KernelExplainer. Ignored by Tree/Deep explainers
        when not applicable. Default is "auto".
    class_index : int, optional
        For binary classification with explainers returning per-class arrays
        (lists), select this class index. Default is 1.

    Returns
    -------
    Dict[str, np.ndarray]
        Dictionary with keys:
        - "shap_values": (n_samples, n_features) array
        - "base_values": (n_samples,) array (scalar expected values are broadcast)
    '''

    _require_shap()
    X_eval_arr = _ensure_2d(X_eval)

    # Some explainers expose .shap_values (callable) with an optional nsamples argument
    try:
        vals = explainer.shap_values(X_eval_arr, nsamples=nsamples)  # KernelExplainer accepts nsamples
    except TypeError:
        vals = explainer.shap_values(X_eval_arr)  # Tree/Deep

    # Align output shape
    if isinstance(vals, list):
        # Per-class outputs; choose the desired class (binary: index 1)
        vals_use = np.asarray(vals[class_index], dtype=float)
    else:
        vals_use = np.asarray(vals, dtype=float)

    base = getattr(explainer, "expected_value", 0.0)
    if isinstance(base, (list, tuple, np.ndarray)):
        # Per-class base values; align with class_index if present
        if len(np.atleast_1d(base)) > class_index:
            base_val = np.atleast_1d(base)[class_index]
        else:
            base_val = np.atleast_1d(base).ravel()[0]
    else:
        base_val = float(base)

    # Some explainers return per-sample base values; broadcast if needed
    if np.ndim(base_val) == 0:
        base_values = np.full(X_eval_arr.shape[0], float(base_val), dtype=float)
    else:
        base_values = np.asarray(base_val, dtype=float)

    return {"shap_values": vals_use, "base_values": base_values}
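
# Illustrative usage (sketch; `explainer` comes from make_explainer and `X_df` is a
# hypothetical feature DataFrame):
#
#   out = compute_shap_values(explainer, X_df, task="binary", nsamples=200, class_index=1)
#   shap_matrix = out["shap_values"]   # (n_samples, n_features)
#   base = out["base_values"]          # (n_samples,)
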
def make_explainer(
    model: Any,
    background: np.ndarray,
    method: str = "auto",
    link: Optional[str] = None,
    predict_fn: Optional[Callable] = None,
) -> Tuple[Any, int]:
    '''Create a SHAP explainer for the given model and background.

    Parameters
    ----------
    model : Any
        The model to explain. Can be a tree model, a PyTorch/TensorFlow model,
        or any other model.
    background : np.ndarray
        Background dataset for the SHAP explainer.
    method : str, optional
        Method to use: "auto" (TreeExplainer if tree model; DeepExplainer if
        torch/TF; else KernelExplainer), "tree", "deep", or "kernel".
        Default is "auto".
    link : Optional[str], optional
        Optional link function (e.g., "logit") for KernelExplainer. Default is None.
    predict_fn : Optional[Callable], optional
        Override prediction function (expects output shape (n, n_classes) or (n,)).
        Default is None.

    Returns
    -------
    Tuple[Any, int]
        Tuple of (explainer, predict_proba_index). predict_proba_index = 1 is
        commonly used for binary classification when the explainer returns
        per-class SHAP values (lists).
    '''

    _require_shap()
    bg = _ensure_2d(background)

    # Allow the user to override the prediction function (useful for custom wrappers)
    if predict_fn is not None:
        fn = predict_fn
    elif hasattr(model, "predict_proba"):
        # Default: per-class probabilities when available
        fn = model.predict_proba
    else:
        fn = model.predict

    # Heuristics for method auto-selection
    is_tree = any(hasattr(model, attr) for attr in ("apply", "tree_", "feature_importances_"))
    is_torch = "torch" in type(model).__module__
    is_tf = any(k in type(model).__module__ for k in ("tensorflow", "keras"))

    # KernelExplainer expects a recognized link name (or Link object); omit the
    # argument when no valid name is provided.
    kernel_kwargs = {"link": link} if link in ("identity", "logit") else {}

    if method == "auto":
        if is_tree and hasattr(shap, "TreeExplainer"):
            explainer = shap.TreeExplainer(model, data=bg)
        elif (is_torch or is_tf) and hasattr(shap, "DeepExplainer"):
            # Note: for torch models, DeepExplainer may expect framework-native tensors
            explainer = shap.DeepExplainer(model, bg)
        else:
            explainer = shap.KernelExplainer(fn, bg, **kernel_kwargs)
        proba_idx = 1
    elif method.lower() == "tree":
        explainer = shap.TreeExplainer(model, data=bg)
        proba_idx = 1
    elif method.lower() == "deep":
        explainer = shap.DeepExplainer(model, bg)
        proba_idx = 1
    elif method.lower() == "kernel":
        explainer = shap.KernelExplainer(fn, bg, **kernel_kwargs)
        proba_idx = 1
    else:
        # User-facing error: unknown method
        ocerror.Error.value_error(f"Unknown method: '{method}'. Must be 'auto', 'tree', 'deep', or 'kernel'.")
        raise ValueError(f"Unknown method: {method}")

    return explainer, proba_idx
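
# Illustrative usage (sketch; `model` is a hypothetical fitted classifier and `bg` is a
# background array from build_stratified_background):
#
#   explainer, class_idx = make_explainer(model, bg, method="auto")
#   # Or force the model-agnostic path with a logit link:
#   # explainer, class_idx = make_explainer(model, bg, method="kernel", link="logit")
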
def shap_importance_table(
    shap_values: np.ndarray,
    feature_names: Optional[Sequence[str]] = None,
    k: Optional[int] = None,
) -> pd.DataFrame:
    '''Compute mean absolute SHAP values per feature and return a ranked table.

    Parameters
    ----------
    shap_values : np.ndarray
        SHAP values array of shape (n_samples, n_features).
    feature_names : Optional[Sequence[str]], optional
        Names of features. If None, generates names like "f0", "f1", etc.
        Default is None.
    k : Optional[int], optional
        If given and positive, keep only the top-k features. Default is None.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: ["feature", "mean_abs_shap", "rank"]
    '''

    S = np.asarray(shap_values, dtype=float)
    mean_abs = np.nanmean(np.abs(S), axis=0)

    if feature_names is None:
        feature_names = [f"f{i}" for i in range(S.shape[1])]

    df = pd.DataFrame({"feature": list(feature_names), "mean_abs_shap": mean_abs})
    df.sort_values("mean_abs_shap", ascending=False, inplace=True)
    df["rank"] = np.arange(1, df.shape[0] + 1)

    if k is not None and k > 0:
        df = df.head(int(k)).reset_index(drop=True)
    else:
        df = df.reset_index(drop=True)

    return df
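
# Illustrative usage (sketch; `out` comes from compute_shap_values and `X_df` is a
# hypothetical feature DataFrame):
#
#   top10 = shap_importance_table(out["shap_values"], feature_names=list(X_df.columns), k=10)
#   # Columns: "feature", "mean_abs_shap", "rank" (1 = most important).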