Source code for OCDocker.OCScore.Utils.Plotting

#!/usr/bin/env python3

# Description
###############################################################################
'''
Set of functions to manage plotting operations in OCDocker in the context of
scoring functions.

Usage:

import OCDocker.OCScore.Utils.Plotting as ocscoreplot
'''

# Imports
###############################################################################

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from scipy.cluster.hierarchy import leaves_list, linkage
from sklearn.metrics import auc, roc_curve
from typing import Union, Optional

# License
###############################################################################
'''
OCDocker
Authors: Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M.
Federal University of Rio de Janeiro
Carlos Chagas Filho Institute of Biophysics
Laboratory for Molecular Modeling and Dynamics

This program is proprietary software owned by the Federal University of Rio de Janeiro (UFRJ),
developed by Rossi, A.D.; Monachesi, M.C.E.; Spelta, G.I.; Torres, P.H.M., and protected under Brazilian Law No. 9,609/1998.
All rights reserved. Use, reproduction, modification, and distribution are allowed under this UFRJ license,
provided this copyright notice is preserved. See the LICENSE file for details.

Contact: Artur Duque Rossi - arturossi10@gmail.com
'''

# Classes
###############################################################################

# Functions
###############################################################################
## Private ##

## Public ##

def _rescale_to_unit_range(matrix : pd.DataFrame) -> pd.DataFrame:
    ''' Min-max scale every value of a matrix into the [-1, 1] interval.

    Parameters
    ----------
    matrix : pd.DataFrame
        The matrix to rescale.

    Returns
    -------
    pd.DataFrame
        The rescaled matrix, or the input unchanged when it has no spread
        (all values equal, or no finite values at all).
    '''

    min_val = matrix.min().min()
    max_val = matrix.max().max()
    span = max_val - min_val

    # A zero (or undefined) span would turn every cell into NaN through a 0/0
    # division, so the matrix is left untouched in that case
    if pd.isna(span) or span == 0:
        return matrix

    return ((matrix - min_val) / span) * 2 - 1


def _save_similarity_heatmap(matrix : pd.DataFrame, title : str, filename : str, annot : bool, fontsize : Optional[float]) -> None:
    ''' Draw a similarity matrix as a heatmap and save it to an image file.

    Parameters
    ----------
    matrix : pd.DataFrame
        The matrix to draw.
    title : str
        Title written above the heatmap.
    filename : str
        Path of the image file to write.
    annot : bool
        If True, write the data value in each cell.
    fontsize : float, optional
        Font size for the cell annotations. Ignored when falsy or when annot is False.
    '''

    fig = plt.figure(figsize = (10, 8))

    try:
        ax = sns.heatmap(matrix, annot = annot, cmap = 'coolwarm', center = 0, vmin = -1, vmax = 1, linewidths = 0.5, fmt = ".2f")
        plt.title(title)

        # Set annotation font size
        if fontsize and annot:
            for text in ax.texts:
                text.set_fontsize(fontsize)

        # Adjusts the plot to ensure everything fits without overlapping
        plt.tight_layout()
        plt.savefig(filename)
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


def plot_correlation_similarity(df1 : pd.DataFrame, df2 : pd.DataFrame, columns : list = [], annot : bool = True, fontsize : Optional[float] = None, normalize : bool = True) -> None:
    ''' Plots the similarity of correlation matrices from two DataFrames.

    Two images are written to the current working directory:
    'correlation_similarity.png' (columns in their original order) and
    'correlation_similarity_sorted.png' (columns reordered by hierarchical
    clustering).

    Parameters
    ----------
    df1 : pd.DataFrame
        The first DataFrame.
    df2 : pd.DataFrame
        The second DataFrame.
    columns : list, optional
        List of columns to compare. If empty, every column present in both DataFrames is used.
    annot : bool, optional
        If True, write the data value in each cell of both heatmaps. If False, don't write the data value.
    fontsize : float, optional
        The size of the font for the data value annotations.
    normalize : bool, optional
        If True, min-max scale the similarity matrix to [-1, 1] after calculating it.
    '''

    # If no columns are specified, use the columns common to both DataFrames.
    # len() is used instead of truthiness so array-like inputs (e.g. a pandas
    # Index) are accepted as well. The default list is never mutated.
    if columns is None or len(columns) == 0:
        columns = df1.columns.intersection(df2.columns)

    # Calculate the correlation matrices over the selected columns
    corr_matrix_df1 = df1[columns].corr()
    corr_matrix_df2 = df2[columns].corr()

    # Calculate the similarity (or difference) matrix
    # This can be customized as needed; here we use simple subtraction
    similarity_matrix = corr_matrix_df1 - corr_matrix_df2

    # Normalize the similarity matrix with min max scaling
    if normalize:
        similarity_matrix = _rescale_to_unit_range(similarity_matrix)

    # Plot the similarity matrix as a heatmap
    _save_similarity_heatmap(similarity_matrix, 'Heatmap of Correlation Matrix Similarity', 'correlation_similarity.png', annot, fontsize)

    ## Reorder for readability

    n_items = similarity_matrix.shape[0]

    if n_items >= 2:
        # Hierarchical clustering rejects non-finite values, so undefined
        # cells (e.g. from constant columns) count as "no difference" for
        # ordering purposes only; the plotted values are not modified
        clustering_input = similarity_matrix.fillna(0).to_numpy()
        linkage_matrix = linkage(clustering_input, method = 'average')
        order = list(leaves_list(linkage_matrix))
    else:
        # Clustering needs at least two observations; keep the original order
        order = list(range(n_items))

    # Reorder the similarity matrix based on the hierarchical clustering
    similarity_matrix = similarity_matrix.iloc[order, order]

    # Plot the reordered similarity matrix as a heatmap, honouring `annot`
    _save_similarity_heatmap(similarity_matrix, 'Reordered Heatmap of Correlation Matrix Similarity', 'correlation_similarity_sorted.png', annot, fontsize)
def plot_roc_curves(df : pd.DataFrame, feature_cols : list, labels : pd.Series, title : str = "ROC") -> None:
    ''' Plots ROC curves for a DataFrame.

    One curve is drawn per feature, listed in the legend by descending AUC,
    and the figure is saved as '<title>.png'.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the features to plot the ROC curves for. It must also contain a 'db' column, whose unique values are used in the plot heading.
    feature_cols : list
        List of feature columns to plot ROC curves for.
    labels : pd.Series
        Series containing the labels for the ROC curves.
    title : str, optional
        Stem of the output image file name ('<title>.png'). It is not used as the plot heading. Default is "ROC".
    '''

    # Get the db values
    db_values = df['db'].unique()

    # Check if there are multiple databases
    if len(db_values) > 1:
        # str() keeps the join working for non-string identifiers
        db = "_".join(map(str, db_values))
    else:
        db = db_values[0]

    # Compute each ROC curve once and keep it alongside its AUC, so the
    # curves do not have to be recomputed when plotting
    roc_results = {}

    for feature in feature_cols:
        fpr, tpr, _ = roc_curve(labels, df[feature])
        roc_results[feature] = (fpr, tpr, auc(fpr, tpr))

    # Sort the features by their AUC in descending order
    sorted_features = sorted(roc_results, key = lambda k: roc_results[k][2], reverse = True)

    # Create the plot
    fig = plt.figure(figsize=(14, 10))

    try:
        # Plot ROC curves for each feature, now sorted by AUC
        for feature in sorted_features:
            fpr, tpr, roc_auc = roc_results[feature]
            plt.plot(fpr, tpr, lw=2, label=f'{feature} (area = {roc_auc:.2f})')

        # Plot the random line
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

        # Set plot parameters
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f"ROC Curves for {db} Dataset Features")

        # Move the legend outside of the plot area
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

        # Adjust layout for tight fit, so the legend fits within the figure
        plt.tight_layout()
        plt.savefig(f'{title}.png')
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)