Model Evaluation Metrics - Python Data Science Path

Measure What Matters! 📊

Choosing the right evaluation metric is as important as choosing the right model. Different metrics tell different stories about your model's performance. From accuracy to AUC-ROC, from RMSE to R², understanding when and how to use each metric ensures you're optimizing for the right business objective and making informed decisions about model deployment.

Metrics Overview

```mermaid
graph TD
    A[Model Evaluation] --> B[Classification Metrics]
    A --> C[Regression Metrics]
    A --> D[Clustering Metrics]
    A --> E[Ranking Metrics]
    B --> F[Accuracy]
    B --> G[Precision/Recall]
    B --> H[F1-Score]
    B --> I[ROC-AUC]
    B --> J[Confusion Matrix]
    C --> K[MSE/RMSE]
    C --> L[MAE]
    C --> M[R²]
    C --> N[MAPE]
    D --> O[Silhouette Score]
    D --> P[Davies-Bouldin]
    D --> Q[Calinski-Harabasz]
    E --> R[NDCG]
    E --> S[MAP]
    E --> T[MRR]
```

Classification Metrics

Binary Classification Metrics

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    roc_auc_score, precision_recall_curve, average_precision_score,
    cohen_kappa_score, matthews_corrcoef, log_loss, brier_score_loss
)
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Synthetic binary problem: class weights 0.9/0.1 and flip_y=0.05
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    weights=[0.9, 0.1],
    flip_y=0.05,
    random_state=42,
)

# Stratified 70/30 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit both classifiers on the training portion (fit() returns the estimator)
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels first, then the positive-class column of predict_proba
y_pred_lr, y_pred_rf = (clf.predict(X_test) for clf in (lr_model, rf_model))
y_proba_lr, y_proba_rf = (
    clf.predict_proba(X_test)[:, 1] for clf in (lr_model, rf_model)
)

# Comprehensive metric calculation
class ClassificationMetrics:
    """Calculate and visualize metrics for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard label predictions.
    y_proba : array-like, optional
        Scores for the positive class. Required for the probabilistic
        metrics and for the ROC / PR / threshold plots.
    model_name : str
        Name used in report headers, plot titles and legends.
    """

    def __init__(self, y_true, y_pred, y_proba=None, model_name="Model"):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_proba = y_proba
        self.model_name = model_name
        self.metrics = {}
        # Filled in by calculate_basic_metrics(); defined here so the
        # attribute always exists and the report/plot helpers can detect
        # that it still has to be computed.
        self.confusion_matrix = None

    def calculate_basic_metrics(self):
        """Compute all metrics, store them in ``self.metrics`` and return it.

        Also stores the confusion matrix in ``self.confusion_matrix``.
        """

        # Confusion matrix
        cm = confusion_matrix(self.y_true, self.y_pred)
        if cm.shape == (1, 1):
            # Only one label occurs in y_true/y_pred: request the full
            # 2x2 layout so the tn/fp/fn/tp unpacking below still works.
            cm = confusion_matrix(self.y_true, self.y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        # Basic metrics (zero_division=0 used consistently so an empty
        # denominator yields 0 for every score)
        self.metrics['Accuracy'] = accuracy_score(self.y_true, self.y_pred)
        self.metrics['Precision'] = precision_score(self.y_true, self.y_pred, zero_division=0)
        self.metrics['Recall'] = recall_score(self.y_true, self.y_pred, zero_division=0)
        self.metrics['F1-Score'] = f1_score(self.y_true, self.y_pred, zero_division=0)

        # Additional metrics derived from the confusion matrix
        self.metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
        self.metrics['NPV'] = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        self.metrics['FPR'] = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        self.metrics['FNR'] = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate

        # Balanced metrics
        self.metrics['Balanced Accuracy'] = (self.metrics['Recall'] + self.metrics['Specificity']) / 2
        self.metrics['MCC'] = matthews_corrcoef(self.y_true, self.y_pred)
        self.metrics['Cohen Kappa'] = cohen_kappa_score(self.y_true, self.y_pred)

        # Probabilistic metrics (if probabilities available)
        if self.y_proba is not None:
            self.metrics['ROC-AUC'] = roc_auc_score(self.y_true, self.y_proba)
            self.metrics['PR-AUC'] = average_precision_score(self.y_true, self.y_proba)
            self.metrics['Log Loss'] = log_loss(self.y_true, self.y_proba)
            self.metrics['Brier Score'] = brier_score_loss(self.y_true, self.y_proba)

        self.confusion_matrix = cm

        return self.metrics

    def print_report(self):
        """Print the sklearn classification report, all metrics and the confusion matrix."""
        # Compute on demand so the report can be printed without a prior
        # explicit call to calculate_basic_metrics().
        if not self.metrics or self.confusion_matrix is None:
            self.calculate_basic_metrics()

        print(f"\n{'='*60}")
        print(f"Classification Report for {self.model_name}")
        print(f"{'='*60}")

        # Standard classification report
        print("\nDetailed Classification Report:")
        print(classification_report(self.y_true, self.y_pred,
                                   target_names=['Class 0', 'Class 1']))

        # Additional metrics
        print("\nAdditional Metrics:")
        for metric, value in self.metrics.items():
            print(f"{metric:20s}: {value:.4f}")

        # Confusion Matrix
        print(f"\nConfusion Matrix:")
        print(self.confusion_matrix)
        print(f"TN: {self.confusion_matrix[0,0]}, FP: {self.confusion_matrix[0,1]}")
        print(f"FN: {self.confusion_matrix[1,0]}, TP: {self.confusion_matrix[1,1]}")

    def plot_confusion_matrix(self, ax=None):
        """Draw the confusion matrix as an annotated heatmap on ``ax``.

        A new figure is created when ``ax`` is None.
        """
        if self.confusion_matrix is None:
            self.calculate_basic_metrics()

        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        sns.heatmap(self.confusion_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {self.model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_xticklabels(['Class 0', 'Class 1'])
        ax.set_yticklabels(['Class 0', 'Class 1'])

        # Add each cell's share of all samples below the raw count
        total = self.confusion_matrix.sum()
        for i in range(2):
            for j in range(2):
                percentage = 100 * self.confusion_matrix[i, j] / total
                ax.text(j + 0.5, i + 0.7, f'{percentage:.1f}%',
                       ha='center', va='center', fontsize=9, style='italic')

    def plot_roc_curve(self, ax=None):
        """Plot the ROC curve; return ``(fpr, tpr, auc_score)``.

        Prints a message and returns None when no probabilities were given.
        """
        if self.y_proba is None:
            print("Probabilities needed for ROC curve")
            return

        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        fpr, tpr, thresholds = roc_curve(self.y_true, self.y_proba)
        auc_score = auc(fpr, tpr)

        ax.plot(fpr, tpr, linewidth=2, label=f'{self.model_name} (AUC = {auc_score:.3f})')
        ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
        ax.fill_between(fpr, tpr, alpha=0.3)
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        ax.legend()
        ax.grid(True, alpha=0.3)

        return fpr, tpr, auc_score

    def plot_precision_recall_curve(self, ax=None):
        """Plot the precision-recall curve; return ``(precision, recall, avg_precision)``.

        Prints a message and returns None when no probabilities were given.
        """
        if self.y_proba is None:
            print("Probabilities needed for PR curve")
            return

        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        precision, recall, thresholds = precision_recall_curve(self.y_true, self.y_proba)
        avg_precision = average_precision_score(self.y_true, self.y_proba)

        ax.plot(recall, precision, linewidth=2,
               label=f'{self.model_name} (AP = {avg_precision:.3f})')
        ax.fill_between(recall, precision, alpha=0.3)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.grid(True, alpha=0.3)

        # Add baseline (positive class prevalence). Skip it when an
        # identical baseline is already on the axes, which happens when
        # several models are drawn on the same plot.
        positive_rate = np.mean(self.y_true)
        baseline_label = f'Baseline ({positive_rate:.3f})'
        if baseline_label not in {line.get_label() for line in ax.get_lines()}:
            ax.axhline(y=positive_rate, color='r', linestyle='--',
                      label=baseline_label)

        # The legend is built from the artists present at call time, so it
        # must come after the baseline has been added.
        ax.legend()

        return precision, recall, avg_precision

    def plot_threshold_analysis(self):
        """Plot precision, recall, F1 and accuracy against the decision threshold.

        Prints a message and returns None when no probabilities were given.
        """
        if self.y_proba is None:
            print("Probabilities needed for threshold analysis")
            return

        scores = np.asarray(self.y_proba)
        thresholds = np.linspace(0, 1, 100)
        metrics_at_threshold = {
            'Precision': [],
            'Recall': [],
            'F1-Score': [],
            'Accuracy': []
        }

        for threshold in thresholds:
            y_pred_threshold = (scores >= threshold).astype(int)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                metrics_at_threshold['Precision'].append(
                    precision_score(self.y_true, y_pred_threshold, zero_division=0))
                metrics_at_threshold['Recall'].append(
                    recall_score(self.y_true, y_pred_threshold, zero_division=0))
                metrics_at_threshold['F1-Score'].append(
                    f1_score(self.y_true, y_pred_threshold, zero_division=0))
                metrics_at_threshold['Accuracy'].append(
                    accuracy_score(self.y_true, y_pred_threshold))

        fig, ax = plt.subplots(figsize=(10, 6))

        for metric, values in metrics_at_threshold.items():
            ax.plot(thresholds, values, label=metric, linewidth=2)

        ax.set_xlabel('Threshold')
        ax.set_ylabel('Metric Value')
        ax.set_title(f'Metrics vs Threshold - {self.model_name}')
        ax.grid(True, alpha=0.3)
        # Draw the default-threshold marker before building the legend so
        # its label is included.
        ax.axvline(x=0.5, color='k', linestyle='--', alpha=0.5, label='Default (0.5)')
        ax.legend()

        plt.tight_layout()
        plt.show()

# One evaluator per model
lr_metrics = ClassificationMetrics(y_test, y_pred_lr, y_proba_lr, "Logistic Regression")
rf_metrics = ClassificationMetrics(y_test, y_pred_rf, y_proba_rf, "Random Forest")

# Compute everything first, then print the text reports
for evaluator in (lr_metrics, rf_metrics):
    evaluator.calculate_basic_metrics()
for evaluator in (lr_metrics, rf_metrics):
    evaluator.print_report()

# 2x4 dashboard: top row holds the diagnostic plots, bottom row the summaries
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
(cm_lr_ax, cm_rf_ax, roc_ax, pr_ax), (bar_ax, dist_ax, imp_ax, score_ax) = axes

# Confusion matrices, one panel per model
lr_metrics.plot_confusion_matrix(cm_lr_ax)
rf_metrics.plot_confusion_matrix(cm_rf_ax)

# ROC curves share one panel
for evaluator in (lr_metrics, rf_metrics):
    evaluator.plot_roc_curve(roc_ax)

# PR curves share one panel
for evaluator in (lr_metrics, rf_metrics):
    evaluator.plot_precision_recall_curve(pr_ax)

# Side-by-side bar chart of the headline metrics
metrics_comparison = pd.DataFrame({
    'Logistic Regression': lr_metrics.metrics,
    'Random Forest': rf_metrics.metrics
}).T

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
metrics_comparison[metrics_to_plot].plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Metrics Comparison')
bar_ax.set_ylabel('Score')
bar_ax.legend(loc='lower right')
bar_ax.grid(True, alpha=0.3)

# Class balance of the test set, annotated with count and share
unique, counts = np.unique(y_test, return_counts=True)
dist_ax.bar(unique, counts, alpha=0.7)
dist_ax.set_title('Test Set Class Distribution')
dist_ax.set_xlabel('Class')
dist_ax.set_ylabel('Count')
dist_ax.set_xticks([0, 1])
for i, count in enumerate(counts):
    dist_ax.text(i, count, f'{count}\n({count/len(y_test):.1%})',
                 ha='center', va='bottom')

# Ten largest Random Forest feature importances, in descending order
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1][:10]
bar_positions = range(10)
imp_ax.barh(bar_positions, importances[indices])
imp_ax.set_yticks(bar_positions)
imp_ax.set_yticklabels([f'Feature {i}' for i in indices])
imp_ax.set_xlabel('Importance')
imp_ax.set_title('Top 10 Feature Importances (RF)')

# Logistic Regression score histograms, split by true class
for class_value, class_label in ((0, 'Class 0'), (1, 'Class 1')):
    score_ax.hist(y_proba_lr[y_test == class_value], bins=30, alpha=0.5,
                  label=class_label, density=True)
score_ax.set_xlabel('Predicted Probability')
score_ax.set_ylabel('Density')
score_ax.set_title('Score Distribution by Class (LR)')
score_ax.legend()
score_ax.axvline(x=0.5, color='k', linestyle='--', alpha=0.5)

plt.suptitle('Binary Classification Metrics Analysis', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# Threshold sweep for Logistic Regression
lr_metrics.plot_threshold_analysis()

Multi-class Classification Metrics

Handling Multiple Classes

from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report,
    cohen_kappa_score, matthews_corrcoef
)
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle

# Synthetic 4-class problem with a single cluster per class
X_multi, y_multi = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=4,
    n_clusters_per_class=1,
    random_state=42,
)

# Display names used in reports and plots
class_names = ['Class A', 'Class B', 'Class C', 'Class D']

# Stratified 70/30 hold-out split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi,
    y_multi,
    test_size=0.3,
    random_state=42,
    stratify=y_multi,
)

# Fit the multi-class forest (fit() returns the estimator)
rf_multi = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_m, y_train_m)

# Hard labels and the per-class probability matrix for the test set
y_pred_m = rf_multi.predict(X_test_m)
y_proba_m = rf_multi.predict_proba(X_test_m)

class MulticlassMetrics:
    """Calculate and plot metrics for multi-class classification.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard label predictions.
    y_proba : array-like, optional
        Per-class scores, indexed as ``y_proba[:, i]`` for class ``i``.
        Required for the ROC plots.
    class_names : sequence of str, optional
        Display names; defaults to ``'Class 0'``, ``'Class 1'``, ...
    """

    def __init__(self, y_true, y_pred, y_proba=None, class_names=None):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_proba = y_proba
        self.n_classes = len(np.unique(y_true))
        # Explicit None/empty check: `class_names or ...` raises for a
        # NumPy array because its truth value is ambiguous.
        if class_names is None or len(class_names) == 0:
            self.class_names = [f'Class {i}' for i in range(self.n_classes)]
        else:
            self.class_names = list(class_names)

    def calculate_metrics(self):
        """Return ``(metrics, per_class_metrics)``.

        ``metrics`` is a dict of overall scores (accuracy, Cohen kappa, MCC
        and micro/macro/weighted precision, recall and F1).
        ``per_class_metrics`` is a DataFrame with one row per class.
        """

        # Overall metrics
        metrics = {
            'Accuracy': accuracy_score(self.y_true, self.y_pred),
            'Cohen Kappa': cohen_kappa_score(self.y_true, self.y_pred),
            'MCC': matthews_corrcoef(self.y_true, self.y_pred)
        }

        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            self.y_true, self.y_pred, average=None, zero_division=0
        )

        # Averaging strategies
        for average in ['micro', 'macro', 'weighted']:
            p, r, f, _ = precision_recall_fscore_support(
                self.y_true, self.y_pred, average=average, zero_division=0
            )
            metrics[f'Precision ({average})'] = p
            metrics[f'Recall ({average})'] = r
            metrics[f'F1-Score ({average})'] = f

        # Per-class breakdown
        per_class_metrics = pd.DataFrame({
            'Class': self.class_names,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'Support': support
        })

        return metrics, per_class_metrics

    def plot_confusion_matrix(self, normalize=False):
        """Plot the confusion matrix and return it.

        With ``normalize=True`` each row is divided by its total, so cells
        show the fraction of that actual class.
        """
        cm = confusion_matrix(self.y_true, self.y_pred)

        if normalize:
            row_totals = cm.sum(axis=1, keepdims=True)
            # Rows with no samples stay at 0 instead of becoming NaN.
            cm = np.divide(cm.astype(float), row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)
            fmt = '.2f'
            title = 'Normalized Confusion Matrix'
        else:
            fmt = 'd'
            title = 'Confusion Matrix'

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', ax=ax,
                   xticklabels=self.class_names, yticklabels=self.class_names)
        ax.set_title(title)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        plt.tight_layout()
        plt.show()

        return cm

    def plot_roc_curves(self):
        """Plot one-vs-rest ROC curves per class plus micro/macro averages.

        Prints a message and returns None when no probabilities were given.
        """
        if self.y_proba is None:
            print("Probabilities needed for ROC curves")
            return

        scores = np.asarray(self.y_proba)

        # Binarize labels for one-vs-rest
        y_true_bin = label_binarize(self.y_true, classes=list(range(self.n_classes)))
        if y_true_bin.shape[1] == 1:
            # label_binarize yields a single column for two classes;
            # expand it so there is one indicator column per class.
            y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

        # Compute each class's curve once and reuse it for both panels.
        class_curves = [
            roc_curve(y_true_bin[:, i], scores[:, i])[:2]
            for i in range(self.n_classes)
        ]

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Individual ROC curves
        for i, (fpr, tpr) in enumerate(class_curves):
            auc_score = auc(fpr, tpr)
            axes[0].plot(fpr, tpr, label=f'{self.class_names[i]} (AUC = {auc_score:.3f})')

        axes[0].plot([0, 1], [0, 1], 'k--', linewidth=1)
        axes[0].set_xlabel('False Positive Rate')
        axes[0].set_ylabel('True Positive Rate')
        axes[0].set_title('ROC Curves (One-vs-Rest)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Micro-average: pool every (sample, class) pair into one curve
        fpr_micro, tpr_micro, _ = roc_curve(y_true_bin.ravel(), scores.ravel())
        auc_micro = auc(fpr_micro, tpr_micro)

        axes[1].plot(fpr_micro, tpr_micro,
                    label=f'Micro-average (AUC = {auc_micro:.3f})', linewidth=2)

        # Macro-average: interpolate every class curve onto the union of
        # FPR points and take the unweighted mean of the TPRs
        all_fpr = np.unique(np.concatenate([fpr for fpr, _ in class_curves]))

        mean_tpr = np.zeros_like(all_fpr)
        for fpr, tpr in class_curves:
            mean_tpr += np.interp(all_fpr, fpr, tpr)

        mean_tpr /= self.n_classes
        auc_macro = auc(all_fpr, mean_tpr)

        axes[1].plot(all_fpr, mean_tpr,
                    label=f'Macro-average (AUC = {auc_macro:.3f})', linewidth=2)

        axes[1].plot([0, 1], [0, 1], 'k--', linewidth=1)
        axes[1].set_xlabel('False Positive Rate')
        axes[1].set_ylabel('True Positive Rate')
        axes[1].set_title('Averaged ROC Curves')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Evaluate the multi-class forest
mc_metrics = MulticlassMetrics(y_test_m, y_pred_m, y_proba_m, class_names)
overall_metrics, per_class_df = mc_metrics.calculate_metrics()

# Header followed by one aligned line per overall metric
print("\nMulti-class Classification Metrics")
print("="*60)
print("\nOverall Metrics:")
print("\n".join(f"{metric:25s}: {value:.4f}" for metric, value in overall_metrics.items()))

# Per-class table
print("\nPer-Class Metrics:")
print(per_class_df.to_string(index=False))

# scikit-learn's own text report
print("\nDetailed Classification Report:")
print(classification_report(y_test_m, y_pred_m, target_names=class_names))

# Raw and row-normalized confusion matrices, then the ROC panels
for normalize in (False, True):
    mc_metrics.plot_confusion_matrix(normalize=normalize)
mc_metrics.plot_roc_curves()

Regression Metrics

Evaluating Continuous Predictions

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, median_absolute_error,
    explained_variance_score, max_error
)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# make_regression is not imported by any of the import blocks above, so it
# is brought into scope here before first use.
from sklearn.datasets import make_regression

# Generate regression data
X_reg, y_reg = make_regression(n_samples=500, n_features=10,
                               n_informative=8, noise=20, random_state=42)

# 70/30 hold-out split
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

# Train regression models
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Keyed by the display name used in plots and result tables
models = {'Linear Regression': lr_reg, 'Ridge': ridge_reg, 'Random Forest': rf_reg}

for name, model in models.items():
    model.fit(X_train_r, y_train_r)

class RegressionMetrics:
    """Comprehensive regression metrics and residual diagnostics.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    model_name : str
        Name used in plot titles.
    n_features : int
        Number of predictors used for adjusted R² (default 10).
    """

    def __init__(self, y_true, y_pred, model_name="Model", n_features=10):
        self.y_true = y_true
        self.y_pred = y_pred
        self.model_name = model_name
        self.n_features = n_features
        # Converted to arrays so plain lists are accepted as well
        self.residuals = np.asarray(y_true) - np.asarray(y_pred)

    def calculate_metrics(self):
        """Return a dict with error, fit-quality and residual summary metrics."""

        mse = mean_squared_error(self.y_true, self.y_pred)
        metrics = {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(self.y_true, self.y_pred),
            'MedAE': median_absolute_error(self.y_true, self.y_pred),
            'MAPE': mean_absolute_percentage_error(self.y_true, self.y_pred) * 100,
            'R²': r2_score(self.y_true, self.y_pred),
            'Adjusted R²': self.adjusted_r2(self.y_true, self.y_pred, self.n_features),
            'Explained Var': explained_variance_score(self.y_true, self.y_pred),
            'Max Error': max_error(self.y_true, self.y_pred)
        }

        # Additional custom metrics
        metrics['Mean Residual'] = np.mean(self.residuals)
        metrics['Std Residual'] = np.std(self.residuals)
        metrics['Min Residual'] = np.min(self.residuals)
        metrics['Max Residual'] = np.max(self.residuals)

        # Fraction of predictions whose absolute error is within each tolerance
        tolerances = [5, 10, 20]
        for tol in tolerances:
            within_tol = np.sum(np.abs(self.residuals) <= tol) / len(self.residuals)
            metrics[f'Within ±{tol}'] = within_tol

        return metrics

    def adjusted_r2(self, y_true, y_pred, n_features=10):
        """Calculate adjusted R² for ``n_features`` predictors.

        Returns NaN when there are too few samples for the correction
        (``n - n_features - 1 <= 0``).
        """
        n = len(y_true)
        dof = n - n_features - 1
        if dof <= 0:
            return float('nan')
        r2 = r2_score(y_true, y_pred)
        adj_r2 = 1 - (1 - r2) * (n - 1) / dof
        return adj_r2

    def _standardized_residuals(self):
        """Return the residuals divided by their standard deviation (zeros if it is 0)."""
        spread = np.std(self.residuals)
        if spread == 0:
            return np.zeros_like(self.residuals, dtype=float)
        return self.residuals / spread

    def plot_predictions(self, ax=None):
        """Scatter predicted against actual values with the identity line and R²."""
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        ax.scatter(self.y_true, self.y_pred, alpha=0.5, s=20)

        # Perfect prediction line
        min_val = min(np.min(self.y_true), np.min(self.y_pred))
        max_val = max(np.max(self.y_true), np.max(self.y_pred))
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction')

        # Add R² annotation
        r2 = r2_score(self.y_true, self.y_pred)
        ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes,
               fontsize=12, verticalalignment='top',
               bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        ax.set_xlabel('Actual Values')
        ax.set_ylabel('Predicted Values')
        ax.set_title(f'Predictions vs Actual - {self.model_name}')
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_residuals(self):
        """Show a 2x2 residual analysis figure.

        Panels: residuals vs predicted, Q-Q plot, residual histogram with a
        normal overlay, and a scale-location plot.
        """
        from scipy import stats

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        pred_sorted = np.sort(np.asarray(self.y_pred))

        # 1. Residuals vs Predicted
        axes[0, 0].scatter(self.y_pred, self.residuals, alpha=0.5, s=20)
        axes[0, 0].axhline(y=0, color='r', linestyle='--')
        axes[0, 0].set_xlabel('Predicted Values')
        axes[0, 0].set_ylabel('Residuals')
        axes[0, 0].set_title('Residuals vs Predicted')
        axes[0, 0].grid(True, alpha=0.3)

        # Add a ±2σ band around zero
        std_residuals = np.std(self.residuals)
        axes[0, 0].fill_between(pred_sorted, -2*std_residuals, 2*std_residuals,
                               alpha=0.2, color='gray', label='±2σ')
        axes[0, 0].legend()

        # 2. Q-Q Plot
        stats.probplot(self.residuals, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot of Residuals')

        # 3. Histogram of Residuals
        axes[1, 0].hist(self.residuals, bins=30, edgecolor='black', alpha=0.7)
        axes[1, 0].axvline(x=0, color='r', linestyle='--')
        axes[1, 0].set_xlabel('Residuals')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Distribution of Residuals')

        # Normal overlay, scaled by n * bin width to match the count axis
        mu, std = self.residuals.mean(), self.residuals.std()
        if std > 0:
            x = np.linspace(self.residuals.min(), self.residuals.max(), 100)
            axes[1, 0].plot(x, stats.norm.pdf(x, mu, std) * len(self.residuals) *
                           (self.residuals.max() - self.residuals.min()) / 30,
                           'r-', lw=2, label='Normal')
            axes[1, 0].legend()

        # 4. Scale-Location Plot: sqrt(|residual / std(residuals)|)
        standardized_residuals = self._standardized_residuals()
        scale = np.sqrt(np.abs(standardized_residuals))
        axes[1, 1].scatter(self.y_pred, scale, alpha=0.5, s=20)
        axes[1, 1].set_xlabel('Predicted Values')
        axes[1, 1].set_ylabel('√|Standardized Residuals|')
        axes[1, 1].set_title('Scale-Location Plot')
        axes[1, 1].grid(True, alpha=0.3)

        # Add a linear trend line
        z = np.polyfit(self.y_pred, scale, 1)
        p = np.poly1d(z)
        axes[1, 1].plot(pred_sorted, p(pred_sorted), "r--", alpha=0.5)

        plt.suptitle(f'Residual Analysis - {self.model_name}', fontsize=14)
        plt.tight_layout()
        plt.show()

# Compare regression models
results = []
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for idx, (name, model) in enumerate(models.items()):
    y_pred = model.predict(X_test_r)

    reg_metrics = RegressionMetrics(y_test_r, y_pred, name)
    metrics = reg_metrics.calculate_metrics()
    metrics['Model'] = name
    results.append(metrics)

    reg_metrics.plot_predictions(axes[idx])

plt.suptitle('Regression Model Comparison', fontsize=14)
plt.tight_layout()
plt.show()

# Display metrics comparison
results_df = pd.DataFrame(results)
cols = ['Model', 'RMSE', 'MAE', 'R²', 'MAPE']
print("\nRegression Metrics Comparison:")
print(results_df[cols].to_string(index=False))

# Detailed residual analysis for best model.
# "Best" is the model with the lowest test RMSE among the results above,
# rather than a hard-coded choice.
best_name = min(results, key=lambda row: row['RMSE'])['Model']
best_model = models[best_name]
y_pred_best = best_model.predict(X_test_r)
best_metrics = RegressionMetrics(y_test_r, y_pred_best, best_name)
best_metrics.plot_residuals()

Clustering Metrics

Evaluating Unsupervised Learning

from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    silhouette_samples, homogeneity_score, completeness_score, v_measure_score,
    adjusted_rand_score, adjusted_mutual_info_score
)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs

# Four 2-D blobs; the generating labels are kept for the external metrics
blob_params = dict(
    n_samples=500,
    centers=4,
    n_features=2,
    center_box=(-10, 10),
    random_state=42,
)
X_cluster, y_true_cluster = make_blobs(**blob_params)

class ClusteringMetrics:
    """Evaluate clustering performance.

    Parameters
    ----------
    X : ndarray
        Feature matrix that was clustered.
    labels_pred : array-like
        Cluster label per sample; ``-1`` is treated as noise.
    labels_true : array-like, optional
        Ground-truth labels, needed only for the external metrics.
    """

    def __init__(self, X, labels_pred, labels_true=None):
        self.X = X
        # Stored as an array so the boolean masks below also work when a
        # plain list of labels is passed.
        self.labels_pred = np.asarray(labels_pred)
        self.labels_true = labels_true
        # Noise (-1) does not count as a cluster
        self.n_clusters = len(set(self.labels_pred)) - (1 if -1 in self.labels_pred else 0)

    def calculate_internal_metrics(self):
        """Calculate internal clustering metrics (no ground truth needed).

        Returns an empty dict (after printing a message) when fewer than
        two clusters are present. Also stores the per-sample silhouette
        values in ``self.silhouette_samples``.
        """

        if self.n_clusters < 2:
            print("Need at least 2 clusters for evaluation")
            return {}

        metrics = {}

        # Filter out noise points for metrics that don't handle them
        mask = self.labels_pred != -1
        X_filtered = self.X[mask]
        labels_filtered = self.labels_pred[mask]

        if len(set(labels_filtered)) >= 2:
            metrics['Silhouette Score'] = silhouette_score(X_filtered, labels_filtered)
            metrics['Calinski-Harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
            metrics['Davies-Bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)

            # Silhouette samples for detailed analysis
            self.silhouette_samples = silhouette_samples(X_filtered, labels_filtered)

        metrics['N Clusters'] = self.n_clusters
        metrics['N Noise Points'] = np.sum(self.labels_pred == -1)

        return metrics

    def calculate_external_metrics(self):
        """Calculate external metrics (ground truth needed).

        Returns an empty dict (after printing a message) when no
        ground-truth labels were supplied.
        """

        if self.labels_true is None:
            print("Ground truth labels needed for external metrics")
            return {}

        metrics = {
            'Homogeneity': homogeneity_score(self.labels_true, self.labels_pred),
            'Completeness': completeness_score(self.labels_true, self.labels_pred),
            'V-Measure': v_measure_score(self.labels_true, self.labels_pred),
            'Adjusted Rand Index': adjusted_rand_score(self.labels_true, self.labels_pred),
            'Adjusted MI': adjusted_mutual_info_score(self.labels_true, self.labels_pred)
        }

        return metrics

    def plot_clusters(self, ax=None):
        """Scatter-plot the clusters (2-D data only); noise is drawn as black crosses."""
        if self.X.shape[1] != 2:
            print("Plotting only works for 2D data")
            return

        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        # Sorted so colours and legend entries come out in a stable order
        unique_labels = sorted(set(self.labels_pred))
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black for noise
                col = 'black'
                marker = 'x'
            else:
                marker = 'o'

            class_mask = self.labels_pred == k
            ax.scatter(self.X[class_mask, 0], self.X[class_mask, 1],
                      c=[col], marker=marker, s=50, alpha=0.6,
                      label=f'Cluster {k}' if k != -1 else 'Noise')

        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.set_title('Clustering Results')
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_silhouette_analysis(self):
        """Show the silhouette plot next to the cluster scatter plot.

        Requires ``calculate_internal_metrics()`` to have stored the
        per-sample silhouette values; otherwise prints a message and
        returns None.
        """
        if not hasattr(self, 'silhouette_samples'):
            print("Run calculate_internal_metrics first")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Silhouette plot
        y_lower = 10

        mask = self.labels_pred != -1
        labels_filtered = self.labels_pred[mask]

        # Iterate over the labels that actually occur rather than
        # range(n_clusters), so non-contiguous cluster ids are not dropped.
        cluster_ids = sorted(set(labels_filtered))

        for position, cluster_id in enumerate(cluster_ids):
            cluster_silhouette_values = np.sort(
                self.silhouette_samples[labels_filtered == cluster_id])

            size_cluster_i = cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = plt.cm.Spectral(float(position) / len(cluster_ids))
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                             0, cluster_silhouette_values,
                             facecolor=color, edgecolor=color, alpha=0.7)

            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(cluster_id))
            # Leave a 10-row gap between consecutive clusters
            y_lower = y_upper + 10

        ax1.set_xlabel("Silhouette Coefficient")
        ax1.set_ylabel("Cluster Label")
        ax1.set_title("Silhouette Plot")

        # Add average silhouette score line
        avg_score = np.mean(self.silhouette_samples)
        ax1.axvline(x=avg_score, color="red", linestyle="--",
                   label=f'Average ({avg_score:.3f})')
        ax1.legend()

        # Cluster visualization
        self.plot_clusters(ax2)

        plt.tight_layout()
        plt.show()

# Candidate algorithms, keyed by the name shown in plots and tables
clustering_algorithms = {
    'K-Means (k=4)': KMeans(n_clusters=4, random_state=42),
    'K-Means (k=3)': KMeans(n_clusters=3, random_state=42),
    'K-Means (k=5)': KMeans(n_clusters=5, random_state=42),
    'DBSCAN': DBSCAN(eps=1.5, min_samples=5),
    'Agglomerative': AgglomerativeClustering(n_clusters=4)
}

results = []
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, name in enumerate(clustering_algorithms):
    algorithm = clustering_algorithms[name]

    # Cluster the data and score the labelling
    labels_pred = algorithm.fit_predict(X_cluster)
    cm = ClusteringMetrics(X_cluster, labels_pred, y_true_cluster)
    internal_metrics = cm.calculate_internal_metrics()
    external_metrics = cm.calculate_external_metrics()

    # One row per algorithm: name, then internal, then external metrics
    result = {'Algorithm': name, **internal_metrics, **external_metrics}
    results.append(result)

    # Draw this algorithm's clusters in its own panel
    panel = axes[idx]
    cm.plot_clusters(panel)
    panel.set_title(name)

# Five algorithms on a 2x3 grid: drop the unused last panel
fig.delaxes(axes[-1])

plt.suptitle('Clustering Algorithm Comparison', fontsize=14)
plt.tight_layout()
plt.show()

# Tabulate the collected metrics
results_df = pd.DataFrame(results)
print("\nClustering Metrics Comparison:")
print("="*80)

internal_cols = ['Algorithm', 'N Clusters', 'Silhouette Score',
                 'Calinski-Harabasz', 'Davies-Bouldin']
external_cols = ['Algorithm', 'V-Measure', 'Adjusted Rand Index', 'Adjusted MI']

print("\nInternal Metrics (no ground truth needed):")
print(results_df[internal_cols].to_string(index=False))

print("\nExternal Metrics (with ground truth):")
print(results_df[external_cols].to_string(index=False))

print("\nMetric Interpretation:")
for note in (
    "- Silhouette Score: Higher is better (-1 to 1)",
    "- Calinski-Harabasz: Higher is better",
    "- Davies-Bouldin: Lower is better",
    "- V-Measure: Higher is better (0 to 1)",
    "- Adjusted Rand Index: Higher is better (-1 to 1)",
):
    print(note)

# Silhouette deep-dive for a freshly fitted K-Means (k=4)
kmeans_4 = KMeans(n_clusters=4, random_state=42)
labels_kmeans = kmeans_4.fit_predict(X_cluster)
cm_kmeans = ClusteringMetrics(X_cluster, labels_kmeans)
cm_kmeans.calculate_internal_metrics()
cm_kmeans.plot_silhouette_analysis()

Choosing the Right Metric

# Metric selection guide

class MetricSelectionGuide:
    """Lookup tables and a printable cheat sheet for picking evaluation metrics."""

    @staticmethod
    def classification_guide():
        """Return suggested metrics for common classification scenarios.

        The result maps a scenario name to a dict with the keys 'primary'
        (str), 'secondary' (list of str) and 'reason' (str). A new dict is
        built on every call.
        """
        # (scenario, primary metric, secondary metrics, reason)
        rows = (
            ('Balanced Classes', 'Accuracy',
             ['Precision', 'Recall', 'F1-Score'],
             'All classes equally important'),
            ('Imbalanced Classes', 'F1-Score (weighted)',
             ['Precision', 'Recall', 'ROC-AUC', 'PR-AUC'],
             'Accuracy can be misleading with imbalance'),
            ('Cost-Sensitive (FP costly)', 'Precision',
             ['Specificity', 'F0.5-Score'],
             'Minimize false positives'),
            ('Cost-Sensitive (FN costly)', 'Recall',
             ['F2-Score', 'TPR'],
             'Minimize false negatives'),
            ('Ranking/Probability', 'ROC-AUC',
             ['PR-AUC', 'Log Loss', 'Brier Score'],
             'Focus on probability calibration'),
            ('Multi-class', 'Macro F1-Score',
             ['Cohen Kappa', 'MCC', 'Weighted F1'],
             'Consider all classes fairly'),
        )
        return {
            scenario: {'primary': primary, 'secondary': secondary, 'reason': reason}
            for scenario, primary, secondary, reason in rows
        }

    @staticmethod
    def regression_guide():
        """Return suggested metrics for common regression scenarios.

        Same layout as the classification guide: scenario name mapped to a
        dict with 'primary', 'secondary' and 'reason'. A new dict is built on
        every call.
        """
        # (scenario, primary metric, secondary metrics, reason)
        rows = (
            ('General Purpose', 'RMSE',
             ['MAE', 'R²'],
             'Standard metrics, penalizes large errors'),
            ('Outlier Sensitive', 'MAE',
             ['Median AE', 'Quantile Loss'],
             'Less sensitive to outliers than RMSE'),
            ('Relative Error Important', 'MAPE',
             ['SMAPE', 'RMSPE'],
             'Error as percentage of actual value'),
            ('Variance Explanation', 'R²',
             ['Adjusted R²', 'Explained Variance'],
             'Proportion of variance explained'),
            ('Business Metrics', 'Custom Loss',
             ['Quantile Loss', 'Huber Loss'],
             'Align with business objectives'),
        )
        return {
            scenario: {'primary': primary, 'secondary': secondary, 'reason': reason}
            for scenario, primary, secondary, reason in rows
        }

    @staticmethod
    def print_decision_tree():
        """Print a text decision tree for metric selection to stdout."""
        rule = "=" * 80

        # Each section is a numbered heading followed by its tree branches.
        sections = (
            ("1. PROBLEM TYPE?", (
                "   ├── Classification → Go to 2",
                "   ├── Regression → Go to 3",
                "   └── Clustering → Go to 4",
            )),
            ("2. CLASSIFICATION:", (
                "   ├── Binary?",
                "   │   ├── Balanced? → Accuracy, F1-Score",
                "   │   ├── Imbalanced? → F1-Score, PR-AUC",
                "   │   └── Need probabilities? → ROC-AUC, Brier Score",
                "   └── Multi-class?",
                "       ├── Balanced? → Accuracy, Macro F1",
                "       └── Imbalanced? → Weighted F1, Cohen Kappa",
            )),
            ("3. REGRESSION:", (
                "   ├── Outliers present? → MAE, Median AE",
                "   ├── Need interpretability? → R², RMSE",
                "   └── Business constraints? → Custom metric",
            )),
            ("4. CLUSTERING:", (
                "   ├── Have ground truth? → ARI, V-Measure",
                "   └── No ground truth? → Silhouette, Calinski-Harabasz",
            )),
        )

        # Banner
        print("\n" + rule)
        print("METRIC SELECTION DECISION TREE")
        print(rule)

        # Body: a blank line precedes every section heading
        for heading, branches in sections:
            print("\n" + heading)
            for branch in branches:
                print(branch)

        # Closing rule
        print("\n" + rule)

# Create comprehensive metric comparison
def compare_metrics_impact():
    """Show how different metrics lead to different model selection.

    Generates a synthetic binary problem with class weights [0.95, 0.05],
    fits two dummy baselines and three real classifiers on a stratified
    70/30 split, prints a table of Accuracy, Precision, Recall, F1-Score and
    ROC-AUC on the test split, and draws a grouped bar chart of the four
    threshold-based metrics.

    Every estimator and data step that accepts a seed is seeded, so repeated
    runs print the same table.

    Returns:
        None. Results are written to stdout and shown as a matplotlib figure.
    """
    # Local imports: these estimators are only needed by this demo.
    from sklearn.dummy import DummyClassifier
    from sklearn.tree import DecisionTreeClassifier

    # Generate imbalanced data
    X_comp, y_comp = make_classification(n_samples=1000, n_features=20,
                                         n_classes=2, weights=[0.95, 0.05],
                                         random_state=42)

    X_train_comp, X_test_comp, y_train_comp, y_test_comp = train_test_split(
        X_comp, y_comp, test_size=0.3, random_state=42, stratify=y_comp
    )

    # Train different models. The stratified dummy draws random labels, so it
    # is seeded like every other model to keep the comparison reproducible.
    models = {
        'Always Predict Majority': DummyClassifier(strategy='most_frequent'),
        'Random (Stratified)': DummyClassifier(strategy='stratified', random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
    }

    results = []

    for name, model in models.items():
        model.fit(X_train_comp, y_train_comp)
        y_pred = model.predict(X_test_comp)

        # Calculate various metrics. zero_division=0 is passed consistently so
        # a model that never predicts the positive class scores 0 rather than
        # triggering an undefined-metric warning.
        result = {
            'Model': name,
            'Accuracy': accuracy_score(y_test_comp, y_pred),
            'Precision': precision_score(y_test_comp, y_pred, zero_division=0),
            'Recall': recall_score(y_test_comp, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test_comp, y_pred, zero_division=0),
        }

        # Add ROC-AUC if model supports probabilities
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test_comp)[:, 1]
            result['ROC-AUC'] = roc_auc_score(y_test_comp, y_proba)
        else:
            result['ROC-AUC'] = np.nan

        results.append(result)

    results_df = pd.DataFrame(results)

    print("\nModel Comparison with Different Metrics (Imbalanced Data)")
    print("="*80)
    print(f"Class Distribution - Class 0: {(y_test_comp==0).mean():.1%}, "
          f"Class 1: {(y_test_comp==1).mean():.1%}")
    # Concatenate instead of passing two print arguments: the default
    # separator would add a space that shifts the first table line.
    print("\n" + results_df.to_string(index=False))

    print("\n⚠️ Notice how 'Always Predict Majority' has high accuracy but zero recall!")
    print("This demonstrates why accuracy alone is misleading for imbalanced data.")

    # Visualize metric differences
    fig, ax = plt.subplots(figsize=(12, 6))

    x = np.arange(len(results_df))
    width = 0.15

    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    colors = ['blue', 'green', 'red', 'purple']

    # One bar per metric, offset sideways within each model's group
    for i, (metric, color) in enumerate(zip(metrics, colors)):
        ax.bar(x + i * width, results_df[metric], width, label=metric, color=color, alpha=0.7)

    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.set_title('Impact of Metric Choice on Model Ranking')
    # Centre the tick under the four-bar group (offsets 0..3 -> midpoint 1.5)
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Print both guides with a shared layout, then the decision tree and the
# model comparison demo.
guide = MetricSelectionGuide()

for banner, build_guide in (
    ("\nCLASSIFICATION METRICS GUIDE", guide.classification_guide),
    ("\n\nREGRESSION METRICS GUIDE", guide.regression_guide),
):
    print(banner)
    print("="*60)
    # One short paragraph per scenario: primary metric, alternatives, rationale
    for scenario, details in build_guide().items():
        print(f"\n{scenario}:")
        print(f"  Primary Metric: {details['primary']}")
        print(f"  Secondary: {', '.join(details['secondary'])}")
        print(f"  Reason: {details['reason']}")

guide.print_decision_tree()
compare_metrics_impact()

Practice Exercises

Exercise 1: Custom Business Metric

Create a custom evaluation metric that:

Weights false positives and false negatives differently based on business cost
Incorporates confidence thresholds
Handles multi-class scenarios
Provides interpretable output
Can be used in cross-validation

Exercise 2: Metric Monitoring System

Build a monitoring system that:

Tracks multiple metrics over time
Detects metric degradation
Compares training vs validation metrics
Identifies overfitting patterns
Generates automated reports

Exercise 3: Metric Selection Tool

Develop an automated tool that:

Analyzes the dataset characteristics
Recommends appropriate metrics
Calculates all relevant metrics
Provides a visualization dashboard
Exports comparison reports

Key Takeaways

📊 Different metrics tell different stories about model performance
⚖️ Choose metrics that align with business objectives
🎯 Accuracy is often misleading for imbalanced datasets
📈 ROC-AUC evaluates ranking ability across all thresholds, not performance at a single classification threshold
🔍 Precision focuses on false positives, Recall on false negatives
📉 RMSE penalizes large errors more than MAE
🔄 Always use multiple metrics for comprehensive evaluation
⚠️ Consider the cost of different types of errors
📋 Document metric choices and reasoning