Model Evaluation Metrics
Measure What Matters! 📊
Choosing the right evaluation metric is as important as choosing the right model. Different metrics tell different stories about your model's performance. From accuracy to AUC-ROC, from RMSE to R², understanding when and how to use each metric ensures you're optimizing for the right business objective and making informed decisions about model deployment.
Metrics Overview
graph TD
%% Overview of the metric families covered in this guide, grouped by task type.
%% Root node fans out to the four task families.
A[Model Evaluation] --> B[Classification Metrics]
A --> C[Regression Metrics]
A --> D[Clustering Metrics]
A --> E[Ranking Metrics]
%% Classification metrics
B --> F[Accuracy]
B --> G[Precision/Recall]
B --> H[F1-Score]
B --> I[ROC-AUC]
B --> J[Confusion Matrix]
%% Regression metrics
C --> K[MSE/RMSE]
C --> L[MAE]
C --> M[R²]
C --> N[MAPE]
%% Clustering metrics
D --> O[Silhouette Score]
D --> P[Davies-Bouldin]
D --> Q[Calinski-Harabasz]
%% Ranking metrics
E --> R[NDCG]
E --> S[MAP]
E --> T[MRR]
Classification Metrics
Binary Classification Metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_curve, auc,
roc_auc_score, precision_recall_curve, average_precision_score,
cohen_kappa_score, matthews_corrcoef, log_loss, brier_score_loss
)
import warnings
# Suppress all warnings for the rest of the session (keeps tutorial output clean).
warnings.filterwarnings('ignore')

# Plot appearance
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Synthetic binary problem with a 90/10 class weighting and 5% label noise
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    weights=[0.9, 0.1],
    flip_y=0.05,
    random_state=42,
)

# Stratified 70/30 split so both partitions keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit the two classifiers being compared (fit() returns the estimator itself)
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard predictions, then positive-class probabilities (column 1), for each model
y_pred_lr, y_pred_rf = (clf.predict(X_test) for clf in (lr_model, rf_model))
y_proba_lr, y_proba_rf = (
    clf.predict_proba(X_test)[:, 1] for clf in (lr_model, rf_model)
)
# Comprehensive metric calculation
class ClassificationMetrics:
    """Calculate and visualize binary classification metrics.

    Labels are treated as 0 (negative) and 1 (positive); the confusion
    matrix is always computed over exactly these two labels so that it is
    2x2 even when one class is absent from the data.

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_pred: Hard predictions (array-like of 0/1).
        y_proba: Optional positive-class scores. Required for the
            probabilistic metrics and for the ROC / PR / threshold plots.
        model_name: Name used in report headers, plot titles and legends.
    """

    _LABELS = [0, 1]
    _CLASS_NAMES = ['Class 0', 'Class 1']

    def __init__(self, y_true, y_pred, y_proba=None, model_name="Model"):
        # Arrays (not lists) are needed for `.sum()` and `>= threshold` below.
        self.y_true = np.asarray(y_true)
        self.y_pred = np.asarray(y_pred)
        self.y_proba = None if y_proba is None else np.asarray(y_proba)
        self.model_name = model_name
        self.metrics = {}
        # Filled lazily so reporting/plotting work without a prior
        # calculate_basic_metrics() call.
        self.confusion_matrix = None

    def _get_confusion_matrix(self):
        """Return the 2x2 confusion matrix, computing it on first use."""
        if self.confusion_matrix is None:
            self.confusion_matrix = confusion_matrix(
                self.y_true, self.y_pred, labels=self._LABELS)
        return self.confusion_matrix

    @staticmethod
    def _has_legend_label(ax, label):
        """Return True if an artist with this legend label is already on ax."""
        return label in ax.get_legend_handles_labels()[1]

    def calculate_basic_metrics(self):
        """Calculate basic classification metrics.

        Returns:
            dict: Metric name -> value. The same dict is stored on
            ``self.metrics``; the confusion matrix is stored on
            ``self.confusion_matrix``.
        """
        # labels=[0, 1] guarantees four cells to unpack even with one class.
        cm = confusion_matrix(self.y_true, self.y_pred, labels=self._LABELS)
        tn, fp, fn, tp = cm.ravel()

        # Basic metrics (zero_division=0: undefined ratios are reported as 0)
        self.metrics['Accuracy'] = accuracy_score(self.y_true, self.y_pred)
        self.metrics['Precision'] = precision_score(self.y_true, self.y_pred, zero_division=0)
        self.metrics['Recall'] = recall_score(self.y_true, self.y_pred, zero_division=0)
        self.metrics['F1-Score'] = f1_score(self.y_true, self.y_pred, zero_division=0)

        # Additional metrics derived from the confusion-matrix cells
        self.metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
        self.metrics['NPV'] = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        self.metrics['FPR'] = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        self.metrics['FNR'] = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate

        # Balanced metrics
        self.metrics['Balanced Accuracy'] = (
            self.metrics['Recall'] + self.metrics['Specificity']) / 2
        self.metrics['MCC'] = matthews_corrcoef(self.y_true, self.y_pred)
        self.metrics['Cohen Kappa'] = cohen_kappa_score(self.y_true, self.y_pred)

        # Probabilistic metrics (if probabilities available)
        if self.y_proba is not None:
            # ROC-AUC is undefined unless both classes occur in y_true.
            both_classes = np.unique(self.y_true).size == 2
            self.metrics['ROC-AUC'] = (
                roc_auc_score(self.y_true, self.y_proba) if both_classes else float('nan'))
            self.metrics['PR-AUC'] = average_precision_score(self.y_true, self.y_proba)
            self.metrics['Log Loss'] = log_loss(self.y_true, self.y_proba, labels=self._LABELS)
            self.metrics['Brier Score'] = brier_score_loss(self.y_true, self.y_proba)

        self.confusion_matrix = cm
        return self.metrics

    def print_report(self):
        """Print comprehensive classification report.

        Computes the metrics first if calculate_basic_metrics() has not
        been called yet.
        """
        if not self.metrics:
            self.calculate_basic_metrics()
        cm = self._get_confusion_matrix()

        print(f"\n{'='*60}")
        print(f"Classification Report for {self.model_name}")
        print(f"{'='*60}")

        # Standard classification report
        print("\nDetailed Classification Report:")
        print(classification_report(self.y_true, self.y_pred,
                                    labels=self._LABELS,
                                    target_names=self._CLASS_NAMES))

        # Additional metrics
        print("\nAdditional Metrics:")
        for metric, value in self.metrics.items():
            print(f"{metric:20s}: {value:.4f}")

        # Confusion Matrix
        print("\nConfusion Matrix:")
        print(cm)
        print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
        print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

    def plot_confusion_matrix(self, ax=None):
        """Plot the confusion matrix as an annotated heatmap.

        Args:
            ax: Matplotlib axes to draw on; a new figure is created if None.
        """
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))
        cm = self._get_confusion_matrix()

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {self.model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_xticklabels(self._CLASS_NAMES)
        ax.set_yticklabels(self._CLASS_NAMES)

        # Add each cell's share of all samples just below its count
        total = cm.sum()
        if total:
            for (i, j), count in np.ndenumerate(cm):
                percentage = 100 * count / total
                ax.text(j + 0.5, i + 0.7, f'{percentage:.1f}%',
                        ha='center', va='center', fontsize=9, style='italic')

    def plot_roc_curve(self, ax=None):
        """Plot the ROC curve.

        Args:
            ax: Matplotlib axes to draw on; a new figure is created if None.

        Returns:
            tuple: ``(fpr, tpr, auc_score)``, or None when no probabilities
            were supplied.
        """
        if self.y_proba is None:
            print("Probabilities needed for ROC curve")
            return
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        fpr, tpr, thresholds = roc_curve(self.y_true, self.y_proba)
        auc_score = auc(fpr, tpr)

        ax.plot(fpr, tpr, linewidth=2, label=f'{self.model_name} (AUC = {auc_score:.3f})')
        # Draw the chance diagonal once, even when several models share ax.
        if not self._has_legend_label(ax, 'Random Classifier'):
            ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
        ax.fill_between(fpr, tpr, alpha=0.3)
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        ax.legend()
        ax.grid(True, alpha=0.3)
        return fpr, tpr, auc_score

    def plot_precision_recall_curve(self, ax=None):
        """Plot the Precision-Recall curve with the prevalence baseline.

        Args:
            ax: Matplotlib axes to draw on; a new figure is created if None.

        Returns:
            tuple: ``(precision, recall, avg_precision)``, or None when no
            probabilities were supplied.
        """
        if self.y_proba is None:
            print("Probabilities needed for PR curve")
            return
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        precision, recall, thresholds = precision_recall_curve(self.y_true, self.y_proba)
        avg_precision = average_precision_score(self.y_true, self.y_proba)

        ax.plot(recall, precision, linewidth=2,
                label=f'{self.model_name} (AP = {avg_precision:.3f})')
        ax.fill_between(recall, precision, alpha=0.3)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.grid(True, alpha=0.3)

        # Add baseline (positive class prevalence); drawn once per axes
        positive_rate = self.y_true.sum() / len(self.y_true)
        baseline_label = f'Baseline ({positive_rate:.3f})'
        if not self._has_legend_label(ax, baseline_label):
            ax.axhline(y=positive_rate, color='r', linestyle='--', label=baseline_label)

        # Legend is built last so that it includes the baseline entry.
        ax.legend()
        return precision, recall, avg_precision

    def plot_threshold_analysis(self):
        """Plot precision, recall, F1 and accuracy across decision thresholds.

        Evaluates 100 evenly spaced thresholds in [0, 1]; a sample is
        predicted positive when its score is >= the threshold.
        """
        if self.y_proba is None:
            print("Probabilities needed for threshold analysis")
            return

        thresholds = np.linspace(0, 1, 100)
        metrics_at_threshold = {
            'Precision': [],
            'Recall': [],
            'F1-Score': [],
            'Accuracy': []
        }

        for threshold in thresholds:
            y_pred_threshold = (self.y_proba >= threshold).astype(int)
            # zero_division=0 covers thresholds with no positive predictions.
            metrics_at_threshold['Precision'].append(
                precision_score(self.y_true, y_pred_threshold, zero_division=0))
            metrics_at_threshold['Recall'].append(
                recall_score(self.y_true, y_pred_threshold, zero_division=0))
            metrics_at_threshold['F1-Score'].append(
                f1_score(self.y_true, y_pred_threshold, zero_division=0))
            metrics_at_threshold['Accuracy'].append(
                accuracy_score(self.y_true, y_pred_threshold))

        fig, ax = plt.subplots(figsize=(10, 6))
        for metric, values in metrics_at_threshold.items():
            ax.plot(thresholds, values, label=metric, linewidth=2)
        # Marker is added before legend() so its label is listed too.
        ax.axvline(x=0.5, color='k', linestyle='--', alpha=0.5, label='Default (0.5)')

        ax.set_xlabel('Threshold')
        ax.set_ylabel('Metric Value')
        ax.set_title(f'Metrics vs Threshold - {self.model_name}')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
# One evaluator per fitted model
lr_metrics = ClassificationMetrics(y_test, y_pred_lr, y_proba_lr, "Logistic Regression")
rf_metrics = ClassificationMetrics(y_test, y_pred_rf, y_proba_rf, "Random Forest")
evaluators = (lr_metrics, rf_metrics)

# Compute everything first, then print both reports
for evaluator in evaluators:
    evaluator.calculate_basic_metrics()
for evaluator in evaluators:
    evaluator.print_report()

# 2x4 dashboard: top row compares the models, bottom row adds context
fig, axes = plt.subplots(2, 4, figsize=(16, 10))

# Top row, columns 0-1: one confusion matrix per model
for column, evaluator in enumerate(evaluators):
    evaluator.plot_confusion_matrix(axes[0, column])

# Top row, column 2: both ROC curves overlaid on the same axes
for evaluator in evaluators:
    evaluator.plot_roc_curve(axes[0, 2])

# Top row, column 3: both PR curves overlaid on the same axes
for evaluator in evaluators:
    evaluator.plot_precision_recall_curve(axes[0, 3])

# Bottom row, column 0: headline metrics side by side (rows = models)
metrics_comparison = pd.DataFrame({
    'Logistic Regression': lr_metrics.metrics,
    'Random Forest': rf_metrics.metrics
}).T
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
comparison_ax = axes[1, 0]
metrics_comparison[metrics_to_plot].plot(kind='bar', ax=comparison_ax)
comparison_ax.set_title('Metrics Comparison')
comparison_ax.set_ylabel('Score')
comparison_ax.legend(loc='lower right')
comparison_ax.grid(True, alpha=0.3)

# Bottom row, column 1: class balance of the test set, annotated with shares
unique, counts = np.unique(y_test, return_counts=True)
distribution_ax = axes[1, 1]
distribution_ax.bar(unique, counts, alpha=0.7)
distribution_ax.set_title('Test Set Class Distribution')
distribution_ax.set_xlabel('Class')
distribution_ax.set_ylabel('Count')
distribution_ax.set_xticks([0, 1])
n_test = len(y_test)
for position, n_in_class in enumerate(counts):
    distribution_ax.text(position, n_in_class,
                         f'{n_in_class}\n({n_in_class/n_test:.1%})',
                         ha='center', va='bottom')

# Bottom row, column 2: ten largest Random Forest feature importances
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1][:10]
importance_ax = axes[1, 2]
bar_positions = range(10)
importance_ax.barh(bar_positions, importances[indices])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels([f'Feature {feature}' for feature in indices])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 10 Feature Importances (RF)')

# Bottom row, column 3: LR score histograms per true class, with the 0.5 cut-off
score_ax = axes[1, 3]
for class_label in (0, 1):
    score_ax.hist(y_proba_lr[y_test == class_label], bins=30, alpha=0.5,
                  label=f'Class {class_label}', density=True)
score_ax.set_xlabel('Predicted Probability')
score_ax.set_ylabel('Density')
score_ax.set_title('Score Distribution by Class (LR)')
score_ax.legend()
score_ax.axvline(x=0.5, color='k', linestyle='--', alpha=0.5)

plt.suptitle('Binary Classification Metrics Analysis', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# Threshold sweep for the Logistic Regression scores
lr_metrics.plot_threshold_analysis()
Multi-class Classification Metrics
Handling Multiple Classes
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
confusion_matrix, classification_report,
cohen_kappa_score, matthews_corrcoef
)
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle
# Synthetic four-class problem
X_multi, y_multi = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=4,
    n_clusters_per_class=1,
    random_state=42,
)

# Human-readable class names used in reports and plots
class_names = [f'Class {letter}' for letter in 'ABCD']

# Stratified 70/30 split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.3, stratify=y_multi, random_state=42
)

# Fit the multi-class model, then collect hard predictions and
# the per-class probability matrix for the test set
rf_multi = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_m, y_train_m
)
y_pred_m = rf_multi.predict(X_test_m)
y_proba_m = rf_multi.predict_proba(X_test_m)
class MulticlassMetrics:
    """Calculate metrics for multi-class classification.

    The set of classes is taken from the distinct values in ``y_true``
    (sorted). Per-class outputs and the confusion matrix are always laid
    out over exactly those classes, so they line up with ``class_names``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard predictions.
        y_proba: Optional ``(n_samples, n_classes)`` score matrix whose
            columns follow the sorted class order; needed for ROC curves.
        class_names: Optional display names, one per class. Defaults to
            ``'Class 0'``, ``'Class 1'``, ...
    """

    def __init__(self, y_true, y_pred, y_proba=None, class_names=None):
        self.y_true = np.asarray(y_true)
        self.y_pred = np.asarray(y_pred)
        self.y_proba = None if y_proba is None else np.asarray(y_proba)
        self.labels = np.unique(self.y_true)
        self.n_classes = len(self.labels)
        # Explicit None/empty test: `class_names or ...` raises on arrays.
        if class_names is None or len(class_names) == 0:
            self.class_names = [f'Class {i}' for i in range(self.n_classes)]
        else:
            self.class_names = list(class_names)

    def calculate_metrics(self):
        """Calculate multi-class metrics.

        Returns:
            tuple: ``(metrics, per_class_metrics)`` where ``metrics`` is a
            dict of overall scores (accuracy, Cohen kappa, MCC, and
            micro/macro/weighted precision, recall and F1) and
            ``per_class_metrics`` is a DataFrame with one row per class.
        """
        # Overall metrics
        metrics = {
            'Accuracy': accuracy_score(self.y_true, self.y_pred),
            'Cohen Kappa': cohen_kappa_score(self.y_true, self.y_pred),
            'MCC': matthews_corrcoef(self.y_true, self.y_pred)
        }

        # Per-class metrics, restricted to the classes seen in y_true so the
        # arrays always have n_classes entries.
        precision, recall, f1, support = precision_recall_fscore_support(
            self.y_true, self.y_pred, labels=self.labels,
            average=None, zero_division=0
        )

        # Averaging strategies
        for average in ['micro', 'macro', 'weighted']:
            p, r, f, _ = precision_recall_fscore_support(
                self.y_true, self.y_pred, average=average, zero_division=0
            )
            metrics[f'Precision ({average})'] = p
            metrics[f'Recall ({average})'] = r
            metrics[f'F1-Score ({average})'] = f

        # Per-class breakdown
        per_class_metrics = pd.DataFrame({
            'Class': self.class_names,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'Support': support
        })
        return metrics, per_class_metrics

    def plot_confusion_matrix(self, normalize=False):
        """Plot confusion matrix for multi-class.

        Args:
            normalize: If True, each row is divided by its total so cells
                show the fraction of that true class.

        Returns:
            numpy.ndarray: The (optionally normalized) matrix that was drawn.
        """
        cm = confusion_matrix(self.y_true, self.y_pred, labels=self.labels)
        if normalize:
            row_totals = cm.sum(axis=1, keepdims=True)
            # Rows with no samples stay at 0 instead of becoming NaN.
            cm = np.divide(cm.astype(float), row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)
            fmt = '.2f'
            title = 'Normalized Confusion Matrix'
        else:
            fmt = 'd'
            title = 'Confusion Matrix'

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', ax=ax,
                    xticklabels=self.class_names, yticklabels=self.class_names)
        ax.set_title(title)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        plt.tight_layout()
        plt.show()
        return cm

    def plot_roc_curves(self):
        """Plot ROC curves for each class (one-vs-rest) plus micro/macro averages.

        Raises:
            ValueError: If ``y_proba`` does not have one column per class
                found in ``y_true``.
        """
        if self.y_proba is None:
            print("Probabilities needed for ROC curves")
            return
        if self.y_proba.ndim != 2 or self.y_proba.shape[1] != self.n_classes:
            raise ValueError(
                f"y_proba must have shape (n_samples, {self.n_classes}); "
                f"got {self.y_proba.shape}")

        # Binarize labels for one-vs-rest
        y_true_bin = label_binarize(self.y_true, classes=self.labels)
        if y_true_bin.shape[1] == 1:
            # label_binarize returns a single column for two classes;
            # expand it to one indicator column per class.
            y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

        # Each class's curve is computed once and reused for the macro average.
        curves = [
            roc_curve(y_true_bin[:, i], self.y_proba[:, i])[:2]
            for i in range(self.n_classes)
        ]

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Individual ROC curves
        for name, (fpr, tpr) in zip(self.class_names, curves):
            auc_score = auc(fpr, tpr)
            axes[0].plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.3f})')
        axes[0].plot([0, 1], [0, 1], 'k--', linewidth=1)
        axes[0].set_xlabel('False Positive Rate')
        axes[0].set_ylabel('True Positive Rate')
        axes[0].set_title('ROC Curves (One-vs-Rest)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Micro-average: pool every (sample, class) decision into one curve
        fpr_micro, tpr_micro, _ = roc_curve(y_true_bin.ravel(), self.y_proba.ravel())
        auc_micro = auc(fpr_micro, tpr_micro)
        axes[1].plot(fpr_micro, tpr_micro,
                     label=f'Micro-average (AUC = {auc_micro:.3f})', linewidth=2)

        # Macro-average: interpolate each class's TPR onto a shared FPR grid
        # and take the unweighted mean
        all_fpr = np.unique(np.concatenate([fpr for fpr, _ in curves]))
        mean_tpr = np.zeros_like(all_fpr)
        for fpr, tpr in curves:
            mean_tpr += np.interp(all_fpr, fpr, tpr)
        mean_tpr /= self.n_classes
        auc_macro = auc(all_fpr, mean_tpr)
        axes[1].plot(all_fpr, mean_tpr,
                     label=f'Macro-average (AUC = {auc_macro:.3f})', linewidth=2)

        axes[1].plot([0, 1], [0, 1], 'k--', linewidth=1)
        axes[1].set_xlabel('False Positive Rate')
        axes[1].set_ylabel('True Positive Rate')
        axes[1].set_title('Averaged ROC Curves')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
# Evaluate the multi-class model
mc_metrics = MulticlassMetrics(y_test_m, y_pred_m, y_proba_m, class_names)
overall_metrics, per_class_df = mc_metrics.calculate_metrics()

# Header followed by the overall scores, one per line
print("\nMulti-class Classification Metrics", "=" * 60, "\nOverall Metrics:", sep="\n")
for metric_name, score in overall_metrics.items():
    print(f"{metric_name:25s}: {score:.4f}")

# Per-class table, then scikit-learn's own text report
print("\nPer-Class Metrics:")
print(per_class_df.to_string(index=False))
print("\nDetailed Classification Report:")
print(classification_report(y_test_m, y_pred_m, target_names=class_names))

# Visualizations: raw counts first, then row-normalized, then ROC curves
for normalized in (False, True):
    mc_metrics.plot_confusion_matrix(normalize=normalized)
mc_metrics.plot_roc_curves()
Regression Metrics
Evaluating Continuous Predictions
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, median_absolute_error,
explained_variance_score, max_error
)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# make_regression is not covered by the imports shown above in this file
# (only make_classification and make_blobs are), so import it here.
from sklearn.datasets import make_regression

# Generate regression data
X_reg, y_reg = make_regression(n_samples=500, n_features=10,
                               n_informative=8, noise=20, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

# Train regression models
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
models = {'Linear Regression': lr_reg, 'Ridge': ridge_reg, 'Random Forest': rf_reg}

# Only the estimators are needed for fitting, so iterate the values.
for model in models.values():
    model.fit(X_train_r, y_train_r)
class RegressionMetrics:
    """Comprehensive regression metrics and diagnostic plots.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like, same length).
        model_name: Name used in plot titles.
        n_features: Number of predictors the model was fitted on; used for
            adjusted R² in calculate_metrics(). Defaults to 10.

    Attributes set on construction: ``y_true`` and ``y_pred`` as NumPy
    arrays, and ``residuals`` = ``y_true - y_pred``.
    """

    def __init__(self, y_true, y_pred, model_name="Model", n_features=10):
        # Arrays are required for the subtraction and for .min()/.max() below.
        self.y_true = np.asarray(y_true)
        self.y_pred = np.asarray(y_pred)
        self.model_name = model_name
        self.n_features = n_features
        self.residuals = self.y_true - self.y_pred

    def calculate_metrics(self):
        """Calculate all regression metrics.

        Returns:
            dict: Metric name -> value. ``MAPE`` is expressed in percent and
            the ``Within ±tol`` entries are fractions of predictions whose
            absolute residual is at most ``tol``.
        """
        mse = mean_squared_error(self.y_true, self.y_pred)
        metrics = {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(self.y_true, self.y_pred),
            'MedAE': median_absolute_error(self.y_true, self.y_pred),
            'MAPE': mean_absolute_percentage_error(self.y_true, self.y_pred) * 100,
            'R²': r2_score(self.y_true, self.y_pred),
            'Adjusted R²': self.adjusted_r2(self.y_true, self.y_pred, self.n_features),
            'Explained Var': explained_variance_score(self.y_true, self.y_pred),
            'Max Error': max_error(self.y_true, self.y_pred)
        }

        # Additional custom metrics
        metrics['Mean Residual'] = np.mean(self.residuals)
        metrics['Std Residual'] = np.std(self.residuals)
        metrics['Min Residual'] = np.min(self.residuals)
        metrics['Max Residual'] = np.max(self.residuals)

        # Percentage of predictions within tolerance
        abs_residuals = np.abs(self.residuals)
        for tol in (5, 10, 20):
            metrics[f'Within ±{tol}'] = np.sum(abs_residuals <= tol) / len(self.residuals)
        return metrics

    def adjusted_r2(self, y_true, y_pred, n_features=10):
        """Calculate adjusted R².

        Uses ``1 - (1 - R²) * (n - 1) / (n - n_features - 1)``.

        Returns:
            float: Adjusted R², or NaN when ``n - n_features - 1 <= 0``
            (too few samples for the statistic to be defined).
        """
        n = len(y_true)
        dof = n - n_features - 1
        if dof <= 0:
            return float('nan')
        r2 = r2_score(y_true, y_pred)
        return 1 - (1 - r2) * (n - 1) / dof

    def plot_predictions(self, ax=None):
        """Plot predictions vs actual with the perfect-prediction diagonal.

        Args:
            ax: Matplotlib axes to draw on; a new figure is created if None.
        """
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(self.y_true, self.y_pred, alpha=0.5, s=20)

        # Perfect prediction line spanning the joint range of both series
        min_val = min(self.y_true.min(), self.y_pred.min())
        max_val = max(self.y_true.max(), self.y_pred.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction')

        # Add R² annotation
        r2 = r2_score(self.y_true, self.y_pred)
        ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes,
                fontsize=12, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        ax.set_xlabel('Actual Values')
        ax.set_ylabel('Predicted Values')
        ax.set_title(f'Predictions vs Actual - {self.model_name}')
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_residuals(self):
        """Comprehensive residual analysis in a 2x2 figure.

        Panels: residuals vs predicted, Q-Q plot, residual histogram with
        a normal overlay, and a scale-location plot.
        """
        from scipy import stats

        n_bins = 30
        residuals = self.residuals
        mu = residuals.mean()
        std_residuals = np.std(residuals)
        sorted_pred = np.sort(self.y_pred)

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # 1. Residuals vs Predicted
        axes[0, 0].scatter(self.y_pred, residuals, alpha=0.5, s=20)
        axes[0, 0].axhline(y=0, color='r', linestyle='--')
        axes[0, 0].set_xlabel('Predicted Values')
        axes[0, 0].set_ylabel('Residuals')
        axes[0, 0].set_title('Residuals vs Predicted')
        axes[0, 0].grid(True, alpha=0.3)
        # Add a ±2σ band
        axes[0, 0].fill_between(sorted_pred, -2*std_residuals, 2*std_residuals,
                                alpha=0.2, color='gray', label='±2σ')
        axes[0, 0].legend()

        # 2. Q-Q Plot
        stats.probplot(residuals, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot of Residuals')

        # 3. Histogram of Residuals
        axes[1, 0].hist(residuals, bins=n_bins, edgecolor='black', alpha=0.7)
        axes[1, 0].axvline(x=0, color='r', linestyle='--')
        axes[1, 0].set_xlabel('Residuals')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Distribution of Residuals')
        # Normal overlay, scaled from density to counts (n * bin width)
        x = np.linspace(residuals.min(), residuals.max(), 100)
        bin_width = (residuals.max() - residuals.min()) / n_bins
        axes[1, 0].plot(x, stats.norm.pdf(x, mu, std_residuals) * len(residuals) * bin_width,
                        'r-', lw=2, label='Normal')
        axes[1, 0].legend()

        # 4. Scale-Location Plot
        # Standardize by the residual standard deviation. Dividing by
        # sqrt(|r|) is not standardization and gives NaN for zero residuals.
        if std_residuals > 0:
            standardized_residuals = residuals / std_residuals
        else:
            standardized_residuals = np.zeros_like(residuals, dtype=float)
        scale_location = np.sqrt(np.abs(standardized_residuals))
        axes[1, 1].scatter(self.y_pred, scale_location, alpha=0.5, s=20)
        axes[1, 1].set_xlabel('Predicted Values')
        axes[1, 1].set_ylabel('√|Standardized Residuals|')
        axes[1, 1].set_title('Scale-Location Plot')
        axes[1, 1].grid(True, alpha=0.3)
        # Add a linear trend line
        trend = np.poly1d(np.polyfit(self.y_pred, scale_location, 1))
        axes[1, 1].plot(sorted_pred, trend(sorted_pred), "r--", alpha=0.5)

        plt.suptitle(f'Residual Analysis - {self.model_name}', fontsize=14)
        plt.tight_layout()
        plt.show()
# Compare regression models: one prediction scatter per model, metrics collected
results = []
n_models = len(models)
# Panel count and width follow the number of models (3 models -> 15x5, as before).
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 5))
axes = np.atleast_1d(axes)  # a single model yields a bare Axes, not an array
for idx, (name, model) in enumerate(models.items()):
    y_pred = model.predict(X_test_r)
    reg_metrics = RegressionMetrics(y_test_r, y_pred, name)
    metrics = reg_metrics.calculate_metrics()
    metrics['Model'] = name
    results.append(metrics)
    reg_metrics.plot_predictions(axes[idx])
plt.suptitle('Regression Model Comparison', fontsize=14)
plt.tight_layout()
plt.show()

# Display metrics comparison
results_df = pd.DataFrame(results)
cols = ['Model', 'RMSE', 'MAE', 'R²', 'MAPE']
print("\nRegression Metrics Comparison:")
print(results_df[cols].to_string(index=False))

# Detailed residual analysis for best model.
# The best model is chosen by lowest test RMSE from the table above,
# rather than being hard-coded.
best_name = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
best_model = models[best_name]
y_pred_best = best_model.predict(X_test_r)
best_metrics = RegressionMetrics(y_test_r, y_pred_best, best_name)
best_metrics.plot_residuals()
Clustering Metrics
Evaluating Unsupervised Learning
from sklearn.metrics import (
silhouette_score, calinski_harabasz_score, davies_bouldin_score,
silhouette_samples, homogeneity_score, completeness_score, v_measure_score,
adjusted_rand_score, adjusted_mutual_info_score
)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs
# Synthetic 2-D dataset of four blobs; the generating labels are kept
# as ground truth for the external clustering metrics
X_cluster, y_true_cluster = make_blobs(
    n_samples=500,
    n_features=2,
    centers=4,
    center_box=(-10, 10),
    random_state=42,
)
class ClusteringMetrics:
    """Evaluate clustering performance.

    The label ``-1`` is treated as noise: it is excluded from the cluster
    count and from the internal metrics.

    Args:
        X: Feature matrix, one row per sample.
        labels_pred: Predicted cluster label per sample.
        labels_true: Optional ground-truth labels for external metrics.
    """

    def __init__(self, X, labels_pred, labels_true=None):
        # Arrays are required for the boolean masking used throughout.
        self.X = np.asarray(X)
        self.labels_pred = np.asarray(labels_pred)
        self.labels_true = labels_true
        # Sorted distinct cluster labels, noise excluded.
        self.cluster_labels = sorted(set(self.labels_pred.tolist()) - {-1})
        self.n_clusters = len(self.cluster_labels)

    def calculate_internal_metrics(self):
        """Calculate internal clustering metrics (no ground truth needed).

        Also stores per-sample silhouette values (noise excluded) on
        ``self.silhouette_samples`` for plot_silhouette_analysis().

        Returns:
            dict: Metric name -> value, or an empty dict when fewer than
            two clusters were found.
        """
        if self.n_clusters < 2:
            print("Need at least 2 clusters for evaluation")
            return {}

        metrics = {}
        # Filter out noise points for metrics that don't handle them
        mask = self.labels_pred != -1
        X_filtered = self.X[mask]
        labels_filtered = self.labels_pred[mask]

        metrics['Silhouette Score'] = silhouette_score(X_filtered, labels_filtered)
        metrics['Calinski-Harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
        metrics['Davies-Bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
        # Silhouette samples for detailed analysis
        self.silhouette_samples = silhouette_samples(X_filtered, labels_filtered)

        metrics['N Clusters'] = self.n_clusters
        metrics['N Noise Points'] = int(np.sum(~mask))
        return metrics

    def calculate_external_metrics(self):
        """Calculate external metrics (ground truth needed).

        Returns:
            dict: Metric name -> value, or an empty dict when no
            ground-truth labels were supplied.
        """
        if self.labels_true is None:
            print("Ground truth labels needed for external metrics")
            return {}
        metrics = {
            'Homogeneity': homogeneity_score(self.labels_true, self.labels_pred),
            'Completeness': completeness_score(self.labels_true, self.labels_pred),
            'V-Measure': v_measure_score(self.labels_true, self.labels_pred),
            'Adjusted Rand Index': adjusted_rand_score(self.labels_true, self.labels_pred),
            'Adjusted MI': adjusted_mutual_info_score(self.labels_true, self.labels_pred)
        }
        return metrics

    def plot_clusters(self, ax=None):
        """Visualize clusters (for 2D data).

        Args:
            ax: Matplotlib axes to draw on; a new figure is created if None.
        """
        if self.X.shape[1] != 2:
            print("Plotting only works for 2D data")
            return
        if ax is None:
            fig, ax = plt.subplots(figsize=(8, 6))

        # Sorted labels give a stable colour and legend order.
        unique_labels = np.unique(self.labels_pred)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        for k, col in zip(unique_labels, colors):
            is_noise = k == -1
            if is_noise:
                # Black crosses for noise
                col = 'black'
            class_mask = self.labels_pred == k
            ax.scatter(self.X[class_mask, 0], self.X[class_mask, 1],
                       c=[col], marker='x' if is_noise else 'o', s=50, alpha=0.6,
                       label='Noise' if is_noise else f'Cluster {k}')

        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.set_title('Clustering Results')
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_silhouette_analysis(self):
        """Silhouette analysis visualization.

        Left panel: sorted per-sample silhouette values stacked by
        cluster. Right panel: the cluster scatter plot. Requires
        calculate_internal_metrics() to have been run.
        """
        if not hasattr(self, 'silhouette_samples'):
            print("Run calculate_internal_metrics first")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Silhouette plot
        y_lower = 10
        mask = self.labels_pred != -1
        labels_filtered = self.labels_pred[mask]
        # Iterate the labels actually present; they need not be 0..k-1.
        for position, label in enumerate(self.cluster_labels):
            cluster_silhouette_values = np.sort(
                self.silhouette_samples[labels_filtered == label])
            size_cluster = cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster
            color = plt.cm.Spectral(float(position) / self.n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster, str(label))
            y_lower = y_upper + 10  # gap between cluster bands

        ax1.set_xlabel("Silhouette Coefficient")
        ax1.set_ylabel("Cluster Label")
        ax1.set_title("Silhouette Plot")

        # Add average silhouette score line
        avg_score = np.mean(self.silhouette_samples)
        ax1.axvline(x=avg_score, color="red", linestyle="--",
                    label=f'Average ({avg_score:.3f})')
        ax1.legend()

        # Cluster visualization
        self.plot_clusters(ax2)
        plt.tight_layout()
        plt.show()
# Candidate algorithms, keyed by the label shown in plots and tables
clustering_algorithms = {
    'K-Means (k=4)': KMeans(n_clusters=4, random_state=42),
    'K-Means (k=3)': KMeans(n_clusters=3, random_state=42),
    'K-Means (k=5)': KMeans(n_clusters=5, random_state=42),
    'DBSCAN': DBSCAN(eps=1.5, min_samples=5),
    'Agglomerative': AgglomerativeClustering(n_clusters=4)
}

results = []
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Five algorithms fill the first five of the six panels
for panel, (name, algorithm) in zip(axes, clustering_algorithms.items()):
    # Cluster the data
    labels_pred = algorithm.fit_predict(X_cluster)

    # Score the assignment with and without the ground truth
    evaluator = ClusteringMetrics(X_cluster, labels_pred, y_true_cluster)
    internal_metrics = evaluator.calculate_internal_metrics()
    external_metrics = evaluator.calculate_external_metrics()

    # One table row per algorithm
    results.append({'Algorithm': name, **internal_metrics, **external_metrics})

    # Scatter plot titled with the algorithm name
    evaluator.plot_clusters(panel)
    panel.set_title(name)

# Drop the unused sixth panel
fig.delaxes(axes[5])
plt.suptitle('Clustering Algorithm Comparison', fontsize=14)
plt.tight_layout()
plt.show()

# Tabulate the scores
results_df = pd.DataFrame(results)
print("\nClustering Metrics Comparison:")
print("=" * 80)

print("\nInternal Metrics (no ground truth needed):")
internal_cols = ['Algorithm', 'N Clusters', 'Silhouette Score',
                 'Calinski-Harabasz', 'Davies-Bouldin']
print(results_df[internal_cols].to_string(index=False))

print("\nExternal Metrics (with ground truth):")
external_cols = ['Algorithm', 'V-Measure', 'Adjusted Rand Index', 'Adjusted MI']
print(results_df[external_cols].to_string(index=False))

# Reading guide for the columns above
print("\nMetric Interpretation:")
for interpretation in (
    "- Silhouette Score: Higher is better (-1 to 1)",
    "- Calinski-Harabasz: Higher is better",
    "- Davies-Bouldin: Lower is better",
    "- V-Measure: Higher is better (0 to 1)",
    "- Adjusted Rand Index: Higher is better (-1 to 1)",
):
    print(interpretation)

# Detailed silhouette analysis for a freshly fitted K-Means (k=4)
kmeans_4 = KMeans(n_clusters=4, random_state=42)
labels_kmeans = kmeans_4.fit_predict(X_cluster)
cm_kmeans = ClusteringMetrics(X_cluster, labels_kmeans)
cm_kmeans.calculate_internal_metrics()
cm_kmeans.plot_silhouette_analysis()
Choosing the Right Metric
# Metric selection guide
class MetricSelectionGuide:
    """Lookup tables and a printed decision tree for picking evaluation metrics."""

    @staticmethod
    def _build_guide(rows):
        """Turn (scenario, primary, secondary, reason) rows into the guide dict."""
        return {
            scenario: {'primary': primary, 'secondary': secondary, 'reason': reason}
            for scenario, primary, secondary, reason in rows
        }

    @staticmethod
    def classification_guide():
        """Return the classification guide.

        Maps each scenario name to a dict with a 'primary' metric, a list
        of 'secondary' metrics and a short 'reason'.
        """
        return MetricSelectionGuide._build_guide((
            ('Balanced Classes', 'Accuracy',
             ['Precision', 'Recall', 'F1-Score'],
             'All classes equally important'),
            ('Imbalanced Classes', 'F1-Score (weighted)',
             ['Precision', 'Recall', 'ROC-AUC', 'PR-AUC'],
             'Accuracy can be misleading with imbalance'),
            ('Cost-Sensitive (FP costly)', 'Precision',
             ['Specificity', 'F0.5-Score'],
             'Minimize false positives'),
            ('Cost-Sensitive (FN costly)', 'Recall',
             ['F2-Score', 'TPR'],
             'Minimize false negatives'),
            ('Ranking/Probability', 'ROC-AUC',
             ['PR-AUC', 'Log Loss', 'Brier Score'],
             'Focus on probability calibration'),
            ('Multi-class', 'Macro F1-Score',
             ['Cohen Kappa', 'MCC', 'Weighted F1'],
             'Consider all classes fairly'),
        ))

    @staticmethod
    def regression_guide():
        """Return the regression guide, in the same shape as classification_guide()."""
        return MetricSelectionGuide._build_guide((
            ('General Purpose', 'RMSE',
             ['MAE', 'R²'],
             'Standard metrics, penalizes large errors'),
            ('Outlier Sensitive', 'MAE',
             ['Median AE', 'Quantile Loss'],
             'Less sensitive to outliers than RMSE'),
            ('Relative Error Important', 'MAPE',
             ['SMAPE', 'RMSPE'],
             'Error as percentage of actual value'),
            ('Variance Explanation', 'R²',
             ['Adjusted R²', 'Explained Variance'],
             'Proportion of variance explained'),
            ('Business Metrics', 'Custom Loss',
             ['Quantile Loss', 'Huber Loss'],
             'Align with business objectives'),
        ))

    @staticmethod
    def print_decision_tree():
        """Print the metric selection decision tree to stdout."""
        rule = "=" * 80
        tree_lines = (
            "\n" + rule,
            "METRIC SELECTION DECISION TREE",
            rule,
            "\n1. PROBLEM TYPE?",
            " ├── Classification → Go to 2",
            " ├── Regression → Go to 3",
            " └── Clustering → Go to 4",
            "\n2. CLASSIFICATION:",
            " ├── Binary?",
            " │ ├── Balanced? → Accuracy, F1-Score",
            " │ ├── Imbalanced? → F1-Score, PR-AUC",
            " │ └── Need probabilities? → ROC-AUC, Brier Score",
            " └── Multi-class?",
            " ├── Balanced? → Accuracy, Macro F1",
            " └── Imbalanced? → Weighted F1, Cohen Kappa",
            "\n3. REGRESSION:",
            " ├── Outliers present? → MAE, Median AE",
            " ├── Need interpretability? → R², RMSE",
            " └── Business constraints? → Custom metric",
            "\n4. CLUSTERING:",
            " ├── Have ground truth? → ARI, V-Measure",
            " └── No ground truth? → Silhouette, Calinski-Harabasz",
            "\n" + rule,
        )
        # A single print of the joined lines emits the same bytes as one
        # print call per line.
        print("\n".join(tree_lines))
# Create comprehensive metric comparison
def compare_metrics_impact():
    """Show how different metrics lead to different model selection.

    Builds a 95/5 weighted binary dataset, fits two dummy baselines and
    three real classifiers, prints accuracy / precision / recall / F1 /
    ROC-AUC for each on the held-out 30%, and draws a grouped bar chart
    of the first four metrics.
    """
    from sklearn.dummy import DummyClassifier
    from sklearn.tree import DecisionTreeClassifier

    # Generate imbalanced data
    X_comp, y_comp = make_classification(n_samples=1000, n_features=20,
                                         n_classes=2, weights=[0.95, 0.05],
                                         random_state=42)
    X_train_comp, X_test_comp, y_train_comp, y_test_comp = train_test_split(
        X_comp, y_comp, test_size=0.3, random_state=42, stratify=y_comp
    )

    # Train different models. The stratified dummy draws random labels, so
    # it is seeded like the other models to keep the output reproducible.
    models = {
        'Always Predict Majority': DummyClassifier(strategy='most_frequent'),
        'Random (Stratified)': DummyClassifier(strategy='stratified', random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
    }

    results = []
    for name, model in models.items():
        model.fit(X_train_comp, y_train_comp)
        y_pred = model.predict(X_test_comp)

        # zero_division=0: a model that never predicts the positive class
        # (the majority baseline) scores 0 rather than being undefined.
        result = {
            'Model': name,
            'Accuracy': accuracy_score(y_test_comp, y_pred),
            'Precision': precision_score(y_test_comp, y_pred, zero_division=0),
            'Recall': recall_score(y_test_comp, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test_comp, y_pred, zero_division=0),
        }

        # Add ROC-AUC if model supports probabilities
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test_comp)[:, 1]
            result['ROC-AUC'] = roc_auc_score(y_test_comp, y_proba)
        else:
            result['ROC-AUC'] = np.nan
        results.append(result)

    results_df = pd.DataFrame(results)
    print("\nModel Comparison with Different Metrics (Imbalanced Data)")
    print("="*80)
    print(f"Class Distribution - Class 0: {(y_test_comp==0).mean():.1%}, "
          f"Class 1: {(y_test_comp==1).mean():.1%}")
    # Concatenate instead of passing two print() arguments: the default
    # separator would put a space before the header row and misalign it.
    print("\n" + results_df.to_string(index=False))
    print("\n⚠️ Notice how 'Always Predict Majority' has high accuracy but zero recall!")
    print("This demonstrates why accuracy alone is misleading for imbalanced data.")

    # Visualize metric differences as grouped bars (one group per model)
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(results_df))
    width = 0.15
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    colors = ['blue', 'green', 'red', 'purple']
    for i, (metric, color) in enumerate(zip(metrics, colors)):
        ax.bar(x + i * width, results_df[metric], width, label=metric, color=color, alpha=0.7)

    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.set_title('Impact of Metric Choice on Model Ranking')
    # Centre each tick under its group of four bars
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
# Print both lookup guides in the same layout, then the decision tree
guide = MetricSelectionGuide()
guide_sections = (
    ("\nCLASSIFICATION METRICS GUIDE", guide.classification_guide()),
    ("\n\nREGRESSION METRICS GUIDE", guide.regression_guide()),
)
for heading, scenarios in guide_sections:
    print(heading)
    print("=" * 60)
    for scenario, details in scenarios.items():
        secondary_metrics = ', '.join(details['secondary'])
        print(f"\n{scenario}:")
        print(f" Primary Metric: {details['primary']}")
        print(f" Secondary: {secondary_metrics}")
        print(f" Reason: {details['reason']}")

guide.print_decision_tree()

# Demonstrate on imbalanced data how the chosen metric changes the ranking
compare_metrics_impact()
Practice Exercises
Exercise 1: Custom Business Metric
Create a custom evaluation metric that:
- Weights false positives and false negatives differently based on business cost
- Incorporates confidence thresholds
- Handles multi-class scenarios
- Provides interpretable output
- Can be used in cross-validation
Exercise 2: Metric Monitoring System
Build a monitoring system that:
- Tracks multiple metrics over time
- Detects metric degradation
- Compares training vs validation metrics
- Identifies overfitting patterns
- Generates automated reports
Exercise 3: Metric Selection Tool
Develop an automated tool that:
- Analyzes the dataset characteristics
- Recommends appropriate metrics
- Calculates all relevant metrics
- Provides visualization dashboard
- Exports comparison reports
Key Takeaways
- 📊 Different metrics tell different stories about model performance
- ⚖️ Choose metrics that align with business objectives
- 🎯 Accuracy is often misleading for imbalanced datasets
- 📈 ROC-AUC evaluates ranking ability across all thresholds, not performance at one chosen classification threshold
- 🔍 Precision focuses on false positives, Recall on false negatives
- 📉 RMSE penalizes large errors more than MAE
- 🔄 Always use multiple metrics for comprehensive evaluation
- ⚠️ Consider the cost of different types of errors
- 📋 Document metric choices and reasoning