Linear & Logistic Regression
The Foundation of Predictive Modeling! 📈
Linear and Logistic Regression are the workhorses of machine learning - simple yet powerful, interpretable yet effective. From predicting house prices to classifying customers, these algorithms form the backbone of countless data science applications. Master these fundamentals, including regularization and optimization, to build robust predictive models.
Regression Overview
graph TD
A[Regression Models] --> B[Linear Regression]
A --> C[Logistic Regression]
B --> D[Simple Linear]
B --> E[Multiple Linear]
B --> F[Polynomial]
B --> G[Regularized]
G --> H[Ridge/L2]
G --> I[Lasso/L1]
G --> J[Elastic Net]
C --> K[Binary Classification]
C --> L[Multi-class]
L --> M[One-vs-Rest]
L --> N[Multinomial]
style A fill:#f9f,stroke:#333,stroke-width:2px
style B fill:#bbf,stroke:#333,stroke-width:2px
style C fill:#fbf,stroke:#333,stroke-width:2px
Linear Regression Fundamentals
Understanding Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.datasets import make_regression, load_boston, load_iris
import warnings
warnings.filterwarnings('ignore')
# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Generate synthetic regression data
np.random.seed(42)
X_simple = 2 * np.random.rand(100, 1)
y_simple = 4 + 3 * X_simple[:, 0] + np.random.randn(100)
class LinearRegressionAnalysis:
"""Comprehensive Linear Regression Analysis"""
def __init__(self, fit_intercept=True):
self.fit_intercept = fit_intercept
self.model = None
self.coefficients = None
self.intercept = None
self.residuals = None
def manual_implementation(self, X, y):
"""
Manual implementation using Normal Equation
θ = (X^T X)^(-1) X^T y
"""
# Add intercept term if needed
if self.fit_intercept:
X_b = np.c_[np.ones((X.shape[0], 1)), X]
else:
X_b = X
# Normal equation
theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
if self.fit_intercept:
self.intercept = theta[0]
self.coefficients = theta[1:]
else:
self.intercept = 0
self.coefficients = theta
return self
def gradient_descent(self, X, y, learning_rate=0.01, n_iterations=1000):
"""
Implement gradient descent optimization
"""
n_samples, n_features = X.shape
# Initialize parameters
if self.fit_intercept:
theta = np.zeros(n_features + 1)
X_b = np.c_[np.ones((n_samples, 1)), X]
else:
theta = np.zeros(n_features)
X_b = X
# Store cost history
cost_history = []
for iteration in range(n_iterations):
# Calculate predictions
y_pred = X_b @ theta
# Calculate cost (MSE)
cost = (1/(2*n_samples)) * np.sum((y_pred - y)**2)
cost_history.append(cost)
# Calculate gradients
gradients = (1/n_samples) * X_b.T @ (y_pred - y)
# Update parameters
theta -= learning_rate * gradients
if self.fit_intercept:
self.intercept = theta[0]
self.coefficients = theta[1:]
else:
self.intercept = 0
self.coefficients = theta
return self, cost_history
def fit_sklearn(self, X, y):
"""Fit using scikit-learn"""
self.model = LinearRegression(fit_intercept=self.fit_intercept)
self.model.fit(X, y)
self.coefficients = self.model.coef_
self.intercept = self.model.intercept_
return self
def predict(self, X):
"""Make predictions"""
if self.model:
return self.model.predict(X)
else:
if self.fit_intercept:
return X @ self.coefficients + self.intercept
else:
return X @ self.coefficients
def calculate_statistics(self, X, y, y_pred):
"""Calculate regression statistics"""
n = len(y)
k = X.shape[1] # number of predictors
# Residuals
residuals = y - y_pred
# R-squared
ss_total = np.sum((y - np.mean(y))**2)
ss_residual = np.sum(residuals**2)
r2 = 1 - (ss_residual / ss_total)
# Adjusted R-squared
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
# Standard error of residuals
se_residuals = np.sqrt(ss_residual / (n - k - 1))
# Standard errors of coefficients
X_with_intercept = np.c_[np.ones((X.shape[0], 1)), X]
var_coef = se_residuals**2 * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
se_coef = np.sqrt(np.diag(var_coef))
# t-statistics
params = np.concatenate([[self.intercept], self.coefficients])
t_stats = params / se_coef
# p-values (two-tailed test)
from scipy import stats
p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k - 1))
return {
'r2': r2,
'adj_r2': adj_r2,
'se_residuals': se_residuals,
'se_coefficients': se_coef,
't_statistics': t_stats,
'p_values': p_values,
'residuals': residuals
}
# Simple Linear Regression Example
print("="*60)
print("SIMPLE LINEAR REGRESSION")
print("="*60)
# Fit models
lr_analysis = LinearRegressionAnalysis()
# Manual implementation
lr_analysis.manual_implementation(X_simple.reshape(-1, 1), y_simple)
print(f"\nManual Implementation (Normal Equation):")
print(f"Intercept: {lr_analysis.intercept:.4f}")
print(f"Coefficient: {lr_analysis.coefficients[0]:.4f}")
# Gradient descent
lr_gd, cost_history = lr_analysis.gradient_descent(X_simple.reshape(-1, 1), y_simple,
learning_rate=0.1, n_iterations=100)
print(f"\nGradient Descent Implementation:")
print(f"Intercept: {lr_gd.intercept:.4f}")
print(f"Coefficient: {lr_gd.coefficients[0]:.4f}")
# Scikit-learn implementation
lr_analysis.fit_sklearn(X_simple.reshape(-1, 1), y_simple)
print(f"\nScikit-learn Implementation:")
print(f"Intercept: {lr_analysis.intercept:.4f}")
print(f"Coefficient: {lr_analysis.coefficients[0]:.4f}")
# Calculate statistics
y_pred_simple = lr_analysis.predict(X_simple.reshape(-1, 1))
stats_dict = lr_analysis.calculate_statistics(X_simple.reshape(-1, 1), y_simple, y_pred_simple)
print(f"\nRegression Statistics:")
print(f"R²: {stats_dict['r2']:.4f}")
print(f"Adjusted R²: {stats_dict['adj_r2']:.4f}")
print(f"Standard Error: {stats_dict['se_residuals']:.4f}")
print(f"Coefficient p-value: {stats_dict['p_values'][1]:.6f}")
# Visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# 1. Scatter plot with regression line
axes[0].scatter(X_simple, y_simple, alpha=0.5, label='Data points')
axes[0].plot(X_simple, y_pred_simple, 'r-', lw=2, label='Regression line')
axes[0].fill_between(X_simple.flatten(),
y_pred_simple - 1.96 * stats_dict['se_residuals'],
y_pred_simple + 1.96 * stats_dict['se_residuals'],
alpha=0.2, color='red', label='95% CI')
axes[0].set_xlabel('X')
axes[0].set_ylabel('y')
axes[0].set_title('Simple Linear Regression')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Add equation to plot
equation = f'y = {lr_analysis.intercept:.2f} + {lr_analysis.coefficients[0]:.2f}x'
axes[0].text(0.05, 0.95, equation, transform=axes[0].transAxes,
fontsize=11, verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# 2. Gradient descent convergence
axes[1].plot(cost_history, 'b-', lw=2)
axes[1].set_xlabel('Iteration')
axes[1].set_ylabel('Cost (MSE)')
axes[1].set_title('Gradient Descent Convergence')
axes[1].grid(True, alpha=0.3)
# 3. Residual plot
axes[2].scatter(y_pred_simple, stats_dict['residuals'], alpha=0.5)
axes[2].axhline(y=0, color='r', linestyle='--')
axes[2].set_xlabel('Fitted Values')
axes[2].set_ylabel('Residuals')
axes[2].set_title('Residual Plot')
axes[2].grid(True, alpha=0.3)
plt.suptitle('Simple Linear Regression Analysis', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
Multiple Linear Regression
Working with Multiple Features
# Generate multiple regression data
X_multi, y_multi = make_regression(n_samples=200, n_features=5,
n_informative=3, noise=10, random_state=42)
feature_names = [f'Feature_{i+1}' for i in range(X_multi.shape[1])]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_multi, y_multi,
test_size=0.3, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
class MultipleLinearRegression:
"""Multiple Linear Regression with diagnostics"""
def __init__(self):
self.model = LinearRegression()
self.scaler = StandardScaler()
def fit(self, X, y):
"""Fit the model"""
self.X_scaled = self.scaler.fit_transform(X)
self.model.fit(self.X_scaled, y)
self.coefficients = self.model.coef_
self.intercept = self.model.intercept_
return self
def predict(self, X):
"""Make predictions"""
X_scaled = self.scaler.transform(X)
return self.model.predict(X_scaled)
def feature_importance_analysis(self, X, y, feature_names):
"""Analyze feature importance"""
# Standardized coefficients
standardized_coef = self.coefficients
# Calculate correlation with target
correlations = []
for i in range(X.shape[1]):
corr = np.corrcoef(X[:, i], y)[0, 1]
correlations.append(corr)
# Create DataFrame
importance_df = pd.DataFrame({
'Feature': feature_names,
'Coefficient': self.model.coef_ / self.scaler.scale_, # Original scale
'Standardized_Coef': standardized_coef,
'Abs_Standardized_Coef': np.abs(standardized_coef),
'Correlation_with_y': correlations
}).sort_values('Abs_Standardized_Coef', ascending=False)
return importance_df
def multicollinearity_check(self, X, feature_names):
"""Check for multicollinearity using VIF"""
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["Feature"] = feature_names
vif_data["VIF"] = [variance_inflation_factor(X, i)
for i in range(X.shape[1])]
# Interpretation
vif_data['Multicollinearity'] = vif_data['VIF'].apply(
lambda x: 'High' if x > 10 else ('Moderate' if x > 5 else 'Low')
)
return vif_data.sort_values('VIF', ascending=False)
def diagnostic_plots(self, X, y):
"""Create diagnostic plots"""
y_pred = self.predict(X)
residuals = y - y_pred
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# 1. Actual vs Predicted
axes[0, 0].scatter(y, y_pred, alpha=0.5)
axes[0, 0].plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
axes[0, 0].set_xlabel('Actual')
axes[0, 0].set_ylabel('Predicted')
axes[0, 0].set_title('Actual vs Predicted')
r2 = r2_score(y, y_pred)
axes[0, 0].text(0.05, 0.95, f'R² = {r2:.3f}', transform=axes[0, 0].transAxes,
fontsize=12, verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# 2. Residuals vs Fitted
axes[0, 1].scatter(y_pred, residuals, alpha=0.5)
axes[0, 1].axhline(y=0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Fitted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residuals vs Fitted')
# 3. Q-Q Plot
from scipy import stats
stats.probplot(residuals, dist="norm", plot=axes[0, 2])
axes[0, 2].set_title('Q-Q Plot')
# 4. Histogram of Residuals
axes[1, 0].hist(residuals, bins=30, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Distribution of Residuals')
# 5. Scale-Location Plot
standardized_residuals = residuals / np.std(residuals)
axes[1, 1].scatter(y_pred, np.sqrt(np.abs(standardized_residuals)), alpha=0.5)
axes[1, 1].set_xlabel('Fitted Values')
axes[1, 1].set_ylabel('√|Standardized Residuals|')
axes[1, 1].set_title('Scale-Location Plot')
# 6. Cook's Distance
from scipy.stats import f
n = len(y)
k = self.X_scaled.shape[1]
# Simplified Cook's distance calculation
leverage = np.diag(self.X_scaled @ np.linalg.inv(self.X_scaled.T @ self.X_scaled) @ self.X_scaled.T)
cooks_d = (residuals**2 / (k + 1)) * (leverage / (1 - leverage)**2) / np.var(residuals)
axes[1, 2].stem(range(len(cooks_d)), cooks_d, markerfmt=',')
axes[1, 2].set_xlabel('Observation')
axes[1, 2].set_ylabel("Cook's Distance")
axes[1, 2].set_title("Cook's Distance")
threshold = 4 / n
axes[1, 2].axhline(y=threshold, color='r', linestyle='--',
label=f'Threshold ({threshold:.3f})')
axes[1, 2].legend()
plt.suptitle('Regression Diagnostic Plots', fontsize=14)
plt.tight_layout()
plt.show()
# Fit Multiple Linear Regression
mlr = MultipleLinearRegression()
mlr.fit(X_train, y_train)
# Feature importance
importance_df = mlr.feature_importance_analysis(X_train, y_train, feature_names)
print("\n" + "="*60)
print("MULTIPLE LINEAR REGRESSION")
print("="*60)
print("\nFeature Importance Analysis:")
print(importance_df.to_string(index=False))
# Multicollinearity check
vif_df = mlr.multicollinearity_check(X_train, feature_names)
print("\nVariance Inflation Factor (VIF) Analysis:")
print(vif_df.to_string(index=False))
# Model evaluation
y_pred_train = mlr.predict(X_train)
y_pred_test = mlr.predict(X_test)
print(f"\nModel Performance:")
print(f"Train R²: {r2_score(y_train, y_pred_train):.4f}")
print(f"Test R²: {r2_score(y_test, y_pred_test):.4f}")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_pred_train)):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.4f}")
# Diagnostic plots
mlr.diagnostic_plots(X_test, y_test)
Regularized Regression
Ridge, Lasso, and Elastic Net
# Generate data with multicollinearity
np.random.seed(42)
n_samples, n_features = 100, 20
X_reg = np.random.randn(n_samples, n_features)
# Create multicollinearity
X_reg[:, 1] = X_reg[:, 0] + 0.5 * np.random.randn(n_samples)
X_reg[:, 2] = X_reg[:, 0] + 0.5 * np.random.randn(n_samples)
# True coefficients (sparse: only first 5 are non-zero)
true_coef = np.zeros(n_features)
true_coef[:5] = [3, -2, 1.5, 0, -1]
y_reg = X_reg @ true_coef + 0.5 * np.random.randn(n_samples)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
X_reg, y_reg, test_size=0.3, random_state=42
)
class RegularizedRegression:
"""Compare regularized regression methods"""
def __init__(self):
self.models = {}
self.results = []
def fit_all_models(self, X_train, y_train, X_test, y_test, alphas=None):
"""Fit all regularization methods"""
if alphas is None:
alphas = np.logspace(-4, 2, 50)
# Store results for each model
self.ridge_results = {'alphas': [], 'train_scores': [], 'test_scores': [], 'coefs': []}
self.lasso_results = {'alphas': [], 'train_scores': [], 'test_scores': [], 'coefs': []}
self.elastic_results = {'alphas': [], 'train_scores': [], 'test_scores': [], 'coefs': []}
for alpha in alphas:
# Ridge
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
self.ridge_results['alphas'].append(alpha)
self.ridge_results['train_scores'].append(ridge.score(X_train, y_train))
self.ridge_results['test_scores'].append(ridge.score(X_test, y_test))
self.ridge_results['coefs'].append(ridge.coef_)
# Lasso
lasso = Lasso(alpha=alpha, max_iter=2000)
lasso.fit(X_train, y_train)
self.lasso_results['alphas'].append(alpha)
self.lasso_results['train_scores'].append(lasso.score(X_train, y_train))
self.lasso_results['test_scores'].append(lasso.score(X_test, y_test))
self.lasso_results['coefs'].append(lasso.coef_)
# Elastic Net
elastic = ElasticNet(alpha=alpha, l1_ratio=0.5, max_iter=2000)
elastic.fit(X_train, y_train)
self.elastic_results['alphas'].append(alpha)
self.elastic_results['train_scores'].append(elastic.score(X_train, y_train))
self.elastic_results['test_scores'].append(elastic.score(X_test, y_test))
self.elastic_results['coefs'].append(elastic.coef_)
# Find best alpha for each
self.best_ridge_alpha = alphas[np.argmax(self.ridge_results['test_scores'])]
self.best_lasso_alpha = alphas[np.argmax(self.lasso_results['test_scores'])]
self.best_elastic_alpha = alphas[np.argmax(self.elastic_results['test_scores'])]
# Fit best models
self.models['OLS'] = LinearRegression()
self.models['Ridge'] = Ridge(alpha=self.best_ridge_alpha)
self.models['Lasso'] = Lasso(alpha=self.best_lasso_alpha, max_iter=2000)
self.models['Elastic Net'] = ElasticNet(alpha=self.best_elastic_alpha,
l1_ratio=0.5, max_iter=2000)
for name, model in self.models.items():
model.fit(X_train, y_train)
def plot_regularization_paths(self):
"""Plot coefficient paths for different alphas"""
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Ridge coefficients
ridge_coefs = np.array(self.ridge_results['coefs']).T
for i in range(ridge_coefs.shape[0]):
axes[0].plot(self.ridge_results['alphas'], ridge_coefs[i], alpha=0.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (λ)')
axes[0].set_ylabel('Coefficients')
axes[0].set_title('Ridge Regularization Path')
axes[0].axvline(x=self.best_ridge_alpha, color='r', linestyle='--',
label=f'Best α={self.best_ridge_alpha:.4f}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Lasso coefficients
lasso_coefs = np.array(self.lasso_results['coefs']).T
for i in range(lasso_coefs.shape[0]):
axes[1].plot(self.lasso_results['alphas'], lasso_coefs[i], alpha=0.5)
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha (λ)')
axes[1].set_ylabel('Coefficients')
axes[1].set_title('Lasso Regularization Path')
axes[1].axvline(x=self.best_lasso_alpha, color='r', linestyle='--',
label=f'Best α={self.best_lasso_alpha:.4f}')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Elastic Net coefficients
elastic_coefs = np.array(self.elastic_results['coefs']).T
for i in range(elastic_coefs.shape[0]):
axes[2].plot(self.elastic_results['alphas'], elastic_coefs[i], alpha=0.5)
axes[2].set_xscale('log')
axes[2].set_xlabel('Alpha (λ)')
axes[2].set_ylabel('Coefficients')
axes[2].set_title('Elastic Net Regularization Path')
axes[2].axvline(x=self.best_elastic_alpha, color='r', linestyle='--',
label=f'Best α={self.best_elastic_alpha:.4f}')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
plt.suptitle('Regularization Paths', fontsize=14)
plt.tight_layout()
plt.show()
def plot_alpha_vs_score(self):
"""Plot alpha vs model score"""
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Ridge
axes[0].plot(self.ridge_results['alphas'], self.ridge_results['train_scores'],
label='Train', alpha=0.7)
axes[0].plot(self.ridge_results['alphas'], self.ridge_results['test_scores'],
label='Test', alpha=0.7)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (λ)')
axes[0].set_ylabel('R² Score')
axes[0].set_title('Ridge: Alpha vs Score')
axes[0].axvline(x=self.best_ridge_alpha, color='r', linestyle='--')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Lasso
axes[1].plot(self.lasso_results['alphas'], self.lasso_results['train_scores'],
label='Train', alpha=0.7)
axes[1].plot(self.lasso_results['alphas'], self.lasso_results['test_scores'],
label='Test', alpha=0.7)
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha (λ)')
axes[1].set_ylabel('R² Score')
axes[1].set_title('Lasso: Alpha vs Score')
axes[1].axvline(x=self.best_lasso_alpha, color='r', linestyle='--')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Elastic Net
axes[2].plot(self.elastic_results['alphas'], self.elastic_results['train_scores'],
label='Train', alpha=0.7)
axes[2].plot(self.elastic_results['alphas'], self.elastic_results['test_scores'],
label='Test', alpha=0.7)
axes[2].set_xscale('log')
axes[2].set_xlabel('Alpha (λ)')
axes[2].set_ylabel('R² Score')
axes[2].set_title('Elastic Net: Alpha vs Score')
axes[2].axvline(x=self.best_elastic_alpha, color='r', linestyle='--')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
plt.suptitle('Regularization Strength vs Model Performance', fontsize=14)
plt.tight_layout()
plt.show()
def compare_coefficients(self, true_coef=None):
"""Compare coefficients across models"""
n_features = self.models['OLS'].coef_.shape[0]
feature_names = [f'Feature {i}' for i in range(n_features)]
coef_comparison = pd.DataFrame({
'Feature': feature_names,
'True': true_coef if true_coef is not None else [np.nan] * n_features,
'OLS': self.models['OLS'].coef_,
'Ridge': self.models['Ridge'].coef_,
'Lasso': self.models['Lasso'].coef_,
'Elastic Net': self.models['Elastic Net'].coef_
})
# Count non-zero coefficients
print("\nNon-zero Coefficients:")
for col in ['OLS', 'Ridge', 'Lasso', 'Elastic Net']:
non_zero = np.sum(np.abs(coef_comparison[col]) > 1e-10)
print(f"{col}: {non_zero}/{n_features}")
return coef_comparison
# Fit regularized models
reg_models = RegularizedRegression()
reg_models.fit_all_models(X_train_reg, y_train_reg, X_test_reg, y_test_reg)
print("\n" + "="*60)
print("REGULARIZED REGRESSION")
print("="*60)
# Compare coefficients
coef_comparison = reg_models.compare_coefficients(true_coef)
print("\nCoefficient Comparison (First 10 features):")
print(coef_comparison.head(10).to_string(index=False))
# Model comparison
print("\nModel Performance Comparison:")
results = []
for name, model in reg_models.models.items():
train_score = model.score(X_train_reg, y_train_reg)
test_score = model.score(X_test_reg, y_test_reg)
train_rmse = np.sqrt(mean_squared_error(y_train_reg, model.predict(X_train_reg)))
test_rmse = np.sqrt(mean_squared_error(y_test_reg, model.predict(X_test_reg)))
results.append({
'Model': name,
'Train R²': train_score,
'Test R²': test_score,
'Train RMSE': train_rmse,
'Test RMSE': test_rmse
})
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))
# Visualizations
reg_models.plot_regularization_paths()
reg_models.plot_alpha_vs_score()
Logistic Regression
Classification with Logistic Regression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, roc_curve, confusion_matrix,
classification_report)
from sklearn.datasets import make_classification
# Generate binary classification data
X_binary, y_binary = make_classification(n_samples=500, n_features=10,
n_informative=7, n_redundant=3,
n_classes=2, weights=[0.7, 0.3],
random_state=42)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
X_binary, y_binary, test_size=0.3, stratify=y_binary, random_state=42
)
class LogisticRegressionAnalysis:
"""Comprehensive Logistic Regression Analysis"""
def __init__(self, penalty='l2', C=1.0, solver='lbfgs', max_iter=1000):
self.penalty = penalty
self.C = C
self.solver = solver
self.max_iter = max_iter
self.model = None
def sigmoid(self, z):
"""Sigmoid activation function"""
return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
def manual_implementation(self, X, y, learning_rate=0.01, n_iterations=1000):
"""
Manual implementation using gradient descent
"""
n_samples, n_features = X.shape
# Initialize parameters
self.weights = np.zeros(n_features)
self.bias = 0
# Cost history
cost_history = []
for i in range(n_iterations):
# Forward propagation
z = X @ self.weights + self.bias
y_pred = self.sigmoid(z)
# Calculate cost (binary cross-entropy)
cost = -np.mean(y * np.log(y_pred + 1e-10) +
(1 - y) * np.log(1 - y_pred + 1e-10))
cost_history.append(cost)
# Backward propagation
dz = y_pred - y
dw = (1/n_samples) * (X.T @ dz)
db = (1/n_samples) * np.sum(dz)
# Update parameters
self.weights -= learning_rate * dw
self.bias -= learning_rate * db
return cost_history
def fit_sklearn(self, X_train, y_train, X_test=None, y_test=None):
"""Fit using scikit-learn"""
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Fit model
self.model = LogisticRegression(
penalty=self.penalty,
C=self.C,
solver=self.solver,
max_iter=self.max_iter,
random_state=42
)
self.model.fit(X_train_scaled, y_train)
# Store scaler
self.scaler = scaler
# Calculate probabilities for decision boundary
self.train_proba = self.model.predict_proba(X_train_scaled)[:, 1]
if X_test is not None:
X_test_scaled = scaler.transform(X_test)
self.test_proba = self.model.predict_proba(X_test_scaled)[:, 1]
return self
def plot_decision_boundary_2d(self, X, y):
"""Plot decision boundary for 2D data"""
if X.shape[1] != 2:
print("Can only plot decision boundary for 2D data")
return
# Create mesh
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Predict on mesh
Z = self.model.predict_proba(
self.scaler.transform(np.c_[xx.ravel(), yy.ravel()])
)[:, 1].reshape(xx.shape)
# Plot
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu_r, levels=20)
plt.colorbar(label='Probability of Class 1')
# Plot points
scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu_r,
edgecolor='black', s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Logistic Regression Decision Boundary')
# Add decision boundary line
plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
plt.show()
def plot_probability_distribution(self, y_true, y_proba):
"""Plot probability distributions by class"""
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Histogram of probabilities by true class
axes[0].hist(y_proba[y_true == 0], bins=30, alpha=0.5,
label='Class 0', density=True, color='blue')
axes[0].hist(y_proba[y_true == 1], bins=30, alpha=0.5,
label='Class 1', density=True, color='red')
axes[0].axvline(x=0.5, color='black', linestyle='--',
label='Default Threshold')
axes[0].set_xlabel('Predicted Probability')
axes[0].set_ylabel('Density')
axes[0].set_title('Probability Distribution by True Class')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Calibration plot
from sklearn.calibration import calibration_curve
fraction_pos, mean_pred = calibration_curve(y_true, y_proba, n_bins=10)
axes[1].plot(mean_pred, fraction_pos, 'o-', label='Model')
axes[1].plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
axes[1].set_xlabel('Mean Predicted Probability')
axes[1].set_ylabel('Fraction of Positives')
axes[1].set_title('Calibration Plot')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.suptitle('Logistic Regression Probability Analysis', fontsize=14)
plt.tight_layout()
plt.show()
def coefficient_analysis(self, feature_names):
"""Analyze and visualize coefficients"""
if self.model is None:
print("Model not fitted yet")
return
# Get coefficients
coefs = self.model.coef_[0]
# Calculate odds ratios
odds_ratios = np.exp(coefs)
# Create DataFrame
coef_df = pd.DataFrame({
'Feature': feature_names,
'Coefficient': coefs,
'Odds Ratio': odds_ratios,
'Impact': odds_ratios - 1 # Percentage change in odds
}).sort_values('Coefficient', key=abs, ascending=False)
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Coefficients
y_pos = np.arange(len(feature_names))
axes[0].barh(y_pos, coef_df['Coefficient'].values)
axes[0].set_yticks(y_pos)
axes[0].set_yticklabels(coef_df['Feature'].values)
axes[0].set_xlabel('Coefficient')
axes[0].set_title('Feature Coefficients')
axes[0].axvline(x=0, color='red', linestyle='--')
axes[0].grid(True, alpha=0.3)
# Odds Ratios
axes[1].barh(y_pos, coef_df['Odds Ratio'].values)
axes[1].set_yticks(y_pos)
axes[1].set_yticklabels(coef_df['Feature'].values)
axes[1].set_xlabel('Odds Ratio')
axes[1].set_title('Odds Ratios')
axes[1].axvline(x=1, color='red', linestyle='--')
axes[1].grid(True, alpha=0.3)
plt.suptitle('Logistic Regression Coefficient Analysis', fontsize=14)
plt.tight_layout()
plt.show()
return coef_df
# Logistic Regression Analysis
print("\n" + "="*60)
print("LOGISTIC REGRESSION")
print("="*60)
# Manual implementation
log_reg_manual = LogisticRegressionAnalysis()
cost_history = log_reg_manual.manual_implementation(X_train_b, y_train_b,
learning_rate=0.1,
n_iterations=500)
# Scikit-learn implementation
log_reg = LogisticRegressionAnalysis(penalty='l2', C=1.0)
log_reg.fit_sklearn(X_train_b, y_train_b, X_test_b, y_test_b)
# Predictions
y_pred_train = log_reg.model.predict(log_reg.scaler.transform(X_train_b))
y_pred_test = log_reg.model.predict(log_reg.scaler.transform(X_test_b))
y_proba_test = log_reg.test_proba
# Model evaluation
print("\nModel Performance:")
print(f"Train Accuracy: {accuracy_score(y_train_b, y_pred_train):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test_b, y_pred_test):.4f}")
print(f"Precision: {precision_score(y_test_b, y_pred_test):.4f}")
print(f"Recall: {recall_score(y_test_b, y_pred_test):.4f}")
print(f"F1-Score: {f1_score(y_test_b, y_pred_test):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test_b, y_proba_test):.4f}")
print("\nClassification Report:")
print(classification_report(y_test_b, y_pred_test,
target_names=['Class 0', 'Class 1']))
# Coefficient analysis
feature_names_binary = [f'Feature_{i+1}' for i in range(X_binary.shape[1])]
coef_df = log_reg.coefficient_analysis(feature_names_binary)
print("\nTop 5 Most Important Features:")
print(coef_df.head().to_string(index=False))
# Probability analysis
log_reg.plot_probability_distribution(y_test_b, y_proba_test)
# For 2D visualization, create a simple 2D dataset
X_2d, y_2d = make_classification(n_samples=200, n_features=2,
n_redundant=0, n_informative=2,
n_clusters_per_class=1, random_state=42)
log_reg_2d = LogisticRegressionAnalysis()
log_reg_2d.fit_sklearn(X_2d, y_2d)
log_reg_2d.plot_decision_boundary_2d(X_2d, y_2d)
Multi-class Classification
One-vs-Rest and Multinomial
# Generate multi-class data
X_multi, y_multi = make_classification(n_samples=600, n_features=10,
n_informative=8, n_redundant=2,
n_classes=3, n_clusters_per_class=1,
random_state=42)
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
X_multi, y_multi, test_size=0.3, stratify=y_multi, random_state=42
)
# Compare multi-class strategies
class MulticlassLogisticRegression:
"""Compare multi-class logistic regression strategies"""
def __init__(self):
self.models = {}
self.results = {}
def fit_models(self, X_train, y_train, X_test, y_test):
"""Fit different multi-class strategies"""
# Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# One-vs-Rest (OvR)
self.models['OvR'] = LogisticRegression(
multi_class='ovr', solver='liblinear', random_state=42
)
self.models['OvR'].fit(X_train_scaled, y_train)
# Multinomial
self.models['Multinomial'] = LogisticRegression(
multi_class='multinomial', solver='lbfgs', random_state=42
)
self.models['Multinomial'].fit(X_train_scaled, y_train)
# Evaluate both
for name, model in self.models.items():
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)
self.results[name] = {
'accuracy': accuracy_score(y_test, y_pred),
'predictions': y_pred,
'probabilities': y_proba,
'confusion_matrix': confusion_matrix(y_test, y_pred)
}
self.scaler = scaler
return self
def compare_performance(self):
"""Compare model performance"""
comparison_df = pd.DataFrame({
'Model': list(self.results.keys()),
'Accuracy': [self.results[m]['accuracy'] for m in self.results]
})
print("\nMulti-class Strategy Comparison:")
print(comparison_df.to_string(index=False))
# Plot confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for idx, (name, results) in enumerate(self.results.items()):
cm = results['confusion_matrix']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx])
axes[idx].set_title(f'{name} Confusion Matrix')
axes[idx].set_xlabel('Predicted')
axes[idx].set_ylabel('Actual')
plt.suptitle('Multi-class Logistic Regression Comparison', fontsize=14)
plt.tight_layout()
plt.show()
# Fit multi-class models
mc_log_reg = MulticlassLogisticRegression()
mc_log_reg.fit_models(X_train_m, y_train_m, X_test_m, y_test_m)
mc_log_reg.compare_performance()
Best Practices and Guidelines
# Best practices for regression models
class RegressionBestPractices:
"""Guidelines for using regression models effectively"""
@staticmethod
def assumptions_checklist():
"""Linear regression assumptions"""
print("\n" + "="*60)
print("LINEAR REGRESSION ASSUMPTIONS CHECKLIST")
print("="*60)
assumptions = [
{
'assumption': 'Linearity',
'check': 'Plot residuals vs fitted values',
'violation_sign': 'Pattern in residual plot',
'solution': 'Transform variables or use polynomial features'
},
{
'assumption': 'Independence',
'check': 'Durbin-Watson test',
'violation_sign': 'Autocorrelation in residuals',
'solution': 'Use time series methods or GLS'
},
{
'assumption': 'Homoscedasticity',
'check': 'Scale-location plot, Breusch-Pagan test',
'violation_sign': 'Funnel shape in residual plot',
'solution': 'Transform target or use weighted least squares'
},
{
'assumption': 'Normality of residuals',
'check': 'Q-Q plot, Shapiro-Wilk test',
'violation_sign': 'Deviation from diagonal in Q-Q plot',
'solution': 'Transform target or use robust regression'
},
{
'assumption': 'No multicollinearity',
'check': 'VIF, correlation matrix',
'violation_sign': 'VIF > 10',
'solution': 'Remove correlated features or use regularization'
}
]
for i, item in enumerate(assumptions, 1):
print(f"\n{i}. {item['assumption']}")
print(f" Check: {item['check']}")
print(f" Violation sign: {item['violation_sign']}")
print(f" Solution: {item['solution']}")
@staticmethod
def when_to_use_guide():
"""Guide for choosing regression type"""
print("\n" + "="*60)
print("WHEN TO USE EACH REGRESSION TYPE")
print("="*60)
guide = {
'Linear Regression': {
'use_when': [
'Continuous target variable',
'Linear relationship exists',
'Need interpretability',
'Assumptions are met'
],
'avoid_when': [
'Binary/categorical target',
'Non-linear relationships',
'High multicollinearity'
]
},
'Ridge Regression': {
'use_when': [
'Multicollinearity present',
'Many features',
'Overfitting in OLS',
'All features potentially relevant'
],
'avoid_when': [
'Need exact feature selection',
'Sparse solutions required'
]
},
'Lasso Regression': {
'use_when': [
'Feature selection needed',
'Sparse solutions desired',
'Many irrelevant features',
'Interpretability important'
],
'avoid_when': [
'Groups of correlated features',
'All features are important'
]
},
'Elastic Net': {
'use_when': [
'Correlated feature groups',
'Need balance between Ridge and Lasso',
'High-dimensional data'
],
'avoid_when': [
'Simple problems',
'Few features'
]
},
'Logistic Regression': {
'use_when': [
'Binary/multi-class classification',
'Need probability estimates',
'Linear decision boundary sufficient',
'Interpretability important'
],
'avoid_when': [
'Non-linear decision boundary',
'Complex interactions',
'Continuous target'
]
}
}
for model, details in guide.items():
print(f"\n{model}:")
print(" Use when:")
for condition in details['use_when']:
print(f" • {condition}")
print(" Avoid when:")
for condition in details['avoid_when']:
print(f" • {condition}")
@staticmethod
def hyperparameter_tuning_tips():
"""Tips for hyperparameter tuning"""
print("\n" + "="*60)
print("HYPERPARAMETER TUNING TIPS")
print("="*60)
tips = """
1. Regularization Strength (alpha/C):
• Start with log scale: [0.001, 0.01, 0.1, 1, 10, 100]
• Use CV to find optimal value
• Higher alpha = more regularization
2. Solver Selection:
• Small datasets: 'liblinear'
• Large datasets: 'sag' or 'saga'
• Multinomial: 'lbfgs' or 'newton-cg'
3. L1 Ratio (Elastic Net):
• 0 = Ridge, 1 = Lasso
• Try [0.1, 0.5, 0.7, 0.9, 0.95, 0.99]
4. Cross-validation:
• Use 5-fold or 10-fold CV
• Consider stratified CV for classification
• Use time series CV for temporal data
"""
print(tips)
# Print best practices
practices = RegressionBestPractices()
practices.assumptions_checklist()
practices.when_to_use_guide()
practices.hyperparameter_tuning_tips()
# Summary comparison of all regression types
print("\n" + "="*60)
print("REGRESSION METHODS SUMMARY")
print("="*60)
summary_data = {
'Method': ['OLS', 'Ridge', 'Lasso', 'Elastic Net', 'Logistic'],
'Type': ['Regression', 'Regression', 'Regression', 'Regression', 'Classification'],
'Regularization': ['None', 'L2', 'L1', 'L1+L2', 'L2 (optional)'],
'Feature Selection': ['No', 'No', 'Yes', 'Yes', 'No'],
'Handles Multicollinearity': ['No', 'Yes', 'Partially', 'Yes', 'Partially'],
'Interpretability': ['High', 'High', 'High', 'High', 'High'],
'Computational Cost': ['Low', 'Low', 'Medium', 'Medium', 'Low']
}
summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
Practice Exercises
Exercise 1: Build a Complete Regression Pipeline
Create a comprehensive regression pipeline that:
- Performs exploratory data analysis
- Checks regression assumptions
- Compares OLS, Ridge, Lasso, and Elastic Net
- Implements cross-validation for hyperparameter tuning
- Provides diagnostic plots and interpretation
Exercise 2: Custom Regularization
Implement a custom regularization method that:
- Combines L1 and L2 penalties with custom weights
- Includes feature-specific regularization strengths
- Implements coordinate descent optimization
- Compares with standard methods
Exercise 3: Logistic Regression Extensions
Extend logistic regression to handle:
- Imbalanced classes with class weights
- Ordinal regression for ordered categories
- Multi-label classification
- Custom loss functions
- Confidence intervals for predictions
Key Takeaways
- 📈 Linear regression assumes linearity, independence, homoscedasticity, and normality
- 🎯 Always check assumptions before interpreting results
- 🔧 Regularization prevents overfitting and handles multicollinearity
- ⚖️ Ridge keeps all features, Lasso performs feature selection
- 🔄 Elastic Net combines benefits of Ridge and Lasso
- 📊 Logistic regression outputs probabilities for classification
- 🎲 Use cross-validation to select regularization strength
- 📉 Monitor both training and test performance
- 🔍 Diagnostic plots reveal assumption violations
- 💡 Interpretability is a key advantage of these methods