Skip to main content

Random Forests

The Power of Ensemble Learning! 🌲🌲🌲

Random Forests combine the wisdom of crowds with the power of decision trees. By building hundreds of trees and letting them vote, Random Forests achieve remarkable accuracy while maintaining robustness against overfitting. Master this versatile algorithm that excels at both classification and regression tasks.

Random Forest Architecture

graph TD
    A[Random Forest] --> B[Bootstrap Sampling]
    A --> C[Random Feature Selection]
    A --> D[Multiple Trees]
    B --> E[Sample 1]
    B --> F[Sample 2]
    B --> G[Sample n]
    C --> H[Features Subset 1]
    C --> I[Features Subset 2]
    C --> J[Features Subset m]
    D --> K[Tree 1]
    D --> L[Tree 2]
    D --> M[Tree n]
    K --> N[Predictions]
    L --> N
    M --> N
    N --> O[Voting/Averaging]
    O --> P[Final Prediction]
    style A fill:#f9f,stroke:#333,stroke-width:2px
    style O fill:#bbf,stroke:#333,stroke-width:2px
    style P fill:#9f9,stroke:#333,stroke-width:2px

Understanding Random Forests

Core Concepts and Implementation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           mean_squared_error, r2_score, mean_absolute_error)
from sklearn.datasets import make_classification, make_regression, load_iris, load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import warnings
# Silence library warnings so the tutorial output stays readable
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid matplotlib style plus the "husl" seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Settings shared by both synthetic datasets below
_dataset_kwargs = dict(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    random_state=42,
)

# Three-class classification problem (15 informative + 5 redundant features)
X_class, y_class = make_classification(
    n_redundant=5, n_classes=3, **_dataset_kwargs
)

# Regression problem generated with noise=0.1
X_reg, y_reg = make_regression(noise=0.1, **_dataset_kwargs)

class RandomForestAnalyzer:
    """Comprehensive Random Forest Analysis Tool

    Fits a scikit-learn random forest (classifier or regressor, chosen by
    ``task``) and offers plots for tree diversity, feature importance and
    convergence with the number of trees.  The analysis methods read the
    data and predictions stored by :meth:`fit_random_forest`.
    """

    def __init__(self, task='classification'):
        """Create an analyzer.

        Args:
            task: ``'classification'`` selects ``RandomForestClassifier``;
                any other value selects ``RandomForestRegressor``.
        """
        self.task = task
        self.model = None
        # Bootstrap index arrays recorded by manual_bagging_demo (one per tree)
        self.bootstrap_samples = []
        self.oob_scores = []

    def manual_bagging_demo(self, X, y, n_estimators=5, random_state=None):
        """
        Demonstrate bagging concept manually

        Trains ``n_estimators`` depth-3 decision tree classifiers, each on a
        bootstrap resample of ``(X, y)``, prints how many unique / out-of-bag
        rows each resample has, and combines the trees by majority vote.

        Args:
            X: Feature array indexable by an integer index array.
            y: Labels; must be non-negative integers because the vote uses
                ``np.bincount``.
            n_estimators: Number of bootstrap trees to train.
            random_state: Optional seed for the bootstrap draws.  ``None``
                (default) uses NumPy's global random state, as before.

        Returns:
            Tuple ``(ensemble_pred, predictions)``: the majority-vote labels
            for every row of ``X`` and the ``(n_estimators, n_samples)``
            array of per-tree predictions.
        """
        n_samples = X.shape[0]
        # The np.random module itself exposes choice(), so unseeded calls keep
        # drawing from the global stream exactly as they always did.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        predictions = []

        print("Manual Bagging Demonstration")
        print("="*50)

        for i in range(n_estimators):
            # Bootstrap sampling: n_samples draws with replacement
            indices = rng.choice(n_samples, n_samples, replace=True)
            unique_indices = np.unique(indices)

            # Rows never drawn are this tree's out-of-bag samples
            oob_indices = np.setdiff1d(np.arange(n_samples), unique_indices)

            # Train simple decision tree on the resample
            tree = DecisionTreeClassifier(max_depth=3, random_state=i)
            tree.fit(X[indices], y[indices])

            # Every tree predicts the full input so votes line up row by row
            predictions.append(tree.predict(X))

            unique_ratio = len(unique_indices) / n_samples

            print(f"Tree {i+1}:")
            print(f"  Unique samples: {len(unique_indices)}/{n_samples} ({unique_ratio:.1%})")
            print(f"  OOB samples: {len(oob_indices)}")

            self.bootstrap_samples.append(indices)

        # Ensemble prediction (majority voting down each column)
        predictions = np.array(predictions)
        ensemble_pred = np.apply_along_axis(
            lambda votes: np.bincount(votes).argmax(), 0, predictions
        )

        return ensemble_pred, predictions

    def fit_random_forest(self, X_train, y_train, X_test, y_test, **kwargs):
        """Fit Random Forest with analysis

        Recognised keyword arguments (with defaults): ``n_estimators`` (100),
        ``max_depth`` (None), ``min_samples_split`` (2), ``min_samples_leaf``
        (1), ``max_features`` ('sqrt'), ``bootstrap`` (True) and
        ``oob_score`` (defaults to the value of ``bootstrap``).  The model is
        always built with ``random_state=42`` and ``n_jobs=-1``.

        Stores the fitted model, the train/test data and the train/test
        predictions on the instance.

        Returns:
            ``self``, so calls can be chained.
        """
        if self.task == 'classification':
            forest_cls = RandomForestClassifier
        else:
            forest_cls = RandomForestRegressor

        use_bootstrap = kwargs.get('bootstrap', True)
        self.model = forest_cls(
            n_estimators=kwargs.get('n_estimators', 100),
            max_depth=kwargs.get('max_depth', None),
            min_samples_split=kwargs.get('min_samples_split', 2),
            min_samples_leaf=kwargs.get('min_samples_leaf', 1),
            max_features=kwargs.get('max_features', 'sqrt'),
            bootstrap=use_bootstrap,
            # OOB scoring needs bootstrap samples, so follow the bootstrap
            # flag unless the caller asks for something explicit.
            oob_score=kwargs.get('oob_score', bool(use_bootstrap)),
            random_state=42,
            n_jobs=-1
        )

        # Fit model
        self.model.fit(X_train, y_train)

        # Predictions
        self.y_train_pred = self.model.predict(X_train)
        self.y_test_pred = self.model.predict(X_test)

        # Store data for analysis
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        return self

    @staticmethod
    def _pairwise_agreement(tree_predictions):
        """Return the matrix of row-wise exact-match rates between trees."""
        preds = np.asarray(tree_predictions)
        # Broadcasting compares every tree with every other tree in one pass
        return (preds[:, None, :] == preds[None, :, :]).mean(axis=2)

    @staticmethod
    def _importance_table(importances, std, perm_mean, perm_std):
        """Build the importance DataFrame, sorted by MDI importance (desc)."""
        table = pd.DataFrame({
            'Feature': [f'Feature_{i}' for i in range(len(importances))],
            'Importance': importances,
            'Std': std,
            'Permutation_Importance': perm_mean,
            'Perm_Std': perm_std,
        })
        # Sort only after every column is attached so rows stay aligned
        return table.sort_values('Importance', ascending=False)

    @staticmethod
    def _majority_vote(encoded_votes, classes):
        """Majority-vote per column and map winning indices to ``classes``."""
        votes = np.asarray(encoded_votes).astype(int)
        winners = np.apply_along_axis(
            lambda column: np.bincount(column).argmax(), 0, votes
        )
        return np.asarray(classes)[winners]

    def analyze_tree_diversity(self):
        """Analyze diversity among trees in the forest

        Uses up to the first 100 test rows, computes the fraction of rows on
        which each pair of trees predicts exactly the same value, and plots a
        heatmap (first 20 trees) plus a histogram of the pairwise values.
        Agreement is an exact-equality test, so it is mainly meaningful for
        classification.

        Returns:
            The ``(n_trees, n_trees)`` agreement matrix, or ``None`` (after
            printing a message) when no fitted model is available.
        """
        if not hasattr(self.model, 'estimators_'):
            print("Model not fitted yet!")
            return

        n_trees = len(self.model.estimators_)
        n_samples = min(100, len(self.X_test))  # Use subset for efficiency
        X_subset = self.X_test[:n_samples]

        # Get predictions from each tree: shape (n_trees, n_samples)
        tree_predictions = np.array([
            tree.predict(X_subset) for tree in self.model.estimators_
        ])

        agreement_matrix = self._pairwise_agreement(tree_predictions)

        # Visualize
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Agreement heatmap
        sns.heatmap(agreement_matrix[:20, :20], cmap='coolwarm',
                    vmin=0, vmax=1, center=0.5,
                    ax=axes[0], cbar_kws={'label': 'Agreement'})
        axes[0].set_title('Tree Agreement Matrix (First 20 Trees)')
        axes[0].set_xlabel('Tree Index')
        axes[0].set_ylabel('Tree Index')

        # Agreement distribution over distinct pairs (strict upper triangle)
        upper_triangle = agreement_matrix[np.triu_indices(n_trees, k=1)]
        mean_agreement = upper_triangle.mean()
        axes[1].hist(upper_triangle, bins=30, edgecolor='black', alpha=0.7)
        axes[1].axvline(x=mean_agreement, color='red',
                        linestyle='--', label=f'Mean: {mean_agreement:.3f}')
        axes[1].set_xlabel('Pairwise Agreement')
        axes[1].set_ylabel('Frequency')
        axes[1].set_title('Distribution of Tree Agreement')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.suptitle('Tree Diversity Analysis', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

        return agreement_matrix

    def feature_importance_analysis(self):
        """Comprehensive feature importance analysis

        Combines the forest's impurity-based importances (with their
        standard deviation across trees) and permutation importances computed
        on the stored test set, then draws four panels: top MDI features,
        the same features' permutation importance, an MDI-vs-permutation
        scatter, and the cumulative MDI importance curve.

        Returns:
            DataFrame with columns ``Feature``, ``Importance``, ``Std``,
            ``Permutation_Importance`` and ``Perm_Std``, sorted by
            ``Importance`` descending; ``None`` (after printing a message)
            when no model has been fitted.
        """
        if self.model is None:
            print("Model not fitted yet!")
            return

        # Impurity-based importances and their spread across trees
        importances = self.model.feature_importances_
        std = np.std([tree.feature_importances_
                      for tree in self.model.estimators_], axis=0)

        # Permutation importance for comparison
        perm_importance = permutation_importance(
            self.model, self.X_test, self.y_test,
            n_repeats=10, random_state=42, n_jobs=-1
        )

        importance_df = self._importance_table(
            importances, std,
            perm_importance.importances_mean, perm_importance.importances_std
        )

        # Visualization
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # 1. MDI Feature Importance (never ask for more rows than exist)
        top_n = min(10, len(importance_df))
        top_features = importance_df.head(top_n)
        positions = range(top_n)

        axes[0, 0].barh(positions, top_features['Importance'].values,
                        xerr=top_features['Std'].values, alpha=0.7)
        axes[0, 0].set_yticks(positions)
        axes[0, 0].set_yticklabels(top_features['Feature'].values)
        axes[0, 0].set_xlabel('Mean Decrease in Impurity')
        axes[0, 0].set_title(f'Top {top_n} Features (MDI)')
        axes[0, 0].grid(True, alpha=0.3)

        # 2. Permutation Importance
        axes[0, 1].barh(positions, top_features['Permutation_Importance'].values)
        axes[0, 1].set_yticks(positions)
        axes[0, 1].set_yticklabels(top_features['Feature'].values)
        axes[0, 1].set_xlabel('Permutation Importance')
        axes[0, 1].set_title(f'Top {top_n} Features (Permutation)')
        axes[0, 1].grid(True, alpha=0.3)

        # 3. Importance Comparison
        axes[1, 0].scatter(importance_df['Importance'],
                           importance_df['Permutation_Importance'], alpha=0.6)
        axes[1, 0].set_xlabel('MDI Importance')
        axes[1, 0].set_ylabel('Permutation Importance')
        axes[1, 0].set_title('MDI vs Permutation Importance')
        axes[1, 0].grid(True, alpha=0.3)

        # Add diagonal line
        max_val = max(importance_df['Importance'].max(),
                      importance_df['Permutation_Importance'].max())
        axes[1, 0].plot([0, max_val], [0, max_val], 'r--', alpha=0.5)

        # 4. Cumulative Importance, plotted against a 1-based feature count
        #    so the "features for 80%" marker sits on the curve.
        cumsum = np.cumsum(importance_df['Importance'].values)
        cumulative_share = cumsum / cumsum[-1]
        feature_counts = np.arange(1, len(cumulative_share) + 1)
        n_features_80 = int(np.argmax(cumulative_share >= 0.8)) + 1

        axes[1, 1].plot(feature_counts, cumulative_share, 'b-', linewidth=2)
        axes[1, 1].axhline(y=0.8, color='r', linestyle='--',
                           label='80% variance explained')
        axes[1, 1].axvline(x=n_features_80, color='g', linestyle='--',
                           label=f'{n_features_80} features for 80%')
        axes[1, 1].set_xlabel('Number of Features')
        axes[1, 1].set_ylabel('Cumulative Importance')
        axes[1, 1].set_title('Cumulative Feature Importance')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)

        plt.suptitle('Feature Importance Analysis', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

        return importance_df

    def convergence_analysis(self):
        """Analyze model convergence with number of trees

        Scores the ensemble formed by the first ``n`` trees for
        ``n = 1, 6, 11, ...`` on the stored train and test sets (accuracy of
        the majority vote for classification, R² of the mean prediction for
        regression) and plots the curves plus a rolling mean / std of the
        test score.

        Returns:
            Tuple ``(train_scores, test_scores)`` of lists, one entry per
            evaluated forest size; ``None`` (after printing a message) when
            no model has been fitted.
        """
        if self.model is None:
            print("Model not fitted yet!")
            return

        estimators = self.model.estimators_
        n_estimators_range = range(1, len(estimators) + 1, 5)

        # Predict once per tree; each prefix below is just a slice of these.
        all_train = np.array([tree.predict(self.X_train) for tree in estimators])
        all_test = np.array([tree.predict(self.X_test) for tree in estimators])

        train_scores = []
        test_scores = []

        for n in n_estimators_range:
            if self.task == 'classification':
                # Per-tree outputs are treated as positions in the forest's
                # classes_ array and translated back to the original labels.
                classes = self.model.classes_
                ensemble_train = self._majority_vote(all_train[:n], classes)
                ensemble_test = self._majority_vote(all_test[:n], classes)
                train_score = accuracy_score(self.y_train, ensemble_train)
                test_score = accuracy_score(self.y_test, ensemble_test)
            else:
                ensemble_train = np.mean(all_train[:n], axis=0)
                ensemble_test = np.mean(all_test[:n], axis=0)
                train_score = r2_score(self.y_train, ensemble_train)
                test_score = r2_score(self.y_test, ensemble_test)

            train_scores.append(train_score)
            test_scores.append(test_score)

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Score convergence
        axes[0].plot(n_estimators_range, train_scores, label='Train', alpha=0.7)
        axes[0].plot(n_estimators_range, test_scores, label='Test', alpha=0.7)
        if hasattr(self.model, 'oob_score_'):
            axes[0].axhline(y=self.model.oob_score_, color='g',
                            linestyle='--', label=f'OOB Score: {self.model.oob_score_:.3f}')
        axes[0].set_xlabel('Number of Trees')
        axes[0].set_ylabel('Score')
        axes[0].set_title('Model Convergence')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Score stability (variance reduction); needs more points than the window
        window = 10
        if len(test_scores) > window:
            score_series = pd.Series(test_scores)
            rolling_mean = score_series.rolling(window).mean()
            rolling_std = score_series.rolling(window).std()

            axes[1].plot(n_estimators_range, test_scores, alpha=0.3, label='Test Score')
            axes[1].plot(n_estimators_range, rolling_mean, 'b-',
                         label=f'Rolling Mean (window={window})')
            axes[1].fill_between(n_estimators_range,
                                 rolling_mean - rolling_std,
                                 rolling_mean + rolling_std,
                                 alpha=0.2, label='±1 Std Dev')
            axes[1].set_xlabel('Number of Trees')
            axes[1].set_ylabel('Test Score')
            axes[1].set_title('Score Stability')
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.suptitle('Convergence Analysis', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

        return train_scores, test_scores

# Section banner
print("="*60, "RANDOM FOREST FUNDAMENTALS", "="*60, sep="\n")

# Stratified 70/30 hold-out split of the classification data
X_train, X_test, y_train, y_test = train_test_split(
    X_class, y_class, test_size=0.3, stratify=y_class, random_state=42
)

# Hand-rolled bagging on the first 100 training rows
rf_analyzer = RandomForestAnalyzer(task='classification')
_X_demo, _y_demo = X_train[:100], y_train[:100]
ensemble_pred, individual_preds = rf_analyzer.manual_bagging_demo(
    _X_demo, _y_demo, n_estimators=5
)
_demo_accuracy = accuracy_score(_y_demo, ensemble_pred)
print(f"\nEnsemble Accuracy: {_demo_accuracy:.3f}")

# Train the full forest with the settings used throughout this section
_forest_settings = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    max_features='sqrt',
)
rf_analyzer.fit_random_forest(X_train, y_train, X_test, y_test, **_forest_settings)

# Headline metrics
print("\n" + "="*60, "MODEL PERFORMANCE", "="*60, sep="\n")
_train_accuracy = accuracy_score(y_train, rf_analyzer.y_train_pred)
_test_accuracy = accuracy_score(y_test, rf_analyzer.y_test_pred)
print(f"Train Accuracy: {_train_accuracy:.4f}")
print(f"Test Accuracy: {_test_accuracy:.4f}")
if hasattr(rf_analyzer.model, 'oob_score_'):
    print(f"OOB Score: {rf_analyzer.model.oob_score_:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, rf_analyzer.y_test_pred))

# How similar are the individual trees to each other?
agreement_matrix = rf_analyzer.analyze_tree_diversity()

# Which features drive the model?
importance_df = rf_analyzer.feature_importance_analysis()
print("\nTop 5 Most Important Features:")
_summary_columns = ['Feature', 'Importance', 'Permutation_Importance']
print(importance_df[_summary_columns].head())

# How do the scores settle as trees are added?
train_scores, test_scores = rf_analyzer.convergence_analysis()

Random Forest vs Single Decision Tree

Comparing Performance and Robustness

class ForestVsTreeComparison:
    """Compare Random Forest with Single Decision Tree

    Holds a dict of named tree-based classifiers (``models``) and the
    metrics recorded for each of them by :meth:`compare_models`
    (``results``).
    """

    def __init__(self):
        self.models = {}
        self.results = {}

    def compare_models(self, X_train, y_train, X_test, y_test):
        """Compare different tree-based models

        Builds a single decision tree, a random forest and an extra-trees
        classifier (all ``max_depth=10``, ``random_state=42``), fits each on
        the training data and records, per model, ``train_accuracy``,
        ``test_accuracy``, ``overfitting`` (train minus test accuracy) and
        the test predictions ``y_test_pred`` in ``self.results``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Single Decision Tree
        self.models['Single Tree'] = DecisionTreeClassifier(
            max_depth=10, random_state=42
        )

        # Random Forest
        self.models['Random Forest'] = RandomForestClassifier(
            n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
        )

        # Extremely Randomized Trees
        from sklearn.ensemble import ExtraTreesClassifier
        self.models['Extra Trees'] = ExtraTreesClassifier(
            n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
        )

        # Fit and evaluate all models
        for name, model in self.models.items():
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # Score each split once and reuse the values for the gap
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            self.results[name] = {
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
                'overfitting': train_accuracy - test_accuracy,
                'y_test_pred': y_test_pred
            }

        return self

    def visualize_decision_boundaries(self, X, y):
        """Visualize decision boundaries (for 2D data)

        Draws one panel per model in ``self.models`` showing the predicted
        class over a mesh covering the data, with the points overlaid.  The
        models must already be fitted on two-feature data.  Prints a message
        and returns without plotting when ``X`` is not two-dimensional or
        when there are no models.
        """
        if X.shape[1] != 2:
            print("Can only visualize 2D data")
            return

        n_panels = len(self.models)
        if n_panels == 0:
            print("No models to visualize; call compare_models first")
            return

        # One panel per model (15x5 inches for the usual three models)
        fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5))
        axes = np.atleast_1d(axes)

        # Create mesh with a one-unit margin around the data
        h = 0.02
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        mesh_points = np.c_[xx.ravel(), yy.ravel()]

        for ax, (name, model) in zip(axes, self.models.items()):
            # Predict on mesh
            Z = model.predict(mesh_points).reshape(xx.shape)

            # Plot
            ax.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
            ax.scatter(X[:, 0], X[:, 1], c=y,
                       cmap=plt.cm.RdYlBu, edgecolor='black', s=30)
            ax.set_title(f'{name}')
            ax.set_xlabel('Feature 1')
            ax.set_ylabel('Feature 2')

        plt.suptitle('Decision Boundaries Comparison', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

    def noise_robustness_test(self, X, y, noise_levels=(0, 0.1, 0.2, 0.3, 0.4),
                              random_state=None):
        """Test robustness to noise

        For every noise level, adds zero-mean normal noise with that
        standard deviation to all features, makes a 70/30 split, fits a
        fresh copy of every model in ``self.models`` and records its test
        accuracy.  The models stored on the instance are not refitted, so
        ``self.results`` stays consistent with them.

        Args:
            X: Feature array.
            y: Labels.
            noise_levels: Iterable of noise standard deviations.
            random_state: Optional seed for the noise.  ``None`` (default)
                draws from NumPy's global random state, as before.

        Returns:
            Dict mapping model name to the list of test accuracies, in the
            order of ``noise_levels``.
        """
        from sklearn.base import clone

        noise_levels = list(noise_levels)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        results = {name: [] for name in self.models}

        for noise_level in noise_levels:
            # Add noise to features
            X_noisy = X + rng.normal(0, noise_level, X.shape)

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X_noisy, y, test_size=0.3, random_state=42
            )

            # Evaluate an unfitted copy of each model
            for name, model in self.models.items():
                candidate = clone(model)
                candidate.fit(X_train, y_train)
                score = accuracy_score(y_test, candidate.predict(X_test))
                results[name].append(score)

        # Visualization
        plt.figure(figsize=(10, 6))
        for name, scores in results.items():
            plt.plot(noise_levels, scores, marker='o', label=name, linewidth=2)

        plt.xlabel('Noise Level (Standard Deviation)')
        plt.ylabel('Test Accuracy')
        plt.title('Model Robustness to Feature Noise')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

        return results

# Compare models
comparison = ForestVsTreeComparison()
comparison.compare_models(X_train, y_train, X_test, y_test)

print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)

# Print results: keep only the scalar metrics. The per-model prediction
# arrays stored under 'y_test_pred' would otherwise be dumped into the table,
# and the metric columns would stay object-typed.
comparison_df = (
    pd.DataFrame(comparison.results).T
    .drop(columns='y_test_pred')
    .astype(float)
)
print(comparison_df.to_string())

# Test noise robustness
print("\nTesting Noise Robustness...")
noise_results = comparison.noise_robustness_test(X_class, y_class)

# Create 2D dataset for visualization
from sklearn.datasets import make_moons
X_2d, y_2d = make_moons(n_samples=300, noise=0.3, random_state=42)

# Fit models on 2D data
comparison_2d = ForestVsTreeComparison()
# The same data is passed as train and test: only the fitted decision
# boundaries are of interest here, not the recorded accuracies.
comparison_2d.compare_models(X_2d, y_2d, X_2d, y_2d)
comparison_2d.visualize_decision_boundaries(X_2d, y_2d)

Hyperparameter Tuning

Optimizing Random Forest Performance

class RandomForestTuner:
    """Comprehensive hyperparameter tuning for Random Forests

    ``task`` decides whether classifiers (scored by accuracy) or regressors
    (scored by R²) are tuned.
    """

    def __init__(self, task='classification'):
        self.task = task
        self.best_model = None
        self.cv_results = None
        self.param_importance = {}

    def grid_search_tuning(self, X_train, y_train):
        """Perform grid search with cross-validation

        Runs a 5-fold ``GridSearchCV`` over a fixed grid, stores the best
        estimator in ``self.best_model`` and the full CV table in
        ``self.cv_results``.

        Returns:
            Tuple ``(best_params, best_score)``.
        """
        classifying = self.task == 'classification'
        if classifying:
            model = RandomForestClassifier(random_state=42)
        else:
            model = RandomForestRegressor(random_state=42)

        # The search space is the same for both tasks
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', 0.5]
        }

        search = GridSearchCV(
            model, param_grid,
            cv=5, scoring='accuracy' if classifying else 'r2',
            n_jobs=-1, verbose=1
        )
        search.fit(X_train, y_train)

        self.best_model = search.best_estimator_
        self.cv_results = pd.DataFrame(search.cv_results_)

        return search.best_params_, search.best_score_

    def parameter_importance_analysis(self, X_train, y_train):
        """Analyze importance of each hyperparameter

        Varies one hyperparameter at a time around a fixed baseline, scores
        every setting with 3-fold cross-validation and plots one sensitivity
        curve per hyperparameter.

        Returns:
            Dict mapping each hyperparameter name to a dict with keys
            ``values``, ``scores`` (mean CV score per value) and
            ``variance`` (variance of those scores).  The same dict is
            stored in ``self.param_importance``.
        """
        baseline = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'max_features': 'sqrt',
            'random_state': 42
        }

        sweeps = {
            'n_estimators': [10, 50, 100, 200, 500],
            'max_depth': [3, 5, 10, 20, None],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 4, 8],
            'max_features': [0.3, 0.5, 'sqrt', 'log2', None]
        }

        results = {}

        for param_name, candidates in sweeps.items():
            scores = []

            for value in candidates:
                # Baseline with exactly one setting overridden
                params = {**baseline, param_name: value}

                if self.task == 'classification':
                    model = RandomForestClassifier(**params)
                else:
                    model = RandomForestRegressor(**params)

                fold_scores = cross_val_score(
                    model, X_train, y_train, cv=3,
                    scoring='accuracy' if self.task == 'classification' else 'r2'
                )
                scores.append(fold_scores.mean())

            results[param_name] = {
                'values': candidates,
                'scores': scores,
                'variance': np.var(scores)
            }

        self.param_importance = results

        # Visualization: a 2x3 grid, one panel per hyperparameter
        fig, grid = plt.subplots(2, 3, figsize=(15, 10))
        panels = grid.flatten()

        for panel, (param_name, data) in zip(panels, results.items()):
            # Categorical x-axis: every setting (including None) as text
            tick_labels = [str(v) for v in data['values']]

            panel.plot(tick_labels, data['scores'], 'o-', linewidth=2, markersize=8)
            panel.set_xlabel(param_name)
            panel.set_ylabel('CV Score')
            panel.set_title(f'{param_name} (var={data["variance"]:.4f})')
            panel.grid(True, alpha=0.3)
            panel.tick_params(axis='x', rotation=45)

        # Remove empty subplot
        if len(results) < len(panels):
            fig.delaxes(panels[-1])

        plt.suptitle('Hyperparameter Sensitivity Analysis', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

        return results

    def learning_curves(self, X_train, y_train, X_test, y_test):
        """Generate learning curves

        Trains a 100-tree forest on the first 10%, 20%, ..., 100% of the
        training rows, scores each on its own training subset and on the
        full test set, and plots both curves.

        Returns:
            Tuple ``(train_scores, test_scores)`` of lists with one entry
            per training-set fraction.
        """
        if self.task == 'classification':
            forest_cls, score_fn = RandomForestClassifier, accuracy_score
        else:
            forest_cls, score_fn = RandomForestRegressor, r2_score

        train_sizes = np.linspace(0.1, 1.0, 10)
        n_available = len(X_train)
        train_scores, test_scores = [], []

        for fraction in train_sizes:
            n_samples = int(fraction * n_available)

            # Use the leading rows as the training subset
            X_subset, y_subset = X_train[:n_samples], y_train[:n_samples]

            model = forest_cls(n_estimators=100, random_state=42)
            model.fit(X_subset, y_subset)

            train_pred = model.predict(X_subset)
            test_pred = model.predict(X_test)

            train_scores.append(score_fn(y_subset, train_pred))
            test_scores.append(score_fn(y_test, test_pred))

        # Visualization
        sizes = train_sizes * n_available
        plt.figure(figsize=(10, 6))
        plt.plot(sizes, train_scores, 'o-', label='Training score', linewidth=2)
        plt.plot(sizes, test_scores, 'o-', label='Test score', linewidth=2)
        plt.xlabel('Training Set Size')
        plt.ylabel('Score')
        plt.title('Learning Curves')
        plt.legend(loc='best')
        plt.grid(True, alpha=0.3)
        plt.show()

        return train_scores, test_scores

# Section banner
print("\n" + "="*60, "HYPERPARAMETER TUNING", "="*60, sep="\n")

tuner = RandomForestTuner(task='classification')

# Both searches below run on the first 500 training rows to keep them quick
_X_tune, _y_tune = X_train[:500], y_train[:500]

# One-at-a-time sensitivity sweep
print("Analyzing parameter importance...")
param_results = tuner.parameter_importance_analysis(_X_tune, _y_tune)

# Exhaustive grid search
print("\nPerforming grid search...")
best_params, best_score = tuner.grid_search_tuning(_X_tune, _y_tune)

print(f"\nBest Parameters: {best_params}")
print(f"Best CV Score: {best_score:.4f}")

# Learning curves use the complete train/test split
print("\nGenerating learning curves...")
train_scores, test_scores = tuner.learning_curves(
    X_train, y_train, X_test, y_test
)

Best Practices and Guidelines

class RandomForestBestPractices:
    """Best practices and guidelines for Random Forests

    Both methods are static and only print reference text to stdout.
    """

    @staticmethod
    def parameter_guidelines():
        """Guidelines for setting Random Forest parameters

        Prints, for each hyperparameter, the default, typical range, a tip
        and its computational impact.
        """
        print("\n" + "="*60)
        print("RANDOM FOREST PARAMETER GUIDELINES")
        print("="*60)

        # (parameter, default, typical range, tip, computational impact)
        rows = [
            ('n_estimators', 100, '50-500',
             'More trees = better performance but diminishing returns after ~100-200',
             'Linear with n_estimators'),
            ('max_depth', None, '3-20 or None',
             'None = fully grown trees. Limit for overfitting control',
             'Exponential with depth'),
            ('min_samples_split', 2, '2-20',
             'Higher values prevent overfitting but may underfit',
             'Reduces tree complexity'),
            ('min_samples_leaf', 1, '1-10',
             'Minimum samples in leaf nodes. Higher = smoother boundaries',
             'Reduces tree size'),
            ('max_features', 'sqrt', 'sqrt, log2, 0.3-0.8, None',
             'sqrt for classification, 1/3 for regression. Controls diversity',
             'Reduces features to consider at each split'),
            ('bootstrap', True, 'True/False',
             'True for bagging. False = use whole dataset (Extra Trees)',
             'No impact'),
            ('oob_score', False, 'True/False',
             'True to get free validation score. Only with bootstrap=True',
             'Small overhead'),
            ('n_jobs', 1, '-1 for all cores',
             'Parallelize tree building. -1 uses all CPU cores',
             'Linear speedup with cores'),
        ]

        for param, default, typical_range, tip, computational in rows:
            print(f"\n{param}:")
            print(f"  Default: {default}")
            print(f"  Typical Range: {typical_range}")
            print(f"  Tip: {tip}")
            print(f"  Computational Impact: {computational}")

    @staticmethod
    def when_to_use_random_forests():
        """When to use Random Forests

        Prints four bulleted lists: ideal use cases, advantages,
        disadvantages and situations to avoid.
        """
        print("\n" + "="*60)
        print("WHEN TO USE RANDOM FORESTS")
        print("="*60)

        sections = (
            ('Ideal For', (
                'Mixed data types (numerical and categorical)',
                'Non-linear relationships',
                'Feature importance is needed',
                'Robust predictions without much tuning',
                'Both classification and regression',
                'Handle missing values (with proper imputation)',
                'Parallel processing available',
                'Moderate to large datasets',
            )),
            ('Advantages', (
                'No feature scaling required',
                'Handles non-linearity well',
                'Robust to outliers',
                'Low risk of overfitting',
                'Feature importance built-in',
                'OOB error estimation',
                'Can handle thousands of features',
                'Works well out-of-the-box',
            )),
            ('Disadvantages', (
                'Black box model (less interpretable)',
                'Can be slow for real-time predictions',
                'Large memory footprint',
                'Biased toward high-cardinality features',
                'Cannot extrapolate (predictions bounded by training range)',
                'May overfit with noisy data',
                'Difficult to capture linear relationships',
            )),
            ('Avoid When', (
                'Need model interpretability',
                'Very small datasets (<100 samples)',
                'Linear relationships dominate',
                'Real-time prediction with strict latency',
                'Extrapolation is needed',
                'Memory constraints exist',
                'Sparse high-dimensional data (use linear models)',
            )),
        )

        for category, items in sections:
            print(f"\n{category}:")
            for item in items:
                print(f"  • {item}")

# Emit both reference listings: parameter guidelines first, then use cases
practices = RandomForestBestPractices()
for _print_section in (practices.parameter_guidelines,
                       practices.when_to_use_random_forests):
    _print_section()

# Final comparison summary
print("\n" + "="*60, "RANDOM FOREST VS OTHER ALGORITHMS", "="*60, sep="\n")

# Table written one algorithm per row; transposed below into column lists
_columns = (
    'Algorithm', 'Training Speed', 'Prediction Speed', 'Accuracy',
    'Interpretability', 'Tuning Required', 'Handles Non-linearity',
    'Feature Scaling',
)
_rows = (
    ('Random Forest', 'Medium', 'Medium', 'High', 'Medium', 'Low', 'Yes', 'No'),
    ('Gradient Boosting', 'Slow', 'Fast', 'Very High', 'Low', 'High', 'Yes', 'No'),
    ('SVM', 'Slow', 'Fast', 'High', 'Low', 'High', 'Yes', 'Yes'),
    ('Neural Network', 'Slow', 'Fast', 'Very High', 'Very Low', 'Very High', 'Yes', 'Yes'),
    ('Linear Model', 'Fast', 'Very Fast', 'Medium', 'High', 'Low', 'No', 'Yes'),
)

comparison_data = {
    column: [row[position] for row in _rows]
    for position, column in enumerate(_columns)
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

Practice Exercises

Exercise 1: Custom Random Forest Implementation

Implement a simplified Random Forest from scratch:

  1. Create bootstrap samples
  2. Build decision trees with random feature selection
  3. Implement voting/averaging for predictions
  4. Calculate OOB error
  5. Compare with scikit-learn implementation

Exercise 2: Feature Importance Study

Conduct a comprehensive feature importance analysis:

  1. Compare MDI vs permutation importance
  2. Implement SHAP values for Random Forest
  3. Analyze feature interactions
  4. Create feature importance stability analysis
  5. Build automated feature selection pipeline

Exercise 3: Ensemble Method Comparison

Compare Random Forest with other ensemble methods:

  1. Implement voting classifier with different base models
  2. Compare with AdaBoost and Gradient Boosting
  3. Analyze diversity vs accuracy trade-off
  4. Create stacking ensemble with Random Forest
  5. Benchmark on multiple datasets

Key Takeaways

Further Resources