strategy['channels'].extend(['Email', 'In-app messages']) strategy['offers'].extend(['Welcome discount', 'Free trial']) strategy['potential_value'] = 'High' elif 'Loyal customers' in chars: strategy['tactics'].extend([ 'Loyalty program benefits', 'Referral incentives', 'Appreciation campaigns' ]) strategy['channels'].extend(['Email', 'Direct mail']) strategy['offers'].extend(['Points multiplier', 'Birthday rewards']) strategy['retention_risk'] = 'Low' elif 'Budget conscious' in chars: strategy['tactics'].extend([ 'Value-focused messaging', 'Bundle offers', 'Sale notifications' ]) strategy['channels'].extend(['Email', 'SMS']) strategy['offers'].extend(['Volume discounts', 'Clearance alerts']) strategies[segment_name] = strategy return strategies # Perform customer segmentation print("="*60) print("CUSTOMER SEGMENTATION ANALYSIS") print("="*60) # Select features for segmentation feature_cols = ['age', 'annual_income', 'spending_score', 'num_purchases', 'avg_purchase_value', 'days_since_last_purchase', 'website_visits', 'loyalty_years'] segmentation = CustomerSegmentation(customer_data) X_scaled = segmentation.prepare_features(feature_cols) # Find optimal number of segments print("\nFinding optimal number of segments...") optimal_k = segmentation.find_optimal_segments(k_range=range(3, 9)) print(f"Optimal number of segments: {optimal_k}") # Perform segmentation print(f"\nSegmenting customers into {optimal_k} groups...") segments = segmentation.segment_customers(optimal_k) segment_names = segmentation.name_segments() # Display segment information print("\n" + "="*60) print("SEGMENT PROFILES") print("="*60) for segment_id, name in segment_names.items(): info = segments[segment_id] print(f"\n{name} (Segment {segment_id}):") print(f" Size: {info['size']} customers ({info['percentage']:.1f}%)") print(f" Characteristics: {', '.join(info['characteristics'])}") print(f" Key metrics:") for metric, value in list(info['profile'].items())[:4]: print(f" {metric}: {value:.2f}") # 
# Visualize segments
segmentation.visualize_segments()

# Generate marketing strategies
strategies = segmentation.generate_marketing_strategies()

print("\n" + "="*60)
print("MARKETING STRATEGIES BY SEGMENT")
print("="*60)

# One summary paragraph per named segment; only the first two tactics are
# listed.
for segment_name, strategy in strategies.items():
    print(f"\n{segment_name}:")
    print(f" Size: {strategy['size']} customers ({strategy['percentage']})")
    print(f" Retention Risk: {strategy['retention_risk']}")
    print(f" Potential Value: {strategy['potential_value']}")
    print(f" Tactics: {', '.join(strategy['tactics'][:2])}")
    print(f" Channels: {', '.join(strategy['channels'])}")
    print(f" Offers: {', '.join(strategy['offers'])}")

Image Compression with K-means

Color Quantization Application

from sklearn.utils import shuffle
from PIL import Image
import requests
from io import BytesIO

class ImageCompression:
    """Image compression using K-means color quantization.

    Every pixel is replaced by the centre of the K-means cluster it belongs
    to, so an image ends up using at most ``n_colors`` distinct colors.
    """

    def __init__(self):
        # Neither attribute is written by the methods of this class; they are
        # plain slots left for callers.
        self.original_image = None
        self.compressed_images = {}

    def create_sample_image(self, size=(150, 150)):
        """Create a synthetic image for demonstration.

        The image has four quadrants (red, green, blue, yellow), each fading
        by 2 intensity levels per row, plus Gaussian noise (std 20) drawn
        from the global NumPy RNG.

        Args:
            size: ``(rows, cols)`` of the image.

        Returns:
            ``uint8`` array of shape ``(rows, cols, 3)``.
        """
        rows, cols = size[0], size[1]
        half_r, half_c = rows // 2, cols // 2

        # Work in float and clip before casting: for quadrants taller than
        # 128 rows ``255 - 2*i`` goes negative, which must saturate at 0
        # instead of overflowing the uint8 range.
        img = np.full((rows, cols, 3), 50.0)
        row_idx = np.arange(rows, dtype=float)
        top_fade = np.clip(255 - 2 * row_idx[:half_r], 0, 255)[:, None]
        bottom_fade = np.clip(255 - 2 * (row_idx[half_r:] - half_r), 0, 255)[:, None]

        img[:half_r, :half_c, 0] = top_fade       # Top-left: red gradient
        img[:half_r, half_c:, 1] = top_fade       # Top-right: green gradient
        img[half_r:, :half_c, 2] = bottom_fade    # Bottom-left: blue gradient
        img[half_r:, half_c:, 0] = bottom_fade    # Bottom-right: yellow (R+G)
        img[half_r:, half_c:, 1] = bottom_fade

        # Add some noise
        noise = np.random.normal(0, 20, img.shape)
        return np.clip(img + noise, 0, 255).astype(np.uint8)

    def compress_image(self, image, n_colors):
        """Compress an image using K-means color quantization.

        K-means is fitted on a random sample of at most 1000 pixels and then
        used to label every pixel.

        Args:
            image: array of shape ``(h, w, channels)`` or ``(h, w)`` with
                values in the 0-255 range.
            n_colors: palette size (number of clusters), at least 1.

        Returns:
            Tuple ``(compressed_image, palette, label_map, metrics)``:
            the ``uint8`` quantized image with the input's shape, the float
            cluster centres, the ``(h, w)`` array of palette indices, and a
            dict with keys ``original_colors``, ``compressed_colors``,
            ``compression_ratio``, ``mse``, ``psnr`` and
            ``file_size_reduction``.

        Raises:
            ValueError: if ``n_colors`` is smaller than 1.
        """
        if n_colors < 1:
            raise ValueError("n_colors must be at least 1")

        image = np.asarray(image)

        # Store original dimensions
        original_shape = image.shape
        h, w = original_shape[:2]

        # One row per pixel; -1 keeps whatever channel count the image has
        # (grayscale, RGB, RGBA) instead of assuming 3.
        image_array = image.reshape(h * w, -1)

        # Sample pixels for faster processing
        n_samples = min(1000, len(image_array))
        image_sample = shuffle(image_array, random_state=0, n_samples=n_samples)

        # Fit K-means on sample, then label every pixel
        kmeans = KMeans(n_clusters=n_colors, random_state=42, n_init=10)
        kmeans.fit(image_sample)
        labels = kmeans.predict(image_array)

        # Replace each pixel with its cluster centre. Round (rather than
        # truncate) before the uint8 cast so the stored colors are not
        # biased toward darker values.
        palette = kmeans.cluster_centers_
        quantized = np.clip(np.rint(palette[labels]), 0, 255)
        compressed_image = quantized.reshape(original_shape).astype(np.uint8)

        # Calculate compression metrics
        original_colors = len(np.unique(image_array, axis=0))
        compression_ratio = original_colors / n_colors

        # Error is measured against the pixels that are actually returned.
        mse = np.mean((image_array.astype(float) - quantized) ** 2)
        # A lossless result has zero error; report infinite PSNR explicitly
        # instead of dividing by zero.
        psnr = float('inf') if mse == 0 else 20 * np.log10(255.0 / np.sqrt(mse))

        metrics = {
            'original_colors': original_colors,
            'compressed_colors': n_colors,
            'compression_ratio': compression_ratio,
            'mse': mse,
            'psnr': psnr,
            'file_size_reduction': self._estimate_file_size_reduction(n_colors)
        }

        return compressed_image, palette, labels.reshape(h, w), metrics

    def _estimate_file_size_reduction(self, n_colors):
        """Estimate the fractional per-pixel size reduction of a palette image.

        Compares 24 bits per pixel (8 bits per channel) with
        ``ceil(log2(n_colors))`` bits per pixel for a palette index. The
        storage cost of the palette itself is not included.

        Args:
            n_colors: palette size, at least 1.

        Returns:
            Reduction as a fraction (e.g. ``0.75`` for 75%).

        Raises:
            ValueError: if ``n_colors`` is smaller than 1.
        """
        if n_colors < 1:
            raise ValueError("n_colors must be at least 1")

        original_bits = 24
        compressed_bits = np.ceil(np.log2(n_colors))
        return 1 - (compressed_bits / original_bits)

    def visualize_compression(self, image, n_colors_list=(4, 8, 16, 32)):
        """Visualize compression at different levels.

        Draws the original next to one quantized version per palette size,
        with a metrics summary under each.

        Args:
            image: image array accepted by ``compress_image``.
            n_colors_list: palette sizes to try, in display order.

        Returns:
            List of dicts, one per palette size, with keys ``n_colors``,
            ``compressed_image``, ``palette`` and ``metrics``.
        """
        n_levels = len(n_colors_list)
        # squeeze=False keeps ``axes`` two-dimensional even with one column.
        fig, axes = plt.subplots(2, n_levels + 1, figsize=(20, 8), squeeze=False)

        # Original image
        axes[0, 0].imshow(image)
        axes[0, 0].set_title('Original Image')
        axes[0, 0].axis('off')

        axes[1, 0].text(0.5, 0.5, 'Original\nNo compression',
                        ha='center', va='center', fontsize=12)
        axes[1, 0].axis('off')

        compression_results = []

        for idx, n_colors in enumerate(n_colors_list, 1):
            # Compress image
            compressed, palette, labels, metrics = self.compress_image(image, n_colors)

            # Show compressed image
            axes[0, idx].imshow(compressed)
            axes[0, idx].set_title(f'{n_colors} Colors')
            axes[0, idx].axis('off')

            # Show metrics
            metrics_text = (f"Colors: {n_colors}\n"
                            f"Ratio: {metrics['compression_ratio']:.1f}x\n"
                            f"PSNR: {metrics['psnr']:.1f} dB\n"
                            f"Size ↓: {metrics['file_size_reduction']*100:.0f}%")
            axes[1, idx].text(0.5, 0.5, metrics_text,
                              ha='center', va='center', fontsize=10)
            axes[1, idx].axis('off')

            compression_results.append({
                'n_colors': n_colors,
                'compressed_image': compressed,
                'palette': palette,
                'metrics': metrics
            })

        plt.suptitle('Image Compression using K-means Color Quantization', fontsize=14)
        plt.tight_layout()
        plt.show()

        return compression_results

    def visualize_color_space(self, image, compressed_results):
        """Visualize color space and clustering.

        For each result, scatters a random sample of up to 5000 pixels in
        the red/green plane and overlays the palette colors as stars.

        Args:
            image: RGB image array of shape ``(h, w, 3)``.
            compressed_results: list as returned by
                ``visualize_compression``; nothing is drawn when it is empty.
        """
        if not compressed_results:
            return

        # squeeze=False so a single result still yields an indexable array.
        fig, axes = plt.subplots(1, len(compressed_results), figsize=(20, 5),
                                 squeeze=False)

        # Sample pixels for visualization
        pixels = image.reshape(-1, 3)
        sample_size = min(5000, len(pixels))
        sample_indices = np.random.choice(len(pixels), sample_size, replace=False)
        pixel_sample = pixels[sample_indices]

        for idx, result in enumerate(compressed_results):
            ax = axes[0, idx]
            n_colors = result['n_colors']
            palette = result['palette']

            # 3D to 2D projection (using first two channels)
            ax.scatter(pixel_sample[:, 0], pixel_sample[:, 1],
                       c=pixel_sample/255, s=1, alpha=0.3)

            # Plot cluster centers
            ax.scatter(palette[:, 0], palette[:, 1],
                       c=palette/255, s=200, edgecolor='black',
                       linewidth=2, marker='*')

            ax.set_xlabel('Red Channel')
            ax.set_ylabel('Green Channel')
            ax.set_title(f'{n_colors} Color Clusters')
            ax.set_xlim([0, 255])
            ax.set_ylim([0, 255])

        plt.suptitle('Color Space Clustering', fontsize=14)
        plt.tight_layout()
        plt.show()

    def analyze_compression_quality(self, image,
                                    n_colors_range=(2, 4, 8, 16, 32, 64, 128)):
        """Analyze the quality vs compression trade-off.

        Compresses the image once per palette size and plots PSNR, estimated
        size reduction and MSE against the number of colors.

        Args:
            image: image array accepted by ``compress_image``.
            n_colors_range: palette sizes to evaluate.

        Returns:
            ``pandas.DataFrame`` with one row per palette size holding the
            ``compress_image`` metrics plus an ``n_colors`` column.
        """
        metrics_list = []

        for n_colors in n_colors_range:
            _, _, _, metrics = self.compress_image(image, n_colors)
            metrics['n_colors'] = n_colors
            metrics_list.append(metrics)

        metrics_df = pd.DataFrame(metrics_list)

        # Plot quality metrics
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # PSNR vs colors
        axes[0].plot(metrics_df['n_colors'], metrics_df['psnr'], 'o-', linewidth=2)
        axes[0].set_xlabel('Number of Colors')
        axes[0].set_ylabel('PSNR (dB)')
        axes[0].set_title('Quality vs Compression Level')
        axes[0].grid(True, alpha=0.3)
        axes[0].axhline(y=30, color='r', linestyle='--', label='Good Quality (30dB)')
        axes[0].legend()

        # File size reduction
        axes[1].plot(metrics_df['n_colors'],
                     metrics_df['file_size_reduction']*100, 'o-', linewidth=2)
        axes[1].set_xlabel('Number of Colors')
        axes[1].set_ylabel('File Size Reduction (%)')
        axes[1].set_title('Compression Efficiency')
        axes[1].grid(True, alpha=0.3)

        # MSE vs colors (log scale)
        axes[2].semilogy(metrics_df['n_colors'], metrics_df['mse'], 'o-', linewidth=2)
        axes[2].set_xlabel('Number of Colors')
        axes[2].set_ylabel('Mean Squared Error (log scale)')
        axes[2].set_title('Reconstruction Error')
        axes[2].grid(True, alpha=0.3)

        plt.suptitle('Compression Quality Analysis', fontsize=14)
        plt.tight_layout()
        plt.show()

        return metrics_df

# ---- Demo: K-means color quantization on a synthetic image ----
print("\n" + "="*60, "IMAGE COMPRESSION WITH K-MEANS", "="*60, sep="\n")

compressor = ImageCompression()

# Build the synthetic four-quadrant test image
print("\nCreating sample image...")
sample_image = compressor.create_sample_image(size=(150, 150))

# Quantize with several palette sizes and plot them side by side
print("Compressing at different color levels...")
compression_results = compressor.visualize_compression(sample_image, n_colors_list=[4, 8, 16, 64])

# Scatter plots of the color space, for the first three palettes only
print("\nVisualizing color space clustering...")
compressor.visualize_color_space(sample_image, compression_results[:3])

# Quality/size trade-off across palette sizes
print("\nAnalyzing compression quality...")
quality_metrics = compressor.analyze_compression_quality(sample_image)

print("\nCompression Quality Summary:")
summary_columns = ['n_colors', 'psnr', 'file_size_reduction']
print(quality_metrics[summary_columns].to_string(index=False))

Handling K-means Limitations

Common Problems and Solutions

from sklearn.datasets import make_circles, make_moons, make_blobs
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

class KMeansLimitations:
    """Demonstrate and handle K-means limitations."""

    @staticmethod
    def demonstrate_shape_limitation():
        """Show K-means behaviour on differently shaped clusters.

        Six synthetic datasets are clustered with K-means and drawn in a
        2x3 grid with the centroids and the silhouette score. For the moons
        and circles datasets DBSCAN is run as well and the number of
        clusters it finds is added to the subplot title.
        """
        # Local generator: reproducible and leaves the global RNG untouched.
        rng = np.random.RandomState(42)

        blobs = make_blobs(n_samples=300, centers=3,
                           cluster_std=0.5, random_state=42)[0]

        # (title, points, number of groups the data was generated from)
        datasets = [
            ('Spherical (K-means works)', blobs, 3),
            ('Elongated', blobs @ np.array([[1, 0], [0, 0.3]]), 3),
            ('Moons',
             make_moons(n_samples=300, noise=0.05, random_state=42)[0], 2),
            ('Circles',
             make_circles(n_samples=300, noise=0.05, factor=0.5,
                          random_state=42)[0], 2),
            ('Anisotropic', blobs @ np.array([[0.6, -0.8], [0.4, 0.7]]), 3),
            ('Varying Density',
             np.vstack([
                 rng.randn(100, 2) * 0.5 + [2, 2],
                 rng.randn(200, 2) * 2 + [-2, -2]
             ]), 2),
        ]
        # Shapes where a density-based method is shown for comparison.
        dbscan_cases = {'Moons', 'Circles'}

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()

        for ax, (name, X, n_clusters) in zip(axes, datasets):
            # Apply K-means
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans_labels = kmeans.fit_predict(X)

            # Plot K-means result
            ax.scatter(X[:, 0], X[:, 1], c=kmeans_labels,
                       cmap='viridis', alpha=0.6, s=30)
            ax.scatter(kmeans.cluster_centers_[:, 0],
                       kmeans.cluster_centers_[:, 1],
                       c='red', marker='x', s=200, linewidths=3)

            # Calculate silhouette score
            kmeans_score = silhouette_score(X, kmeans_labels)
            title = f'{name}\nK-means Silhouette: {kmeans_score:.3f}'

            # Apply alternative clustering (DBSCAN for non-spherical)
            if name in dbscan_cases:
                alt_labels = DBSCAN(eps=0.3, min_samples=5).fit_predict(X)
                # Label -1 is DBSCAN's noise marker, not a cluster.
                n_found = len(set(alt_labels) - {-1})
                title += f'\nDBSCAN found {n_found} clusters'

            ax.set_title(title)

        plt.suptitle('K-means Performance on Different Cluster Shapes', fontsize=14)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def _nearest_center_or_noise(points, centers, noise_threshold):
        """Label each point with its nearest centre, or -1 if too far away.

        Args:
            points: array-like of shape ``(n_points, n_features)``.
            centers: array-like of shape ``(n_centers, n_features)``.
            noise_threshold: a point whose distance to its nearest centre is
                strictly greater than this is labelled -1.

        Returns:
            Integer array of length ``n_points``.
        """
        points = np.asarray(points, dtype=float)
        centers = np.asarray(centers, dtype=float)
        if len(points) == 0:
            return np.empty(0, dtype=int)

        # Pairwise Euclidean distances, shape (n_points, n_centers).
        distances = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)
        nearest = distances.argmin(axis=1)
        return np.where(distances.min(axis=1) > noise_threshold, -1, nearest)

    @staticmethod
    def handle_outliers(X, contamination=0.1, n_clusters=3):
        """Handle outliers before clustering.

        Outliers are detected with an Isolation Forest, K-means is fitted on
        the inliers only, and each outlier is then assigned to its nearest
        centroid unless it is farther away than the 95th percentile of the
        inlier-to-centroid distances, in which case it is labelled -1.
        A three-panel comparison figure is shown.

        Args:
            X: array of shape ``(n_samples, n_features)``; the first two
                columns are used for plotting.
            contamination: expected outlier fraction passed to
                ``IsolationForest``.
            n_clusters: number of K-means clusters.

        Returns:
            Tuple ``(all_labels, X_inliers, X_outliers)`` where
            ``all_labels`` is an integer array aligned with ``X`` (-1 marks
            noise).
        """
        from sklearn.ensemble import IsolationForest

        print("\n" + "="*60)
        print("OUTLIER DETECTION AND HANDLING")
        print("="*60)

        X = np.asarray(X)

        # Detect outliers using Isolation Forest
        iso_forest = IsolationForest(contamination=contamination, random_state=42)
        outlier_labels = iso_forest.fit_predict(X)

        # Separate inliers and outliers
        inlier_mask = outlier_labels == 1
        outlier_mask = outlier_labels == -1
        X_inliers = X[inlier_mask]
        X_outliers = X[outlier_mask]

        print(f"\nDetected {len(X_outliers)} outliers ({len(X_outliers)/len(X)*100:.1f}%)")
        print(f"Remaining inliers: {len(X_inliers)}")

        # Cluster only inliers
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        inlier_clusters = kmeans.fit_predict(X_inliers)
        centers = kmeans.cluster_centers_

        # Distance of every inlier to its own centroid sets the noise cut-off.
        inlier_distances = np.linalg.norm(X_inliers - centers[inlier_clusters], axis=1)
        noise_threshold = np.percentile(inlier_distances, 95)

        # Assign outliers to nearest cluster or mark as noise
        outlier_clusters = KMeansLimitations._nearest_center_or_noise(
            X_outliers, centers, noise_threshold
        )

        # Combine results (integer labels, aligned with the rows of X)
        all_labels = np.empty(len(X), dtype=int)
        all_labels[inlier_mask] = inlier_clusters
        all_labels[outlier_mask] = outlier_clusters

        # Visualize
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # Original data with outliers
        axes[0].scatter(X[:, 0], X[:, 1], c='blue', alpha=0.5, s=30)
        axes[0].scatter(X_outliers[:, 0], X_outliers[:, 1],
                        c='red', marker='x', s=50, label='Outliers')
        axes[0].set_title('Original Data with Outliers')
        axes[0].legend()

        # K-means without outlier handling
        kmeans_all = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels_with_outliers = kmeans_all.fit_predict(X)
        axes[1].scatter(X[:, 0], X[:, 1], c=labels_with_outliers,
                        cmap='viridis', alpha=0.6, s=30)
        axes[1].scatter(kmeans_all.cluster_centers_[:, 0],
                        kmeans_all.cluster_centers_[:, 1],
                        c='red', marker='x', s=200, linewidths=3)
        axes[1].set_title('K-means with Outliers')

        # K-means after outlier handling
        axes[2].scatter(X[:, 0], X[:, 1], c=all_labels,
                        cmap='viridis', alpha=0.6, s=30)
        axes[2].scatter(centers[:, 0], centers[:, 1],
                        c='red', marker='x', s=200, linewidths=3)
        noise_points = X[all_labels == -1]
        if len(noise_points) > 0:
            axes[2].scatter(noise_points[:, 0], noise_points[:, 1],
                            c='black', marker='x', s=50, label='Noise')
            # Only draw a legend when there is a labelled artist to show.
            axes[2].legend()
        axes[2].set_title('K-means after Outlier Handling')

        plt.suptitle('Impact of Outlier Handling on K-means', fontsize=14)
        plt.tight_layout()
        plt.show()

        return all_labels, X_inliers, X_outliers

    @staticmethod
    def compare_scaling_impact():
        """Show the importance of feature scaling.

        Builds a two-feature dataset whose features differ in scale by about
        two orders of magnitude, clusters it with and without
        standardization, plots both results and prints the silhouette
        scores.
        """
        # Create data with different scales.
        # NOTE: this reseeds the global NumPy RNG, so random draws made after
        # this call are affected as well.
        np.random.seed(42)
        X_unscaled = np.column_stack([
            np.random.normal(100, 20, 300),  # Feature 1: scale ~100
            np.random.normal(0, 1, 300)      # Feature 2: scale ~1
        ])

        # Add true clusters
        X_unscaled[:100, 0] += 50
        X_unscaled[100:200, 1] += 2
        X_unscaled[200:, 0] -= 30
        X_unscaled[200:, 1] -= 1

        # Scale data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_unscaled)

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Unscaled clustering
        kmeans_unscaled = KMeans(n_clusters=3, random_state=42, n_init=10)
        labels_unscaled = kmeans_unscaled.fit_predict(X_unscaled)

        axes[0].scatter(X_unscaled[:, 0], X_unscaled[:, 1],
                        c=labels_unscaled, cmap='viridis', alpha=0.6)
        axes[0].set_xlabel('Feature 1 (scale ~100)')
        axes[0].set_ylabel('Feature 2 (scale ~1)')
        axes[0].set_title('K-means on Unscaled Data')

        score_unscaled = silhouette_score(X_unscaled, labels_unscaled)
        axes[0].text(0.02, 0.98, f'Silhouette: {score_unscaled:.3f}',
                     transform=axes[0].transAxes, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        # Scaled clustering
        kmeans_scaled = KMeans(n_clusters=3, random_state=42, n_init=10)
        labels_scaled = kmeans_scaled.fit_predict(X_scaled)

        axes[1].scatter(X_scaled[:, 0], X_scaled[:, 1],
                        c=labels_scaled, cmap='viridis', alpha=0.6)
        axes[1].set_xlabel('Feature 1 (standardized)')
        axes[1].set_ylabel('Feature 2 (standardized)')
        axes[1].set_title('K-means on Scaled Data')

        score_scaled = silhouette_score(X_scaled, labels_scaled)
        axes[1].text(0.02, 0.98, f'Silhouette: {score_scaled:.3f}',
                     transform=axes[1].transAxes, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.suptitle('Impact of Feature Scaling on K-means', fontsize=14)
        plt.tight_layout()
        plt.show()

        print("\n" + "="*60)
        print("FEATURE SCALING IMPACT")
        print("="*60)
        print(f"Silhouette Score (Unscaled): {score_unscaled:.3f}")
        print(f"Silhouette Score (Scaled): {score_scaled:.3f}")
        # A relative change is undefined for a zero baseline.
        if score_unscaled == 0:
            print("Improvement: n/a")
        else:
            print(f"Improvement: {100*(score_scaled-score_unscaled)/abs(score_unscaled):.1f}%")

# ---- Demo: where K-means struggles, and what to do about it ----
print("\n" + "="*60, "K-MEANS LIMITATIONS AND SOLUTIONS", "="*60, sep="\n")

limitations = KMeansLimitations()

print("\n1. Non-spherical Clusters:")
limitations.demonstrate_shape_limitation()

print("\n2. Impact of Scaling:")
limitations.compare_scaling_impact()

print("\n3. Handling Outliers:")
# Three tight blobs stacked on top of 30 uniformly scattered outlier points
X_with_outliers = np.concatenate(
    (
        make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=42)[0],
        np.random.uniform(-8, 8, (30, 2)),
    ),
    axis=0,
)

labels_with_outliers, inliers, outliers = limitations.handle_outliers(X_with_outliers, contamination=0.1)

Best Practices and Guidelines

class KMeansBestPractices:
    """Printable reference sheets on applying K-means well.

    Every method is static and simply writes a block of text to stdout.
    """

    @staticmethod
    def preprocessing_checklist():
        """Print the data-preparation checklist for K-means."""

        text = """
K-MEANS PREPROCESSING CHECKLIST:
================================

1. ✓ Handle Missing Values
   - Remove or impute missing data
   - K-means cannot handle NaN values
   - Consider missingness patterns

2. ✓ Scale Features
   - Use StandardScaler or MinMaxScaler
   - Critical when features have different units/scales
   - Consider RobustScaler for outliers

3. ✓ Remove/Handle Outliers
   - Use IsolationForest or LOF
   - Outliers can severely affect centroids
   - Consider trimming or winsorization

4. ✓ Feature Selection/Engineering
   - Remove irrelevant features
   - Create domain-specific features
   - Consider PCA for high-dimensional data

5. ✓ Check for Multicollinearity
   - Highly correlated features can bias results
   - Consider removing or combining correlated features
   - Use correlation matrix or VIF

6. ✓ Handle Categorical Variables
   - One-hot encode nominal categories
   - Consider ordinal encoding for ordinal data
   - Be aware of dimensionality increase
        """
        print(text)

    @staticmethod
    def algorithm_selection_guide():
        """Print guidance on choosing K-means or an alternative algorithm."""

        text = """
CLUSTERING ALGORITHM SELECTION GUIDE:
=====================================

Use K-means when:
  • Clusters are roughly spherical
  • Clusters have similar sizes
  • Clusters have similar densities
  • You know the number of clusters
  • Speed is important
  • Interpretability is needed
  • Large datasets (use Mini-batch K-means)

Consider alternatives when:
  • Clusters have arbitrary shapes → DBSCAN, HDBSCAN
  • Clusters have different densities → DBSCAN, Mean Shift
  • Hierarchical structure exists → Agglomerative Clustering
  • Number of clusters unknown → DBSCAN, Mean Shift, HDBSCAN
  • Clusters overlap significantly → Gaussian Mixture Models
  • Data is categorical → K-modes, K-prototypes
  • Need soft clustering → Fuzzy C-means, GMM
  • Very high dimensions → Spectral Clustering, UMAP + clustering
        """
        print(text)

    @staticmethod
    def evaluation_metrics_guide():
        """Print an overview of internal and external clustering metrics."""

        text = """
CLUSTERING EVALUATION METRICS:
==============================

Internal Metrics (no ground truth needed):
------------------------------------------
1. Silhouette Score [-1, 1]
   - Higher is better
   - Measures cluster separation
   - Good for comparing different k values

2. Calinski-Harabasz Index [0, ∞)
   - Higher is better
   - Ratio of between-cluster to within-cluster variance
   - Favors convex clusters

3. Davies-Bouldin Index [0, ∞)
   - Lower is better
   - Average similarity between clusters
   - Good for comparing algorithms

4. Inertia/SSE [0, ∞)
   - Lower is better
   - Within-cluster sum of squares
   - Use for elbow method

External Metrics (ground truth needed):
----------------------------------------
1. Adjusted Rand Index [-1, 1]
   - Higher is better
   - Adjusted for chance
   - Compares to true labels

2. Adjusted Mutual Information [0, 1]
   - Higher is better
   - Information-theoretic measure
   - Normalized and adjusted for chance

3. V-measure [0, 1]
   - Higher is better
   - Harmonic mean of homogeneity and completeness
   - Balanced measure
        """
        print(text)

    @staticmethod
    def common_pitfalls():
        """Print frequent K-means mistakes with their effect and remedy."""

        # Each entry: (pitfall, consequence, solution)
        entries = (
            ('Not scaling features',
             'Features with larger scale dominate',
             'Always standardize or normalize features'),
            ('Ignoring outliers',
             'Centroids pulled toward outliers',
             'Detect and handle outliers before clustering'),
            ('Wrong K selection',
             'Poor cluster quality',
             'Use multiple methods (elbow, silhouette, gap)'),
            ('Single initialization',
             'Local optimum',
             'Use n_init > 1 or k-means++'),
            ('Assuming spherical clusters',
             'Poor results on complex shapes',
             'Visualize data, consider DBSCAN'),
        )

        print("\nCOMMON K-MEANS PITFALLS:")
        print("="*50)
        for pitfall, consequence, solution in entries:
            print(f"\n❌ Pitfall: {pitfall}")
            print(f"   Consequence: {consequence}")
            print(f"   ✓ Solution: {solution}")

# Emit every best-practice reference sheet, in reading order
practices = KMeansBestPractices()
for _show_sheet in (
    practices.preprocessing_checklist,
    practices.algorithm_selection_guide,
    practices.evaluation_metrics_guide,
    practices.common_pitfalls,
):
    _show_sheet()

# Closing summary of the whole guide
print("\n" + "="*60, "K-MEANS CLUSTERING COMPLETE GUIDE SUMMARY", "="*60, sep="\n")

summary = """
KEY TAKEAWAYS:
• K-means is fast, scalable, and interpretable
• Requires number of clusters (k) to be specified
• Sensitive to initialization (use k-means++)
• Assumes spherical clusters of similar size
• Sensitive to outliers and scale
• Always preprocess data properly
• Use multiple metrics to evaluate results
• Consider alternatives for non-spherical clusters

WORKFLOW:
1. Explore and understand your data
2. Preprocess (scale, handle outliers, missing values)
3. Determine optimal k using multiple methods
4. Apply K-means with proper initialization
5. Evaluate results with appropriate metrics
6. Visualize and interpret clusters
7. Iterate if necessary

APPLICATIONS:
• Customer segmentation
• Image compression
• Document clustering
• Anomaly detection (preprocessing)
• Feature learning
• Market segmentation
• Recommendation systems
"""

print(summary)

Practice Exercises

Exercise 1: Complete Clustering Pipeline

Build an end-to-end clustering system that:

  1. Automatically preprocesses data (scaling, outlier detection)
  2. Determines optimal K by combining several selection methods (e.g. elbow, silhouette, gap statistic)
  3. Applies K-means with best practices
  4. Evaluates results with multiple metrics
  5. Generates automated insights and visualizations
  6. Compares with alternative clustering methods

Exercise 2: Real-time Customer Segmentation

Implement a streaming customer segmentation system that:

  1. Updates segments as new customers arrive
  2. Detects segment drift over time
  3. Automatically adjusts number of segments
  4. Provides real-time marketing recommendations
  5. Maintains segment stability for existing customers

Exercise 3: Advanced Image Compression

Create an intelligent image compression system that:

  1. Automatically determines optimal color palette size
  2. Preserves important image features
  3. Applies different compression to different regions
  4. Compares with standard compression algorithms
  5. Provides quality/size trade-off analysis

Key Takeaways

Further Resources