strategy['channels'].extend(['Email', 'In-app messages']) strategy['offers'].extend(['Welcome discount', 'Free trial']) strategy['potential_value'] = 'High' elif 'Loyal customers' in chars: strategy['tactics'].extend([ 'Loyalty program benefits', 'Referral incentives', 'Appreciation campaigns' ]) strategy['channels'].extend(['Email', 'Direct mail']) strategy['offers'].extend(['Points multiplier', 'Birthday rewards']) strategy['retention_risk'] = 'Low' elif 'Budget conscious' in chars: strategy['tactics'].extend([ 'Value-focused messaging', 'Bundle offers', 'Sale notifications' ]) strategy['channels'].extend(['Email', 'SMS']) strategy['offers'].extend(['Volume discounts', 'Clearance alerts']) strategies[segment_name] = strategy return strategies # Perform customer segmentation print("="*60) print("CUSTOMER SEGMENTATION ANALYSIS") print("="*60) # Select features for segmentation feature_cols = ['age', 'annual_income', 'spending_score', 'num_purchases', 'avg_purchase_value', 'days_since_last_purchase', 'website_visits', 'loyalty_years'] segmentation = CustomerSegmentation(customer_data) X_scaled = segmentation.prepare_features(feature_cols) # Find optimal number of segments print("\nFinding optimal number of segments...") optimal_k = segmentation.find_optimal_segments(k_range=range(3, 9)) print(f"Optimal number of segments: {optimal_k}") # Perform segmentation print(f"\nSegmenting customers into {optimal_k} groups...") segments = segmentation.segment_customers(optimal_k) segment_names = segmentation.name_segments() # Display segment information print("\n" + "="*60) print("SEGMENT PROFILES") print("="*60) for segment_id, name in segment_names.items(): info = segments[segment_id] print(f"\n{name} (Segment {segment_id}):") print(f" Size: {info['size']} customers ({info['percentage']:.1f}%)") print(f" Characteristics: {', '.join(info['characteristics'])}") print(f" Key metrics:") for metric, value in list(info['profile'].items())[:4]: print(f" {metric}: {value:.2f}") # 
# Visualize segments
segmentation.visualize_segments()

# Generate marketing strategies
strategies = segmentation.generate_marketing_strategies()

# Banner for the per-segment strategy report
print(f"\n{'=' * 60}")
print("MARKETING STRATEGIES BY SEGMENT")
print("=" * 60)

# One short summary per named segment
for segment_name, strategy in strategies.items():
    print(f"\n{segment_name}:")
    print(f" Size: {strategy['size']} customers ({strategy['percentage']})")
    print(f" Retention Risk: {strategy['retention_risk']}")
    print(f" Potential Value: {strategy['potential_value']}")
    # Only the first two tactics are listed to keep the summary compact
    print(f" Tactics: {', '.join(strategy['tactics'][:2])}")
    print(f" Channels: {', '.join(strategy['channels'])}")
    print(f" Offers: {', '.join(strategy['offers'])}")
from sklearn.utils import shuffle
from PIL import Image
import requests
from io import BytesIO
class ImageCompression:
    """Image compression using K-means color quantization.

    The methods use ``np``, ``pd``, ``plt`` and ``KMeans`` as module-level
    names, so those must be imported elsewhere in the file.
    """

    def __init__(self):
        # Neither attribute is written by any method of this class.
        self.original_image = None
        self.compressed_images = {}

    def create_sample_image(self, size=(150, 150)):
        """Create a noisy synthetic RGB image made of four colored quadrants.

        The quadrants are red (top-left), green (top-right), blue
        (bottom-left) and yellow (bottom-right). Each one fades by 2 intensity
        levels per row, restarting at the top of each half, and Gaussian noise
        (sigma 20) drawn from the global NumPy RNG is added on top.

        Args:
            size: ``(height, width)`` of the image in pixels.

        Returns:
            ``uint8`` array of shape ``(height, width, 3)``.
        """
        height, width = size[0], size[1]
        half_h, half_w = height // 2, width // 2

        # Row offset measured from the top of the half the row belongs to
        rows = np.arange(height)
        offset = np.where(rows < half_h, rows, rows - half_h)
        # Clamp at 0: without it, halves taller than 128 rows would produce
        # negative values that do not fit into uint8.
        fade = np.clip(255 - 2 * offset, 0, 255)[:, np.newaxis]
        top, bottom = fade[:half_h], fade[half_h:]

        # Every channel that is not part of a gradient stays at 50
        img = np.full((height, width, 3), 50, dtype=np.float64)
        img[:half_h, :half_w, 0] = top                          # red
        img[:half_h, half_w:, 1] = top                          # green
        img[half_h:, :half_w, 2] = bottom                       # blue
        img[half_h:, half_w:, :2] = bottom[:, :, np.newaxis]    # yellow

        # Add some noise
        noise = np.random.normal(0, 20, img.shape)
        return np.clip(img + noise, 0, 255).astype(np.uint8)

    def compress_image(self, image, n_colors):
        """Compress an image by quantizing its colors with K-means.

        K-means is fitted on a random sample of pixels, then every pixel is
        replaced by the (rounded) centroid of its cluster.

        Args:
            image: array of shape ``(h, w)`` or ``(h, w, channels)`` with
                values on a 0-255 scale.
            n_colors: size of the palette (number of clusters), at least 1.

        Returns:
            Tuple ``(compressed_image, palette, label_map, metrics)``:
            the quantized ``uint8`` image in the input's shape, the float
            cluster centers, the ``(h, w)`` array of palette indices, and a
            dict with ``original_colors``, ``compressed_colors``,
            ``compression_ratio``, ``mse``, ``psnr`` and
            ``file_size_reduction``.

        Raises:
            ValueError: if ``n_colors`` is smaller than 1.
        """
        if n_colors < 1:
            raise ValueError(f"n_colors must be at least 1, got {n_colors}")

        # Store original dimensions
        original_shape = image.shape
        h, w = original_shape[:2]

        # One row per pixel; -1 keeps grayscale and RGBA inputs working
        pixels = image.reshape(h * w, -1)

        # Sample pixels for faster fitting, but never fewer than n_colors
        # because KMeans requires n_samples >= n_clusters.
        n_samples = min(len(pixels), max(1000, n_colors))
        pixel_sample = shuffle(pixels, random_state=0, n_samples=n_samples)

        kmeans = KMeans(n_clusters=n_colors, random_state=42, n_init=10)
        kmeans.fit(pixel_sample)

        # Assign every pixel of the full image to its nearest centroid
        labels = kmeans.predict(pixels)
        palette = kmeans.cluster_centers_

        # Round to the nearest level; a plain uint8 cast would truncate and
        # bias the result towards darker values.
        quantized = np.clip(np.rint(palette[labels]), 0, 255).astype(np.uint8)
        compressed_image = quantized.reshape(original_shape)

        original_colors = len(np.unique(pixels, axis=0))
        compression_ratio = original_colors / n_colors

        # Error of the image that is actually returned
        diff = pixels.astype(np.float64) - quantized.astype(np.float64)
        mse = np.mean(diff ** 2)
        # A lossless result has zero error; report infinite PSNR directly
        # instead of dividing by zero.
        psnr = float('inf') if mse == 0 else 20 * np.log10(255.0 / np.sqrt(mse))

        metrics = {
            'original_colors': original_colors,
            'compressed_colors': n_colors,
            'compression_ratio': compression_ratio,
            'mse': mse,
            'psnr': psnr,
            'file_size_reduction': self._estimate_file_size_reduction(n_colors)
        }
        return compressed_image, palette, labels.reshape(h, w), metrics

    def _estimate_file_size_reduction(self, n_colors):
        """Estimate the fractional size reduction of an indexed image.

        Compares 24 bits per pixel against ``ceil(log2(n_colors))`` bits per
        pixel. Palette storage is not included in the estimate.

        Args:
            n_colors: palette size, at least 1.

        Returns:
            Reduction as a float fraction (e.g. 16 colors -> ``1 - 4/24``).

        Raises:
            ValueError: if ``n_colors`` is smaller than 1.
        """
        if n_colors < 1:
            raise ValueError(f"n_colors must be at least 1, got {n_colors}")
        original_bits = 24
        compressed_bits = int(np.ceil(np.log2(n_colors)))
        return float(1 - compressed_bits / original_bits)

    def visualize_compression(self, image, n_colors_list=(4, 8, 16, 32)):
        """Plot the original image next to versions with reduced palettes.

        Args:
            image: image array accepted by ``compress_image``.
            n_colors_list: palette sizes to try, one subplot column each.

        Returns:
            List of dicts, one per palette size, with keys ``n_colors``,
            ``compressed_image``, ``palette`` and ``metrics``.
        """
        n_colors_list = list(n_colors_list)
        n_levels = len(n_colors_list)
        # squeeze=False keeps ``axes`` two-dimensional for any column count
        fig, axes = plt.subplots(2, n_levels + 1, figsize=(20, 8), squeeze=False)

        # Original image
        axes[0, 0].imshow(image)
        axes[0, 0].set_title('Original Image')
        axes[0, 0].axis('off')
        axes[1, 0].text(0.5, 0.5, 'Original\nNo compression',
                        ha='center', va='center', fontsize=12)
        axes[1, 0].axis('off')

        compression_results = []
        for idx, n_colors in enumerate(n_colors_list, 1):
            compressed, palette, labels, metrics = self.compress_image(image, n_colors)

            # Top row: the compressed image
            axes[0, idx].imshow(compressed)
            axes[0, idx].set_title(f'{n_colors} Colors')
            axes[0, idx].axis('off')

            # Bottom row: its metrics
            metrics_text = (f"Colors: {n_colors}\n"
                            f"Ratio: {metrics['compression_ratio']:.1f}x\n"
                            f"PSNR: {metrics['psnr']:.1f} dB\n"
                            f"Size ↓: {metrics['file_size_reduction']*100:.0f}%")
            axes[1, idx].text(0.5, 0.5, metrics_text,
                              ha='center', va='center', fontsize=10)
            axes[1, idx].axis('off')

            compression_results.append({
                'n_colors': n_colors,
                'compressed_image': compressed,
                'palette': palette,
                'metrics': metrics
            })

        plt.suptitle('Image Compression using K-means Color Quantization', fontsize=14)
        plt.tight_layout()
        plt.show()
        return compression_results

    def visualize_color_space(self, image, compressed_results):
        """Scatter sampled pixels and palette centers in the red/green plane.

        Args:
            image: RGB image array; it is reshaped to ``(-1, 3)``.
            compressed_results: result dicts as returned by
                ``visualize_compression`` (``n_colors`` and ``palette`` are
                read). Nothing is drawn when it is empty.
        """
        if not compressed_results:
            return

        # squeeze=False so a single result still yields an indexable row
        fig, axes = plt.subplots(1, len(compressed_results),
                                 figsize=(20, 5), squeeze=False)

        # Sample pixels for visualization
        pixels = image.reshape(-1, 3)
        sample_size = min(5000, len(pixels))
        sample_indices = np.random.choice(len(pixels), sample_size, replace=False)
        pixel_sample = pixels[sample_indices]

        for ax, result in zip(axes[0], compressed_results):
            n_colors = result['n_colors']
            palette = result['palette']

            # 3D to 2D projection (using first two channels)
            ax.scatter(pixel_sample[:, 0], pixel_sample[:, 1],
                       c=pixel_sample/255, s=1, alpha=0.3)
            # Plot cluster centers
            ax.scatter(palette[:, 0], palette[:, 1],
                       c=palette/255, s=200, edgecolor='black',
                       linewidth=2, marker='*')
            ax.set_xlabel('Red Channel')
            ax.set_ylabel('Green Channel')
            ax.set_title(f'{n_colors} Color Clusters')
            ax.set_xlim([0, 255])
            ax.set_ylim([0, 255])

        plt.suptitle('Color Space Clustering', fontsize=14)
        plt.tight_layout()
        plt.show()

    def analyze_compression_quality(self, image, n_colors_range=None):
        """Plot PSNR, size reduction and MSE against the palette size.

        Args:
            image: image array accepted by ``compress_image``.
            n_colors_range: palette sizes to evaluate; defaults to
                ``[2, 4, 8, 16, 32, 64, 128]``.

        Returns:
            ``pandas.DataFrame`` with one row of metrics per palette size,
            including an ``n_colors`` column.
        """
        if n_colors_range is None:
            n_colors_range = [2, 4, 8, 16, 32, 64, 128]

        metrics_list = []
        for n_colors in n_colors_range:
            _, _, _, metrics = self.compress_image(image, n_colors)
            metrics['n_colors'] = n_colors
            metrics_list.append(metrics)
        metrics_df = pd.DataFrame(metrics_list)

        # Plot quality metrics
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # PSNR vs colors
        axes[0].plot(metrics_df['n_colors'], metrics_df['psnr'], 'o-', linewidth=2)
        axes[0].set_xlabel('Number of Colors')
        axes[0].set_ylabel('PSNR (dB)')
        axes[0].set_title('Quality vs Compression Level')
        axes[0].grid(True, alpha=0.3)
        axes[0].axhline(y=30, color='r', linestyle='--', label='Good Quality (30dB)')
        axes[0].legend()

        # File size reduction
        axes[1].plot(metrics_df['n_colors'],
                     metrics_df['file_size_reduction']*100, 'o-', linewidth=2)
        axes[1].set_xlabel('Number of Colors')
        axes[1].set_ylabel('File Size Reduction (%)')
        axes[1].set_title('Compression Efficiency')
        axes[1].grid(True, alpha=0.3)

        # MSE vs colors (log scale)
        axes[2].semilogy(metrics_df['n_colors'], metrics_df['mse'], 'o-', linewidth=2)
        axes[2].set_xlabel('Number of Colors')
        axes[2].set_ylabel('Mean Squared Error (log scale)')
        axes[2].set_title('Reconstruction Error')
        axes[2].grid(True, alpha=0.3)

        plt.suptitle('Compression Quality Analysis', fontsize=14)
        plt.tight_layout()
        plt.show()
        return metrics_df
# Demonstrate image compression
print(f"\n{'=' * 60}")
print("IMAGE COMPRESSION WITH K-MEANS")
print("=" * 60)

compressor = ImageCompression()

# Create sample image
print("\nCreating sample image...")
sample_image = compressor.create_sample_image(size=(150, 150))

# Compress at different levels
print("Compressing at different color levels...")
compression_results = compressor.visualize_compression(sample_image, n_colors_list=[4, 8, 16, 64])

# Visualize color space (only the first three compression levels are shown)
print("\nVisualizing color space clustering...")
compressor.visualize_color_space(sample_image, compression_results[:3])

# Analyze quality metrics
print("\nAnalyzing compression quality...")
quality_metrics = compressor.analyze_compression_quality(sample_image)

# Tabular summary restricted to the three headline columns
print("\nCompression Quality Summary:")
print(
    quality_metrics[['n_colors', 'psnr', 'file_size_reduction']].to_string(index=False)
)
from sklearn.datasets import make_circles, make_moons, make_blobs
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
class KMeansLimitations:
    """Demonstrate and handle K-means limitations.

    All methods are static and draw matplotlib figures. ``KMeans`` is always
    created with an explicit ``n_init=10`` so the number of restarts does not
    depend on the installed scikit-learn version's default.
    """

    @staticmethod
    def demonstrate_shape_limitation():
        """Show K-means results on six differently shaped 2-D datasets.

        Each subplot shows the K-means labelling, the centroids and the
        silhouette score in the title. The 'Varying Density' dataset is drawn
        from the global NumPy RNG, so it changes between runs unless the
        caller seeds it.
        """
        # The same blob sample is the basis of three datasets
        blobs = make_blobs(n_samples=300, centers=3,
                           cluster_std=0.5, random_state=42)
        datasets = {
            'Spherical (K-means works)': blobs,
            'Elongated': (blobs[0] @ np.array([[1, 0], [0, 0.3]]), None),
            'Moons': make_moons(n_samples=300, noise=0.05, random_state=42),
            'Circles': make_circles(n_samples=300, noise=0.05, factor=0.5,
                                    random_state=42),
            'Anisotropic': (blobs[0] @ np.array([[0.6, -0.8], [0.4, 0.7]]), None),
            'Varying Density': (np.vstack([
                np.random.randn(100, 2) * 0.5 + [2, 2],
                np.random.randn(200, 2) * 2 + [-2, -2]
            ]), None)
        }

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()

        for ax, (name, (X, _)) in zip(axes, datasets.items()):
            # Moons and circles consist of two groups, everything else uses 3
            n_clusters = 2 if ('Circles' in name or 'Moons' in name) else 3

            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans_labels = kmeans.fit_predict(X)
            centers = kmeans.cluster_centers_

            ax.scatter(X[:, 0], X[:, 1], c=kmeans_labels,
                       cmap='viridis', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1],
                       c='red', marker='x', s=200, linewidths=3)

            kmeans_score = silhouette_score(X, kmeans_labels)
            ax.set_title(f'{name}\nK-means Silhouette: {kmeans_score:.3f}')

        plt.suptitle('K-means Performance on Different Cluster Shapes', fontsize=14)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def handle_outliers(X, contamination=0.1, n_clusters=3):
        """Detect outliers with Isolation Forest, then cluster the inliers.

        Outliers are afterwards assigned to their nearest centroid, unless
        they are farther away than the 95th percentile of the inliers'
        distances to their own centroid; those are labelled ``-1`` (noise).

        Args:
            X: array of shape ``(n_samples, n_features)``; the first two
                columns are used for plotting.
            contamination: expected outlier fraction passed to
                ``IsolationForest``.
            n_clusters: number of K-means clusters (default 3).

        Returns:
            Tuple ``(all_labels, X_inliers, X_outliers)`` where
            ``all_labels`` is an integer array aligned with ``X``.
        """
        from sklearn.ensemble import IsolationForest

        print("\n" + "="*60)
        print("OUTLIER DETECTION AND HANDLING")
        print("="*60)

        # Detect outliers using Isolation Forest (1 = inlier, -1 = outlier)
        iso_forest = IsolationForest(contamination=contamination, random_state=42)
        outlier_labels = iso_forest.fit_predict(X)
        inlier_mask = outlier_labels == 1
        outlier_mask = outlier_labels == -1
        X_inliers = X[inlier_mask]
        X_outliers = X[outlier_mask]

        print(f"\nDetected {len(X_outliers)} outliers ({len(X_outliers)/len(X)*100:.1f}%)")
        print(f"Remaining inliers: {len(X_inliers)}")

        # Cluster only inliers
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        inlier_clusters = kmeans.fit_predict(X_inliers)
        centers = kmeans.cluster_centers_

        # Distance of every inlier to its own centroid defines the cut-off
        inlier_dist = np.linalg.norm(X_inliers - centers[inlier_clusters], axis=1)
        noise_threshold = np.percentile(inlier_dist, 95)

        # (n_outliers, n_clusters) matrix of outlier-to-centroid distances
        outlier_dist = np.linalg.norm(
            X_outliers[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2
        )
        outlier_clusters = np.where(
            outlier_dist.min(axis=1) > noise_threshold,
            -1,                                # too far from everything: noise
            outlier_dist.argmin(axis=1),
        )

        # Combine results into one label per row of X
        all_labels = np.empty(len(X), dtype=int)
        all_labels[inlier_mask] = inlier_clusters
        all_labels[outlier_mask] = outlier_clusters

        # Visualize
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # Original data with outliers
        axes[0].scatter(X[:, 0], X[:, 1], c='blue', alpha=0.5, s=30)
        axes[0].scatter(X_outliers[:, 0], X_outliers[:, 1],
                        c='red', marker='x', s=50, label='Outliers')
        axes[0].set_title('Original Data with Outliers')
        axes[0].legend()

        # K-means without outlier handling
        kmeans_all = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels_with_outliers = kmeans_all.fit_predict(X)
        axes[1].scatter(X[:, 0], X[:, 1], c=labels_with_outliers,
                        cmap='viridis', alpha=0.6, s=30)
        axes[1].scatter(kmeans_all.cluster_centers_[:, 0],
                        kmeans_all.cluster_centers_[:, 1],
                        c='red', marker='x', s=200, linewidths=3)
        axes[1].set_title('K-means with Outliers')

        # K-means after outlier handling
        axes[2].scatter(X[:, 0], X[:, 1], c=all_labels,
                        cmap='viridis', alpha=0.6, s=30)
        axes[2].scatter(centers[:, 0], centers[:, 1],
                        c='red', marker='x', s=200, linewidths=3)
        noise_points = X[all_labels == -1]
        if len(noise_points) > 0:
            axes[2].scatter(noise_points[:, 0], noise_points[:, 1],
                            c='black', marker='x', s=50, label='Noise')
            # 'Noise' is the only labelled artist on this axis, so the legend
            # is drawn only when it exists.
            axes[2].legend()
        axes[2].set_title('K-means after Outlier Handling')

        plt.suptitle('Impact of Outlier Handling on K-means', fontsize=14)
        plt.tight_layout()
        plt.show()
        return all_labels, X_inliers, X_outliers

    @staticmethod
    def compare_scaling_impact():
        """Compare K-means on raw versus standardized features.

        Builds a two-feature dataset whose features differ in scale by about
        two orders of magnitude, clusters it before and after
        ``StandardScaler``, then plots both results and prints the
        silhouette scores.

        Note: this reseeds the global NumPy RNG with 42.
        """
        # Create data with different scales
        np.random.seed(42)
        X_unscaled = np.column_stack([
            np.random.normal(100, 20, 300),  # Feature 1: scale ~100
            np.random.normal(0, 1, 300)      # Feature 2: scale ~1
        ])
        # Add true clusters
        X_unscaled[:100, 0] += 50
        X_unscaled[100:200, 1] += 2
        X_unscaled[200:, 0] -= 30
        X_unscaled[200:, 1] -= 1

        # Scale data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_unscaled)

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Same pipeline for both panels; only data and labels differ
        panels = [
            (axes[0], X_unscaled, 'Feature 1 (scale ~100)',
             'Feature 2 (scale ~1)', 'K-means on Unscaled Data'),
            (axes[1], X_scaled, 'Feature 1 (standardized)',
             'Feature 2 (standardized)', 'K-means on Scaled Data'),
        ]
        scores = []
        for ax, data, xlabel, ylabel, title in panels:
            labels = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(data)
            ax.scatter(data[:, 0], data[:, 1],
                       c=labels, cmap='viridis', alpha=0.6)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            score = silhouette_score(data, labels)
            ax.text(0.02, 0.98, f'Silhouette: {score:.3f}',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
            scores.append(score)
        score_unscaled, score_scaled = scores

        plt.suptitle('Impact of Feature Scaling on K-means', fontsize=14)
        plt.tight_layout()
        plt.show()

        print("\n" + "="*60)
        print("FEATURE SCALING IMPACT")
        print("="*60)
        print(f"Silhouette Score (Unscaled): {score_unscaled:.3f}")
        print(f"Silhouette Score (Scaled): {score_scaled:.3f}")
        print(f"Improvement: {100*(score_scaled-score_unscaled)/abs(score_unscaled):.1f}%")
# Demonstrate K-means limitations
print(f"\n{'=' * 60}")
print("K-MEANS LIMITATIONS AND SOLUTIONS")
print("=" * 60)

limitations = KMeansLimitations()

print("\n1. Non-spherical Clusters:")
limitations.demonstrate_shape_limitation()

print("\n2. Impact of Scaling:")
limitations.compare_scaling_impact()

print("\n3. Handling Outliers:")
# Generate data with outliers: three tight blobs plus uniformly scattered points
X_with_outliers = np.vstack((
    make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=42)[0],
    np.random.uniform(-8, 8, (30, 2)),  # Outliers
))
labels_with_outliers, inliers, outliers = limitations.handle_outliers(
    X_with_outliers, contamination=0.1
)
class KMeansBestPractices:
    """Best practices for using K-means clustering.

    Every method is static and only prints a fixed block of guidance text.
    """

    @staticmethod
    def preprocessing_checklist():
        """Print the preprocessing checklist for K-means."""
        text = (
            "\n"
            "K-MEANS PREPROCESSING CHECKLIST:\n"
            "================================\n"
            "1. ✓ Handle Missing Values\n"
            "- Remove or impute missing data\n"
            "- K-means cannot handle NaN values\n"
            "- Consider missingness patterns\n"
            "2. ✓ Scale Features\n"
            "- Use StandardScaler or MinMaxScaler\n"
            "- Critical when features have different units/scales\n"
            "- Consider RobustScaler for outliers\n"
            "3. ✓ Remove/Handle Outliers\n"
            "- Use IsolationForest or LOF\n"
            "- Outliers can severely affect centroids\n"
            "- Consider trimming or winsorization\n"
            "4. ✓ Feature Selection/Engineering\n"
            "- Remove irrelevant features\n"
            "- Create domain-specific features\n"
            "- Consider PCA for high-dimensional data\n"
            "5. ✓ Check for Multicollinearity\n"
            "- Highly correlated features can bias results\n"
            "- Consider removing or combining correlated features\n"
            "- Use correlation matrix or VIF\n"
            "6. ✓ Handle Categorical Variables\n"
            "- One-hot encode nominal categories\n"
            "- Consider ordinal encoding for ordinal data\n"
            "- Be aware of dimensionality increase\n"
        )
        print(text)

    @staticmethod
    def algorithm_selection_guide():
        """Print guidance on when to use K-means versus alternatives."""
        text = (
            "\n"
            "CLUSTERING ALGORITHM SELECTION GUIDE:\n"
            "=====================================\n"
            "Use K-means when:\n"
            "• Clusters are roughly spherical\n"
            "• Clusters have similar sizes\n"
            "• Clusters have similar densities\n"
            "• You know the number of clusters\n"
            "• Speed is important\n"
            "• Interpretability is needed\n"
            "• Large datasets (use Mini-batch K-means)\n"
            "Consider alternatives when:\n"
            "• Clusters have arbitrary shapes → DBSCAN, HDBSCAN\n"
            "• Clusters have different densities → DBSCAN, Mean Shift\n"
            "• Hierarchical structure exists → Agglomerative Clustering\n"
            "• Number of clusters unknown → DBSCAN, Mean Shift, HDBSCAN\n"
            "• Clusters overlap significantly → Gaussian Mixture Models\n"
            "• Data is categorical → K-modes, K-prototypes\n"
            "• Need soft clustering → Fuzzy C-means, GMM\n"
            "• Very high dimensions → Spectral Clustering, UMAP + clustering\n"
        )
        print(text)

    @staticmethod
    def evaluation_metrics_guide():
        """Print an overview of internal and external clustering metrics."""
        text = (
            "\n"
            "CLUSTERING EVALUATION METRICS:\n"
            "==============================\n"
            "Internal Metrics (no ground truth needed):\n"
            "------------------------------------------\n"
            "1. Silhouette Score [-1, 1]\n"
            "- Higher is better\n"
            "- Measures cluster separation\n"
            "- Good for comparing different k values\n"
            "2. Calinski-Harabasz Index [0, ∞)\n"
            "- Higher is better\n"
            "- Ratio of between-cluster to within-cluster variance\n"
            "- Favors convex clusters\n"
            "3. Davies-Bouldin Index [0, ∞)\n"
            "- Lower is better\n"
            "- Average similarity between clusters\n"
            "- Good for comparing algorithms\n"
            "4. Inertia/SSE [0, ∞)\n"
            "- Lower is better\n"
            "- Within-cluster sum of squares\n"
            "- Use for elbow method\n"
            "External Metrics (ground truth needed):\n"
            "----------------------------------------\n"
            "1. Adjusted Rand Index [-1, 1]\n"
            "- Higher is better\n"
            "- Adjusted for chance\n"
            "- Compares to true labels\n"
            "2. Adjusted Mutual Information [0, 1]\n"
            "- Higher is better\n"
            "- Information-theoretic measure\n"
            "- Normalized and adjusted for chance\n"
            "3. V-measure [0, 1]\n"
            "- Higher is better\n"
            "- Harmonic mean of homogeneity and completeness\n"
            "- Balanced measure\n"
        )
        print(text)

    @staticmethod
    def common_pitfalls():
        """Print common K-means pitfalls with consequence and solution."""
        # (pitfall, consequence, solution) triples, printed in this order
        entries = (
            ('Not scaling features',
             'Features with larger scale dominate',
             'Always standardize or normalize features'),
            ('Ignoring outliers',
             'Centroids pulled toward outliers',
             'Detect and handle outliers before clustering'),
            ('Wrong K selection',
             'Poor cluster quality',
             'Use multiple methods (elbow, silhouette, gap)'),
            ('Single initialization',
             'Local optimum',
             'Use n_init > 1 or k-means++'),
            ('Assuming spherical clusters',
             'Poor results on complex shapes',
             'Visualize data, consider DBSCAN'),
        )
        print("\nCOMMON K-MEANS PITFALLS:")
        print("=" * 50)
        for pitfall, consequence, solution in entries:
            print(f"\n❌ Pitfall: {pitfall}")
            print(f" Consequence: {consequence}")
            print(f" ✓ Solution: {solution}")
# Print best practices
practices = KMeansBestPractices()
practices.preprocessing_checklist()
practices.algorithm_selection_guide()
practices.evaluation_metrics_guide()
practices.common_pitfalls()

# Summary
print(f"\n{'=' * 60}")
print("K-MEANS CLUSTERING COMPLETE GUIDE SUMMARY")
print("=" * 60)

# Closing recap: takeaways, recommended workflow and typical applications
summary = (
    "\n"
    "KEY TAKEAWAYS:\n"
    "• K-means is fast, scalable, and interpretable\n"
    "• Requires number of clusters (k) to be specified\n"
    "• Sensitive to initialization (use k-means++)\n"
    "• Assumes spherical clusters of similar size\n"
    "• Sensitive to outliers and scale\n"
    "• Always preprocess data properly\n"
    "• Use multiple metrics to evaluate results\n"
    "• Consider alternatives for non-spherical clusters\n"
    "WORKFLOW:\n"
    "1. Explore and understand your data\n"
    "2. Preprocess (scale, handle outliers, missing values)\n"
    "3. Determine optimal k using multiple methods\n"
    "4. Apply K-means with proper initialization\n"
    "5. Evaluate results with appropriate metrics\n"
    "6. Visualize and interpret clusters\n"
    "7. Iterate if necessary\n"
    "APPLICATIONS:\n"
    "• Customer segmentation\n"
    "• Image compression\n"
    "• Document clustering\n"
    "• Anomaly detection (preprocessing)\n"
    "• Feature learning\n"
    "• Market segmentation\n"
    "• Recommendation systems\n"
)
print(summary)
# Build an end-to-end clustering system that:
# Implement a streaming customer segmentation system that:
# Create an intelligent image compression system that: