UMAP (Uniform Manifold Approximation and Projection) is a cutting-edge dimensionality reduction technique that has quickly become a favorite in the data science community. Developed in 2018, UMAP offers the visualization quality of t-SNE with better preservation of global structure, faster computation, and the ability to transform new data. It's based on manifold theory and topological data analysis, making it both mathematically rigorous and practically powerful.
# First, install UMAP if not already installed
# pip install umap-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits, load_iris, fetch_olivetti_faces
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import time
import warnings
warnings.filterwarnings('ignore')
# Set style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*";
# older releases only know the un-versioned name. plt.style.use raises OSError
# for an unknown style, so try the current name first and fall back to the
# legacy one instead of crashing on an older Matplotlib.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")
# Section banner: rule, title, rule -- one per output line.
print("=" * 60, "UMAP FUNDAMENTALS", "=" * 60, sep="\n")

# Core concepts, kept one entry per output line. The empty first and last
# entries produce the leading and trailing newline of the printed text.
umap_concepts = "\n".join([
    "",
    "UMAP KEY CONCEPTS:",
    "1. THEORETICAL FOUNDATION:",
    "• Based on Riemannian geometry and algebraic topology",
    "• Assumes data is uniformly distributed on manifold",
    "• Preserves topological structure",
    "• Uses fuzzy topological representation",
    "2. ALGORITHM OVERVIEW:",
    "• Construct fuzzy topological representation (simplicial complex)",
    "• Find low-dimensional representation with similar topology",
    "• Optimize layout using force-directed graph layout",
    "• Cross-entropy minimization",
    "3. KEY PARAMETERS:",
    "• n_neighbors: Local vs global structure balance (5-50, default 15)",
    "• min_dist: Minimum distance between points (0.0-0.99, default 0.1)",
    "• n_components: Output dimensions (usually 2 or 3)",
    "• metric: Distance metric (euclidean, manhattan, etc.)",
    "4. ADVANTAGES OVER t-SNE:",
    "• Preserves more global structure",
    "• Significantly faster (O(n^1.14) vs O(n^2))",
    "• Can transform new data",
    "• Supports custom distance metrics",
    "• Works with sparse data",
    "• Scales to millions of samples",
    "5. WHEN TO USE UMAP:",
    "• Large datasets (>10,000 samples)",
    "• Need to preserve global relationships",
    "• Real-time or interactive applications",
    "• Need to project new data",
    "• Working with sparse data (text, genomics)",
    "",
])
print(umap_concepts)
class UMAPAnalyzer:
    """Compare UMAP with PCA and t-SNE and explore UMAP's main parameters.

    Attributes:
        models: method name -> fitted estimator from the last comparison.
        embeddings: method name -> 2-D embedding from the last comparison.
        times: method name -> fit time in seconds from the last comparison.
    """

    def __init__(self):
        self.models = {}
        self.embeddings = {}
        self.times = {}

    @staticmethod
    def _scatter_embedding(ax, embedding, y=None):
        """Scatter the first two columns of *embedding* on *ax*.

        Points are colored by *y* (viridis colormap) when *y* is given.
        Returns the scatter artist so the caller can attach a colorbar.
        """
        style = {'alpha': 0.6, 's': 30}
        if y is not None:
            style.update(c=y, cmap='viridis')
        return ax.scatter(embedding[:, 0], embedding[:, 1], **style)

    @staticmethod
    def _decorate(ax, title, xlabel, ylabel):
        """Apply the title, axis labels and light grid shared by all panels."""
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)

    @staticmethod
    def _distance_correlation(original_dist, embedded_dist):
        """Return the Pearson correlation of two pairwise-distance matrices.

        Only the strict upper triangle is compared: the diagonal is zero in
        both matrices and the lower triangle repeats the upper one, so
        including them would add artificial agreement to the score.
        """
        original_dist = np.asarray(original_dist)
        embedded_dist = np.asarray(embedded_dist)
        rows, cols = np.triu_indices(original_dist.shape[0], k=1)
        return float(np.corrcoef(original_dist[rows, cols],
                                 embedded_dist[rows, cols])[0, 1])

    def compare_with_other_methods(self, X, y=None):
        """Compare UMAP with PCA and t-SNE on standardized data.

        Fits the three reducers to 2 components, times each fit, and draws a
        2x3 figure: one scatter per method, a timing bar chart, a
        distance-preservation bar chart and a qualitative feature table.

        Args:
            X: feature matrix; it is standardized with StandardScaler first.
            y: optional per-sample values used to color the scatter plots.

        Returns:
            Dict mapping 'PCA', 't-SNE' and 'UMAP' to their embeddings. The
            same dict is stored on ``self.embeddings``; ``self.models`` and
            ``self.times`` are replaced as well.
        """
        from sklearn.metrics import pairwise_distances

        # Standardize data
        X_scaled = StandardScaler().fit_transform(X)

        reducers = {
            'PCA': PCA(n_components=2),
            't-SNE': TSNE(n_components=2, random_state=42),
            'UMAP': umap.UMAP(n_components=2, random_state=42),
        }
        methods = list(reducers)

        embeddings = {}
        times = {}
        for method, model in reducers.items():
            print(f"Running {method}...")
            start = time.time()
            embeddings[method] = model.fit_transform(X_scaled)
            times[method] = time.time() - start

        # Store results only once every method has finished.
        self.models = reducers
        self.embeddings = embeddings
        self.times = times

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        bar_colors = ['blue', 'orange', 'green']

        # Top row: one scatter plot per method
        for ax, method in zip(axes[0], methods):
            scatter = self._scatter_embedding(ax, embeddings[method], y)
            if y is not None:
                plt.colorbar(scatter, ax=ax)
            self._decorate(ax, f'{method} ({times[method]:.3f}s)',
                           'Component 1', 'Component 2')

        # Time comparison
        axes[1, 0].bar(methods, [times[m] for m in methods], color=bar_colors)
        axes[1, 0].set_ylabel('Time (seconds)')
        axes[1, 0].set_title('Computation Time Comparison')
        axes[1, 0].grid(True, alpha=0.3, axis='y')

        # Distance preservation analysis on the first 100 samples (for
        # efficiency). Scores are looked up by method name so each bar is
        # guaranteed to sit under its own label.
        original_dist = pairwise_distances(X_scaled[:100])
        correlations = [
            self._distance_correlation(
                original_dist, pairwise_distances(embeddings[m][:100]))
            for m in methods
        ]
        axes[1, 1].bar(methods, correlations, color=bar_colors)
        axes[1, 1].set_ylabel('Correlation')
        axes[1, 1].set_title('Distance Preservation')
        # A correlation can be negative; extend the axis downward in that
        # case rather than silently hiding the bar below zero.
        lower = min([0.0] + [c for c in correlations if c < 0])
        axes[1, 1].set_ylim(lower, 1)
        axes[1, 1].grid(True, alpha=0.3, axis='y')

        # Feature comparison table, one row per method
        feature_rows = [
            ['PCA', 'High', 'Low', 'Fast', 'Yes'],
            ['t-SNE', 'Low', 'High', 'Slow', 'No'],
            ['UMAP', 'Medium-High', 'High', 'Medium', 'Yes'],
        ]
        table = axes[1, 2].table(
            cellText=feature_rows,
            colLabels=['Method', 'Global', 'Local', 'Speed', 'New Data'],
            cellLoc='center',
            loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)
        axes[1, 2].axis('off')
        axes[1, 2].set_title('Feature Comparison')

        plt.suptitle('Dimensionality Reduction Methods Comparison',
                     fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
        return self.embeddings

    def parameter_exploration(self, X, y=None):
        """Explore UMAP parameters: n_neighbors and min_dist.

        Draws a 3x3 grid: row 1 varies n_neighbors, row 2 varies min_dist and
        row 3 varies both together. X is embedded as given; no scaling is
        applied here.

        Args:
            X: feature matrix to embed.
            y: optional per-sample values used to color the points.
        """
        # Each entry: (UMAP keyword overrides, panel title)
        panel_rows = [
            [({'n_neighbors': n}, f'n_neighbors = {n}')
             for n in (5, 15, 50)],
            [({'min_dist': d}, f'min_dist = {d}')
             for d in (0.0, 0.1, 0.5)],
            [({'n_neighbors': n, 'min_dist': d}, f'n_neigh={n}, min_d={d}')
             for n, d in ((5, 0.0), (15, 0.1), (50, 0.5))],
        ]

        fig, axes = plt.subplots(3, 3, figsize=(15, 15))
        for ax_row, panels in zip(axes, panel_rows):
            for ax, (params, title) in zip(ax_row, panels):
                reducer = umap.UMAP(random_state=42, **params)
                embedding = reducer.fit_transform(X)
                self._scatter_embedding(ax, embedding, y)
                self._decorate(ax, title, 'UMAP 1', 'UMAP 2')

        plt.suptitle('UMAP Parameter Exploration', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
# Load sample data
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

# Sample for faster computation. A seeded generator keeps the subset -- and
# therefore every figure built from it -- the same from run to run, matching
# the fixed random_state passed to the reducers. The size is capped so the
# draw without replacement cannot ask for more rows than exist.
sample_rng = np.random.default_rng(42)
sample_size = min(500, len(X_digits))
sample_idx = sample_rng.choice(len(X_digits), sample_size, replace=False)
X_sample = X_digits[sample_idx]
y_sample = y_digits[sample_idx]

# Initialize analyzer
analyzer = UMAPAnalyzer()

print("\n" + "="*60)
print("COMPARING DIMENSIONALITY REDUCTION METHODS")
print("="*60)
embeddings = analyzer.compare_with_other_methods(X_sample, y_sample)

print("\n" + "="*60)
print("UMAP PARAMETER EXPLORATION")
print("="*60)
analyzer.parameter_exploration(X_sample, y_sample)
class AdvancedUMAP:
    """Advanced UMAP techniques and applications."""

    def __init__(self):
        self.models = {}

    @staticmethod
    def _mask_labels(y, fraction=0.5, seed=42):
        """Return a copy of *y* with roughly *fraction* of entries set to -1.

        -1 is the "unknown label" marker passed to UMAP for semi-supervised
        fitting. The input is never modified, list input is accepted, and the
        mask is drawn from a seeded generator so repeated runs hide the same
        samples.
        """
        labels = np.array(y, copy=True)
        rng = np.random.default_rng(seed)
        labels[rng.random(len(labels)) < fraction] = -1
        return labels

    @staticmethod
    def _plot_embedding(ax, embedding, title, c=None, marker=None,
                        colorbar=False):
        """Draw one labelled UMAP scatter panel on *ax*.

        Points are colored by *c* (viridis) when given; *colorbar* adds a
        colorbar for that coloring. *marker* is forwarded to ``ax.scatter``.
        """
        style = {'alpha': 0.6, 's': 30, 'marker': marker}
        if c is not None:
            style.update(c=c, cmap='viridis')
        scatter = ax.scatter(embedding[:, 0], embedding[:, 1], **style)
        ax.set_title(title)
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.grid(True, alpha=0.3)
        if colorbar and c is not None:
            plt.colorbar(scatter, ax=ax)

    def supervised_umap(self, X, y):
        """Supervised and semi-supervised UMAP.

        Fits three reducers on X -- without labels, with all labels, and with
        about half of the labels replaced by -1 -- and plots the embeddings
        side by side, always colored by the true labels.

        Args:
            X: feature matrix.
            y: label per sample (array or list).

        Returns:
            Tuple ``(unsupervised, supervised, semi_supervised)`` of the
            fitted reducers.
        """
        y = np.asarray(y)

        # Unsupervised UMAP
        print("Unsupervised UMAP...")
        unsupervised = umap.UMAP(n_components=2, random_state=42)
        X_unsupervised = unsupervised.fit_transform(X)

        # Supervised UMAP (uses labels)
        print("Supervised UMAP...")
        supervised = umap.UMAP(n_components=2, random_state=42)
        X_supervised = supervised.fit_transform(X, y)

        # Semi-supervised UMAP (partial labels; unknown marked with -1)
        print("Semi-supervised UMAP...")
        y_partial = self._mask_labels(y, fraction=0.5)
        semi_supervised = umap.UMAP(n_components=2, random_state=42)
        X_semi = semi_supervised.fit_transform(X, y_partial)

        # Visualizations
        panels = [
            ('Unsupervised', X_unsupervised),
            ('Supervised', X_supervised),
            ('Semi-supervised (50% labels)', X_semi),
        ]
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        for ax, (title, embedding) in zip(axes, panels):
            self._plot_embedding(ax, embedding, title, c=y, colorbar=True)

        plt.suptitle('Supervised vs Unsupervised UMAP', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
        return unsupervised, supervised, semi_supervised

    def transform_new_data(self, X_train, X_test, y_train=None, y_test=None):
        """Demonstrate UMAP's ability to transform new data.

        Fits a reducer on X_train, projects X_test with ``transform`` and
        plots both embeddings (test points drawn as triangles).

        Args:
            X_train: data the reducer is fitted on.
            X_test: unseen data projected with the fitted reducer.
            y_train: optional values coloring the training points.
            y_test: optional values coloring the test points.

        Returns:
            Tuple ``(reducer, X_train_embedded, X_test_embedded)``.
        """
        print("Training UMAP on training data...")
        reducer = umap.UMAP(n_components=2, random_state=42)
        X_train_embedded = reducer.fit_transform(X_train)

        print("Transforming test data...")
        X_test_embedded = reducer.transform(X_test)

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        self._plot_embedding(axes[0], X_train_embedded,
                             'Training Data Embedding',
                             c=y_train, colorbar=True)
        self._plot_embedding(axes[1], X_test_embedded,
                             'Test Data Transformation',
                             c=y_test, marker='^', colorbar=True)

        plt.suptitle('UMAP Transform New Data Capability', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
        return reducer, X_train_embedded, X_test_embedded

    def custom_metrics(self, X, y=None):
        """Use different distance metrics with UMAP.

        Embeds one dataset with the euclidean, manhattan, chebyshev and
        cosine metrics and plots the four results.

        Args:
            X: feature matrix. It is only used when *y* is also given.
            y: optional labels for X. When omitted, the demo keeps its
                original behavior and runs on a generated 300-sample,
                20-feature classification dataset instead of X, so the points
                can still be colored by class.
        """
        if y is None:
            from sklearn.datasets import make_classification
            # Generate labelled data
            X_custom, y_custom = make_classification(
                n_samples=300, n_features=20,
                n_informative=10, n_redundant=5,
                n_clusters_per_class=2,
                random_state=42)
        else:
            X_custom, y_custom = X, y

        # Different metrics
        metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine']
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        for ax, metric in zip(axes.ravel(), metrics):
            print(f"UMAP with {metric} distance...")
            reducer = umap.UMAP(n_components=2, metric=metric, random_state=42)
            embedding = reducer.fit_transform(X_custom)
            self._plot_embedding(ax, embedding,
                                 f'{metric.capitalize()} Distance',
                                 c=y_custom, colorbar=True)

        plt.suptitle('UMAP with Different Distance Metrics',
                     fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

    def umap_for_clustering(self, X, n_clusters=3, eps=0.5, min_samples=5):
        """Use UMAP as preprocessing for clustering.

        Runs K-means on the original features and on the 2-D UMAP embedding,
        runs DBSCAN on the embedding, and shows all three labelings in UMAP
        coordinates.

        Args:
            X: feature matrix.
            n_clusters: number of K-means clusters (default 3, as before).
            eps: DBSCAN neighborhood radius (default 0.5, as before).
            min_samples: DBSCAN core-point threshold (default 5, as before).

        Returns:
            Tuple ``(X_umap, labels_umap)``: the embedding and the K-means
            labels computed on it.
        """
        from sklearn.cluster import KMeans, DBSCAN
        from sklearn.metrics import silhouette_score

        # Original data clustering
        print("Clustering on original data...")
        kmeans_original = KMeans(n_clusters=n_clusters, random_state=42)
        labels_original = kmeans_original.fit_predict(X)
        silhouette_original = silhouette_score(X, labels_original)

        # UMAP reduction
        print("Applying UMAP...")
        reducer = umap.UMAP(n_components=2, random_state=42)
        X_umap = reducer.fit_transform(X)

        # Clustering on UMAP embedding
        print("Clustering on UMAP embedding...")
        kmeans_umap = KMeans(n_clusters=n_clusters, random_state=42)
        labels_umap = kmeans_umap.fit_predict(X_umap)
        silhouette_umap = silhouette_score(X_umap, labels_umap)

        # DBSCAN on UMAP; label -1 is DBSCAN's noise marker, not a cluster
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels_dbscan = dbscan.fit_predict(X_umap)
        n_found = len(set(labels_dbscan)) - (1 if -1 in labels_dbscan else 0)

        # Visualization: every panel uses the UMAP coordinates
        panels = [
            (labels_original,
             f'K-means on Original\nSilhouette: {silhouette_original:.3f}'),
            (labels_umap,
             f'K-means on UMAP\nSilhouette: {silhouette_umap:.3f}'),
            (labels_dbscan,
             f'DBSCAN on UMAP\n{n_found} clusters found'),
        ]
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        for ax, (labels, title) in zip(axes, panels):
            self._plot_embedding(ax, X_umap, title, c=labels)

        plt.suptitle('UMAP for Clustering Preprocessing', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
        return X_umap, labels_umap
# Advanced features: run the four AdvancedUMAP demos on the digits sample.
advanced = AdvancedUMAP()
print("\n" + "=" * 60, "ADVANCED UMAP FEATURES", "=" * 60, sep="\n")

print("\n1. Supervised vs Unsupervised UMAP:")
unsup, sup, semi = advanced.supervised_umap(X_sample, y_sample)

print("\n2. Transform New Data:")
# First half of the sample is used for fitting, second half is the "new" data.
split_idx = len(X_sample) // 2
X_train, X_test = X_sample[:split_idx], X_sample[split_idx:]
y_train, y_test = y_sample[:split_idx], y_sample[split_idx:]
reducer, X_train_emb, X_test_emb = advanced.transform_new_data(
    X_train, X_test, y_train=y_train, y_test=y_test
)

print("\n3. Custom Distance Metrics:")
advanced.custom_metrics(X_sample)

print("\n4. UMAP for Clustering:")
X_umap_cluster, cluster_labels = advanced.umap_for_clustering(X_sample)
class UMAPApplications:
    """Real-world applications of UMAP."""

    def __init__(self):
        self.results = {}

    @staticmethod
    def _speed_summary(tsne_time, umap_time):
        """Return ``(speedup, phrase)`` comparing UMAP's run time to t-SNE's.

        ``speedup`` is ``tsne_time / umap_time`` (infinite when UMAP's time is
        zero). The phrase says "faster" only when UMAP really was at least as
        fast and reports the slowdown factor otherwise, so the plot title
        cannot claim a speedup that did not happen.
        """
        speedup = tsne_time / umap_time if umap_time > 0 else float('inf')
        if speedup >= 1:
            return speedup, f'UMAP is {speedup:.1f}x faster'
        slowdown = umap_time / tsne_time if tsne_time > 0 else float('inf')
        return speedup, f'UMAP is {slowdown:.1f}x slower'

    def text_embeddings_visualization(self):
        """Visualize text embeddings using UMAP.

        Vectorizes twenty short sentences (five per topic) with TF-IDF,
        embeds them in 2-D with cosine-metric UMAP and plots them colored by
        topic, with each point annotated by the start of its sentence.

        Returns:
            The 2-D UMAP embedding, one row per sentence.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Sample documents, grouped by category
        corpus = {
            'Tech': [
                "Machine learning algorithms process data automatically",
                "Deep learning neural networks require GPUs",
                "Artificial intelligence transforms industries",
                "Data science involves statistics and programming",
                "Cloud computing provides scalable resources",
            ],
            'Sports': [
                "Football teams compete in tournaments",
                "Basketball players score points in games",
                "Tennis matches require skill and endurance",
                "Swimming races test speed and technique",
                "Marathon running demands physical fitness",
            ],
            'Science': [
                "Biology studies living organisms and ecosystems",
                "Chemistry explores molecular structures and reactions",
                "Physics explains fundamental forces and motion",
                "Astronomy observes stars and galaxies",
                "Geology examines Earth's rocks and minerals",
            ],
            'Food': [
                "Italian pasta dishes use fresh ingredients",
                "Japanese sushi requires skilled preparation",
                "Mexican tacos combine spices and flavors",
                "French cuisine emphasizes technique and presentation",
                "Indian curry blends aromatic spices",
            ],
        }
        colors = {'Tech': 'red', 'Sports': 'blue',
                  'Science': 'green', 'Food': 'orange'}
        documents = [doc for docs in corpus.values() for doc in docs]
        categories = np.array(
            [cat for cat, docs in corpus.items() for _ in docs])

        # TF-IDF vectorization
        print("Vectorizing text...")
        vectorizer = TfidfVectorizer(max_features=100)
        X_text = vectorizer.fit_transform(documents).toarray()

        # UMAP embedding
        print("Applying UMAP to text embeddings...")
        reducer = umap.UMAP(n_components=2, metric='cosine', random_state=42)
        X_text_umap = reducer.fit_transform(X_text)

        # Visualization: one scatter call per category so the legend works
        fig, ax = plt.subplots(figsize=(10, 8))
        for cat, color in colors.items():
            mask = categories == cat
            ax.scatter(X_text_umap[mask, 0], X_text_umap[mask, 1],
                       c=color, label=cat, alpha=0.7, s=100)

        # Add text labels, shortened to 30 characters for display
        for (x_pos, y_pos), doc in zip(X_text_umap, documents):
            short_doc = doc[:30] + '...' if len(doc) > 30 else doc
            ax.annotate(short_doc, xy=(x_pos, y_pos),
                        fontsize=8, alpha=0.7, ha='center')

        ax.set_title('UMAP Visualization of Text Embeddings')
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print("\nText Embedding Results:")
        print(f" Original dimensions: {X_text.shape[1]}")
        print(" Reduced dimensions: 2")
        print(f" Categories: {len(corpus)}")
        return X_text_umap

    def high_dimensional_genomics(self):
        """Simulate genomics data visualization.

        Generates log-normal "expression" values for four cell types, boosts
        100 randomly chosen marker genes per type, applies log1p and a 50
        component PCA, then embeds the result with t-SNE and UMAP and plots
        both embeddings next to a run-time comparison.

        Returns:
            Tuple ``(X_umap, y_genomics)``: the UMAP embedding and the integer
            cell-type index of every simulated sample.
        """
        # A local, seeded generator keeps the simulation reproducible without
        # resetting NumPy's global random state for the rest of the program.
        rng = np.random.default_rng(42)

        # Simulate gene expression data
        n_genes = 1000
        # Different cell types with characteristic expression patterns
        cell_types = {
            'Neuron': 150,
            'Astrocyte': 150,
            'Microglia': 100,
            'Oligodendrocyte': 100
        }
        # Derived from the counts above so the reported total cannot drift.
        n_samples = sum(cell_types.values())

        X_genomics = []
        y_genomics = []
        for cell_idx, n_cells in enumerate(cell_types.values()):
            # Base expression
            base_expr = rng.lognormal(0, 1, (n_cells, n_genes))
            # Cell-type specific genes
            marker_genes = rng.choice(n_genes, 100, replace=False)
            base_expr[:, marker_genes] *= (cell_idx + 1) * 3
            X_genomics.append(base_expr)
            y_genomics.extend([cell_idx] * n_cells)
        X_genomics = np.vstack(X_genomics)
        y_genomics = np.array(y_genomics)

        # Log transformation (common in genomics)
        X_genomics = np.log1p(X_genomics)

        # PCA for initial reduction
        print("Initial PCA reduction...")
        pca = PCA(n_components=50, random_state=42)
        X_pca = pca.fit_transform(X_genomics)

        # Compare t-SNE and UMAP for genomics
        print("Running t-SNE...")
        start = time.perf_counter()
        tsne = TSNE(n_components=2, random_state=42)
        X_tsne = tsne.fit_transform(X_pca)
        tsne_time = time.perf_counter() - start

        print("Running UMAP...")
        start = time.perf_counter()
        reducer = umap.UMAP(n_components=2, random_state=42)
        X_umap = reducer.fit_transform(X_pca)
        umap_time = time.perf_counter() - start

        # Visualization
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        panels = [
            ('t-SNE', X_tsne, tsne_time),
            ('UMAP', X_umap, umap_time),
        ]
        for ax, (name, embedding, elapsed) in zip(axes, panels):
            for cell_idx, cell_type in enumerate(cell_types):
                mask = y_genomics == cell_idx
                ax.scatter(embedding[mask, 0], embedding[mask, 1],
                           label=cell_type, alpha=0.6, s=20)
            ax.set_title(f'{name} ({elapsed:.2f}s)')
            ax.set_xlabel(f'{name} 1')
            ax.set_ylabel(f'{name} 2')
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Speedup comparison
        speedup, speed_phrase = self._speed_summary(tsne_time, umap_time)
        axes[2].bar(['t-SNE', 'UMAP'], [tsne_time, umap_time],
                    color=['orange', 'green'])
        axes[2].set_ylabel('Time (seconds)')
        axes[2].set_title(f'Speed Comparison\n({speed_phrase})')
        axes[2].grid(True, alpha=0.3, axis='y')

        plt.suptitle('High-Dimensional Genomics Data Visualization',
                     fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

        print("\nGenomics Analysis Results:")
        print(f" Samples: {n_samples}")
        print(f" Genes: {n_genes}")
        print(f" Cell types: {len(cell_types)}")
        print(f" UMAP speedup: {speedup:.1f}x")
        return X_umap, y_genomics

    def interactive_exploration(self, X, y=None,
                                n_neighbors_values=(5, 15, 30, 50),
                                min_dist_values=(0.0, 0.1, 0.3, 0.5)):
        """Create data for interactive UMAP exploration.

        Fits one UMAP embedding per (n_neighbors, min_dist) combination and
        shows them in a grid with one row per n_neighbors value and one
        column per min_dist value.

        Args:
            X: feature matrix to embed.
            y: optional per-sample values used to color the points.
            n_neighbors_values: n_neighbors settings, one grid row each.
            min_dist_values: min_dist settings, one grid column each.

        Returns:
            List of dicts with keys 'n_neighbors', 'min_dist' and 'embedding',
            ordered row by row.

        Raises:
            ValueError: if either list of parameter values is empty.
        """
        n_neighbors_values = list(n_neighbors_values)
        min_dist_values = list(min_dist_values)
        if not n_neighbors_values or not min_dist_values:
            raise ValueError(
                "n_neighbors_values and min_dist_values must not be empty")

        print("Creating embeddings with different parameters...")
        results = []
        for n_neigh in n_neighbors_values:
            for min_d in min_dist_values:
                reducer = umap.UMAP(n_neighbors=n_neigh,
                                    min_dist=min_d,
                                    random_state=42)
                results.append({
                    'n_neighbors': n_neigh,
                    'min_dist': min_d,
                    'embedding': reducer.fit_transform(X),
                })

        # Create visualization grid sized from the parameter lists.
        # squeeze=False keeps `axes` two-dimensional even for a single row or
        # column, and ravel() walks it in the same row-major order as results.
        n_rows, n_cols = len(n_neighbors_values), len(min_dist_values)
        fig, axes = plt.subplots(n_rows, n_cols,
                                 figsize=(4 * n_cols, 4 * n_rows),
                                 squeeze=False)
        for ax, result in zip(axes.ravel(), results):
            embedding = result['embedding']
            style = {'alpha': 0.6, 's': 10}
            if y is not None:
                style.update(c=y, cmap='viridis')
            ax.scatter(embedding[:, 0], embedding[:, 1], **style)
            ax.set_title(f"n={result['n_neighbors']}, "
                         f"d={result['min_dist']}", fontsize=8)
            ax.set_xticks([])
            ax.set_yticks([])

        plt.suptitle('UMAP Parameter Grid Exploration', fontsize=14, y=1.002)
        plt.tight_layout()
        plt.show()
        return results
# Applications: run the three UMAPApplications demos in order.
apps = UMAPApplications()
print("\n" + "=" * 60, "UMAP REAL-WORLD APPLICATIONS", "=" * 60, sep="\n")

print("\n1. Text Embeddings Visualization:")
X_text_umap = apps.text_embeddings_visualization()

print("\n2. High-Dimensional Genomics Data:")
X_genomics_umap, y_genomics = apps.high_dimensional_genomics()

print("\n3. Interactive Parameter Exploration:")
# Only the first 200 samples are used for the parameter grid.
param_results = apps.interactive_exploration(X_sample[:200], y=y_sample[:200])
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "UMAP BEST PRACTICES", "=" * 60, sep="\n")

# Guidelines text, one entry per output line. The empty first and last
# entries produce the leading and trailing newline of the printed text.
best_practices = "\n".join([
    "",
    "KEY GUIDELINES:",
    "1. DATA PREPROCESSING:",
    "• Standardize/normalize features (usually helpful)",
    "• Handle missing values before UMAP",
    "• Consider PCA for very high dimensions (>100)",
    "• Log-transform skewed data",
    "2. PARAMETER SELECTION:",
    "• n_neighbors: Controls local vs global balance",
    "- Small (2-10): Focus on local structure",
    "- Medium (10-50): Balanced",
    "- Large (50-200): More global structure",
    "• min_dist: Controls clustering tightness",
    "- 0.0: Tightly packed clusters",
    "- 0.1-0.25: Default range, good separation",
    "- 0.5-1.0: Looser, more uniform distribution",
    "3. METRIC SELECTION:",
    "• euclidean: Standard for continuous features",
    "• manhattan: Robust to outliers",
    "• cosine: Text data, high-dimensional sparse",
    "• correlation: Gene expression data",
    "• Custom metrics: Domain-specific needs",
    "4. COMPUTATIONAL TIPS:",
    "• Use init='spectral' for reproducibility",
    "• Set random_state for consistency",
    "• Use n_jobs=-1 for parallel processing",
    "• Consider approximate nearest neighbors for large data",
    "5. WHEN TO USE UMAP:",
    "✓ Large datasets (>10,000 samples)",
    "✓ Need to preserve global structure",
    "✓ Need to transform new data",
    "✓ Working with sparse data",
    "✓ Interactive/real-time applications",
    "6. ADVANTAGES:",
    "• Fast (O(n^1.14) complexity)",
    "• Preserves local AND global structure",
    "• Can transform new data",
    "• Supports supervised learning",
    "• Works with custom metrics",
    "• Handles sparse data well",
    "7. LIMITATIONS:",
    "• Results can vary with parameters",
    "• Less interpretable than PCA",
    "• Assumes manifold structure",
    "• Memory intensive for very large datasets",
    "",
])
print(best_practices)
# Comparison table. The data is written one row per aspect (easier to read
# and edit) and transposed with zip(*rows) into the column -> values mapping
# that the DataFrame is built from.
comparison_data = {
    column: list(values)
    for column, values in zip(
        ('Aspect', 'PCA', 't-SNE', 'UMAP'),
        zip(
            ('Speed', 'Fast', 'Slow', 'Medium'),
            ('Global', 'High', 'Low', 'Med-High'),
            ('Local', 'Low', 'High', 'High'),
            ('New Data', 'Yes', 'No', 'Yes'),
            ('Sparse', 'Yes', 'No', 'Yes'),
            ('Interpret', 'High', 'Low', 'Medium'),
        ),
    )
}
comparison_df = pd.DataFrame(comparison_data)
print(
    "\nDimensionality Reduction Comparison:",
    "=" * 60,
    comparison_df.to_string(index=False),
    sep="\n",
)
# Parameter guidelines: two small lookup tables (by dataset size and by data
# type), one entry per output line. The empty first and last entries produce
# the leading and trailing newline of the printed text.
param_guidelines = "\n".join([
    "",
    "PARAMETER SELECTION GUIDE:",
    "Dataset Size | n_neighbors | min_dist",
    "--------------------|-------------|----------",
    "< 1,000 samples | 5-15 | 0.01-0.1",
    "1,000-10,000 | 10-30 | 0.1-0.25",
    "10,000-100,000 | 15-50 | 0.1-0.3",
    "> 100,000 | 30-100 | 0.25-0.5",
    "Data Type | Recommended Metric",
    "--------------------|------------------",
    "Continuous | euclidean, manhattan",
    "Text/Sparse | cosine, jaccard",
    "Genomics | correlation, cosine",
    "Binary | hamming, jaccard",
    "Mixed | gower (custom)",
    "",
])
print(param_guidelines)
# Troubleshooting guide, assembled from (problem, solution) pairs so each
# issue and its remedy stay together; every pair renders as two lines.
troubleshooting = "\nTROUBLESHOOTING COMMON ISSUES:\n" + "".join(
    f"Problem: {problem}\nSolution: {solution}\n"
    for problem, solution in (
        ("Clusters too tight/overlapping",
         "Increase min_dist (try 0.3-0.5)"),
        ("Lost global structure",
         "Increase n_neighbors (try 30-100)"),
        ("Too much noise/scattered points",
         "Decrease n_neighbors (try 5-15)"),
        ("Slow performance",
         "Use PCA first, reduce n_neighbors, use approximate NN"),
        ("Different results each run",
         "Set random_state, use init='spectral'"),
        ("Memory errors",
         "Subsample data, use PCA first, reduce n_neighbors"),
    )
)
print(troubleshooting)
1. Build a complete UMAP analysis pipeline.
2. Implement a custom distance metric for UMAP.
3. Create an interactive UMAP dashboard.