Neural networks are the foundation of deep learning, inspired by the human brain's structure. They consist of interconnected layers of neurons whose weights are adjusted by gradient descent, using gradients computed by a process called backpropagation, so that the network can learn complex patterns in data. From simple perceptrons to deep architectures, neural networks have revolutionized fields like computer vision, natural language processing, and reinforcement learning. This lesson covers the fundamentals: neurons, activation functions, forward propagation, backpropagation, and building your first networks.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_moons, make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
# Silence library warnings so the lesson output stays readable.
warnings.filterwarnings(action='ignore')

# Global plotting look: the matplotlib style sheet is applied first, then the
# seaborn palette (this order is kept on purpose).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Lesson title banner.
print("=" * 60, "NEURAL NETWORK FUNDAMENTALS", "=" * 60, sep="\n")
# Core concepts: a plain-text cheat sheet (neuron, activations, architecture,
# forward/backward pass, gradient descent variants, loss functions) that is
# printed verbatim before any code runs. The text is runtime output, so it is
# kept exactly as written.
nn_concepts = """
NEURAL NETWORK KEY CONCEPTS:
1. NEURON (PERCEPTRON):
• Basic unit of neural network
• Inputs: x₁, x₂, ..., xₙ
• Weights: w₁, w₂, ..., wₙ
• Bias: b
• Output: y = activation(Σ(wᵢxᵢ) + b)
2. ACTIVATION FUNCTIONS:
• Sigmoid: σ(x) = 1/(1 + e^(-x))
• Tanh: tanh(x) = (e^x - e^(-x))/(e^x + e^(-x))
• ReLU: f(x) = max(0, x)
• Leaky ReLU: f(x) = max(0.01x, x)
• Softmax: for multi-class output
3. NETWORK ARCHITECTURE:
• Input Layer: Features
• Hidden Layers: Learn representations
• Output Layer: Predictions
• Depth: Number of layers
• Width: Neurons per layer
4. FORWARD PROPAGATION:
• Pass input through network
• Apply weights, biases, activations
• Generate predictions
5. BACKPROPAGATION:
• Calculate loss/error
• Compute gradients via chain rule
• Update weights to minimize loss
6. GRADIENT DESCENT:
• Batch: Use entire dataset
• Stochastic (SGD): One sample at a time
• Mini-batch: Small batches
• Learning rate: Step size
7. LOSS FUNCTIONS:
• MSE: Regression
• Cross-entropy: Classification
• Custom losses for specific tasks
"""
# Show the cheat sheet at the top of the lesson output.
print(nn_concepts)
class NeuralNetworkFromScratch:
    """Fully connected feed-forward classifier built with NumPy only.

    All hidden layers share one activation (``'relu'``, ``'tanh'``, or
    sigmoid for any other value).  The output layer is a single sigmoid unit
    when ``layer_sizes[-1] == 1`` (binary classification) and a softmax layer
    otherwise.  Training is mini-batch gradient descent on the matching
    cross-entropy loss.
    """

    def __init__(self, layer_sizes, activation='relu', learning_rate=0.01,
                 random_state=None):
        """Create the network and initialise its parameters.

        Args:
            layer_sizes: Neurons per layer, ``[input, hidden1, ..., output]``.
            activation: Hidden-layer activation: ``'relu'`` or ``'tanh'``;
                any other value selects sigmoid.
            learning_rate: Step size used by ``update_parameters``.
            random_state: Optional integer seed.  When given, weight
                initialisation and mini-batch shuffling use a private
                ``numpy.random.RandomState`` so runs are reproducible.  When
                ``None`` (default) the global NumPy random state is used.
        """
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.activation = activation
        self.random_state = random_state
        self._rng = (None if random_state is None
                     else np.random.RandomState(random_state))

        # Variance scaling by fan-in: He initialisation (2 / fan_in) for ReLU,
        # 1 / fan_in otherwise.  Biases start at zero.
        gain = 2.0 if activation == 'relu' else 1.0
        randn = self._random_source().randn
        self.weights = []
        self.biases = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.weights.append(randn(fan_in, fan_out) * np.sqrt(gain / fan_in))
            self.biases.append(np.zeros((1, fan_out)))

    def _random_source(self):
        """Return the private RNG if seeded, else the global ``np.random``."""
        return np.random if self._rng is None else self._rng

    def _hidden_activation(self):
        """Return ``(function, derivative)`` for the hidden-layer activation."""
        if self.activation == 'relu':
            return self.relu, self.relu_derivative
        if self.activation == 'tanh':
            return self.tanh, self.tanh_derivative
        # Any other name falls back to sigmoid.
        return self.sigmoid, self.sigmoid_derivative

    def _prepare_targets(self, y):
        """Bring targets into the shape of the output layer.

        Binary networks get a column vector.  Multi-class networks accept
        either one-hot rows (returned unchanged) or a 1-D vector of integer
        class labels, which is one-hot encoded here.
        """
        y = np.asarray(y)
        n_outputs = self.layer_sizes[-1]
        if n_outputs > 1:
            if y.ndim == 1:
                return np.eye(n_outputs)[y.astype(int)]
            return y
        return y.reshape(-1, 1)

    def sigmoid(self, x):
        """Sigmoid activation; the input is clipped to avoid overflow in exp."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def sigmoid_derivative(self, x):
        """Derivative of sigmoid evaluated at pre-activation ``x``."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def relu(self, x):
        """ReLU activation function."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Derivative of ReLU: 1 where ``x > 0``, else 0."""
        return (x > 0).astype(float)

    def tanh(self, x):
        """Tanh activation function."""
        return np.tanh(x)

    def tanh_derivative(self, x):
        """Derivative of tanh evaluated at pre-activation ``x``."""
        return 1 - np.tanh(x)**2

    def softmax(self, x):
        """Row-wise softmax; the row maximum is subtracted for stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward_propagation(self, X):
        """Run a forward pass and cache the intermediate values.

        Stores every layer's input/output in ``self.activations`` and every
        pre-activation in ``self.z_values`` (both are read by
        ``backward_propagation``).

        Returns:
            The output-layer activations for ``X``.
        """
        hidden_fn, _ = self._hidden_activation()
        last = len(self.weights) - 1
        self.activations = [X]
        self.z_values = []
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(self.activations[-1], w) + b
            self.z_values.append(z)
            if i == last:
                # Output layer: softmax for multi-class, sigmoid for binary.
                a = self.softmax(z) if self.layer_sizes[-1] > 1 else self.sigmoid(z)
            else:
                a = hidden_fn(z)
            self.activations.append(a)
        return self.activations[-1]

    def backward_propagation(self, X, y):
        """Compute loss gradients for the batch of the last forward pass.

        ``forward_propagation`` must have been called with the same ``X``
        immediately before, because the cached activations are used.

        Args:
            X: Input batch (only its row count is read here).
            y: Targets; see ``_prepare_targets`` for accepted forms.

        Returns:
            ``(dW, db)``: lists of gradients aligned with ``self.weights``
            and ``self.biases``.
        """
        m = X.shape[0]
        _, hidden_deriv = self._hidden_activation()
        dW = [None] * len(self.weights)
        db = [None] * len(self.biases)

        # For sigmoid/softmax outputs paired with cross-entropy, the gradient
        # with respect to the output pre-activation is (prediction - target).
        delta = self.activations[-1] - self._prepare_targets(y)

        for i in reversed(range(len(self.weights))):
            dW[i] = np.dot(self.activations[i].T, delta) / m
            db[i] = np.sum(delta, axis=0, keepdims=True) / m
            if i > 0:
                # Push the error through layer i's weights, then through the
                # activation of the layer below.
                delta = (np.dot(delta, self.weights[i].T)
                         * hidden_deriv(self.z_values[i - 1]))
        return dW, db

    def update_parameters(self, dW, db):
        """Apply one gradient-descent step to all weights and biases."""
        for i, (grad_w, grad_b) in enumerate(zip(dW, db)):
            self.weights[i] -= self.learning_rate * grad_w
            self.biases[i] -= self.learning_rate * grad_b

    def compute_loss(self, y_true, y_pred):
        """Return the mean cross-entropy between targets and predictions.

        A small epsilon inside the logarithms guards against ``log(0)``.
        """
        eps = 1e-8
        y_true = self._prepare_targets(y_true)
        m = y_true.shape[0]
        if self.layer_sizes[-1] > 1:  # Multi-class cross-entropy
            return -np.sum(y_true * np.log(y_pred + eps)) / m
        # Binary cross-entropy
        return -np.mean(y_true * np.log(y_pred + eps) +
                        (1 - y_true) * np.log(1 - y_pred + eps))

    def train(self, X, y, epochs=100, batch_size=32, verbose=True):
        """Train with shuffled mini-batch gradient descent.

        Args:
            X: Training inputs, one sample per row.
            y: Training targets aligned with ``X``.
            epochs: Number of passes over the data.
            batch_size: Samples per mini-batch (the last batch may be smaller).
            verbose: Print the average loss every 10 epochs.

        Returns:
            List with the average mini-batch loss of every epoch.

        Raises:
            ValueError: If ``X`` has no samples or ``batch_size`` is below 1.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot train on an empty dataset")
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size}"
            )

        rng = self._random_source()
        losses = []
        for epoch in range(epochs):
            # Visit the samples in a fresh random order every epoch.
            order = rng.permutation(n_samples)
            X_shuffled = X[order]
            y_shuffled = y[order]

            epoch_loss = 0
            n_batches = 0
            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                # Loss is measured before this batch's update is applied.
                y_pred = self.forward_propagation(X_batch)
                epoch_loss += self.compute_loss(y_batch, y_pred)
                n_batches += 1

                dW, db = self.backward_propagation(X_batch, y_batch)
                self.update_parameters(dW, db)

            avg_loss = epoch_loss / n_batches
            losses.append(avg_loss)
            if verbose and epoch % 10 == 0:
                print(f"Epoch {epoch:3d}/{epochs}, Loss: {avg_loss:.4f}")
        return losses

    def predict(self, X):
        """Return hard class predictions (argmax, or threshold 0.5 if binary)."""
        output = self.forward_propagation(X)
        if self.layer_sizes[-1] > 1:  # Multi-class
            return np.argmax(output, axis=1)
        return (output > 0.5).astype(int).flatten()

    def predict_proba(self, X):
        """Return the raw output-layer probabilities for ``X``."""
        return self.forward_propagation(X)
# Demonstrate neural network on different datasets
def demonstrate_neural_network():
    """Plot decision boundaries of four network depths on three toy datasets.

    Builds a linearly separable set, two moons and two concentric circles
    (200 samples each), standardizes the features, trains one
    ``NeuralNetworkFromScratch`` per architecture on a 70/30 split, and draws
    a grid of subplots: one row per dataset, one column per architecture.
    Each panel shows the predicted regions, all samples, and the test accuracy.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    datasets = {
        'Linear': make_classification(n_samples=200, n_features=2,
                                      n_redundant=0, n_informative=2,
                                      n_clusters_per_class=1, random_state=42),
        'Moons': make_moons(n_samples=200, noise=0.2, random_state=42),
        'Circles': make_circles(n_samples=200, noise=0.1, factor=0.5, random_state=42)
    }
    # (layer sizes, panel label) for every network that is compared.
    architectures = [
        ([2, 1], 'No Hidden'),
        ([2, 4, 1], '1 Hidden (4)'),
        ([2, 8, 4, 1], '2 Hidden (8,4)'),
        ([2, 16, 8, 4, 1], '3 Hidden (16,8,4)')
    ]
    h = 0.02  # grid resolution of the decision-boundary mesh

    _, axes = plt.subplots(len(datasets), len(architectures), figsize=(16, 12))
    for row, (name, (X, y)) in enumerate(datasets.items()):
        # Standardize features
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # The mesh depends only on the dataset, so build it once per row
        # instead of once per architecture.
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        grid_points = np.c_[xx.ravel(), yy.ravel()]

        for col, (arch, arch_name) in enumerate(architectures):
            ax = axes[row, col]

            # Create and train network
            nn = NeuralNetworkFromScratch(arch, activation='relu', learning_rate=0.1)
            nn.train(X_train, y_train, epochs=100, verbose=False)

            # Held-out accuracy and predicted class for every mesh point
            accuracy = accuracy_score(y_test, nn.predict(X_test))
            Z = nn.predict(grid_points).reshape(xx.shape)

            # Plot decision boundary
            ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
            ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis',
                       edgecolor='black', linewidth=0.5, s=30)
            ax.set_title(f'{name}\n{arch_name}\nAcc: {accuracy:.2f}')
            ax.set_xlabel('Feature 1')
            ax.set_ylabel('Feature 2')
            ax.grid(True, alpha=0.3)

    plt.suptitle('Neural Network Decision Boundaries', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()
# Section header followed by the architecture comparison demo.
print(
    "\n" + "=" * 60,
    "NEURAL NETWORK FROM SCRATCH",
    "=" * 60,
    "\nDemonstrating neural networks on different datasets:",
    sep="\n",
)
demonstrate_neural_network()
class ActivationFunctionsAnalysis:
    """Plot common activation functions and compare them on a toy problem."""

    def __init__(self):
        # Placeholder container; the methods below do not write to it.
        self.functions = {}

    @staticmethod
    def _activation_table():
        """Return ``{name: {'func': f, 'deriv': f_prime}}`` in plotting order.

        Every callable takes a NumPy array and returns an array of the same
        shape.
        """
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        def sigmoid_deriv(x):
            s = sigmoid(x)
            return s * (1 - s)

        def swish_deriv(x):
            # d/dx [x * s(x)] = s(x) + x * s(x) * (1 - s(x))
            s = sigmoid(x)
            return s + x * s * (1 - s)

        return {
            'Sigmoid': {
                'func': sigmoid,
                'deriv': sigmoid_deriv
            },
            'Tanh': {
                'func': np.tanh,
                'deriv': lambda x: 1 - np.tanh(x)**2
            },
            'ReLU': {
                'func': lambda x: np.maximum(0, x),
                'deriv': lambda x: (x > 0).astype(float)
            },
            'Leaky ReLU': {
                'func': lambda x: np.where(x > 0, x, 0.01 * x),
                'deriv': lambda x: np.where(x > 0, 1, 0.01)
            },
            'ELU': {
                'func': lambda x: np.where(x > 0, x, np.exp(x) - 1),
                'deriv': lambda x: np.where(x > 0, 1, np.exp(x))
            },
            'Swish': {
                'func': lambda x: x * sigmoid(x),
                'deriv': swish_deriv
            }
        }

    def visualize_activations(self):
        """Draw each activation (top row) above its derivative (bottom row).

        Functions are evaluated on 1000 points in ``[-5, 5]``.  The figure is
        shown with ``plt.show()``; nothing is returned.
        """
        x = np.linspace(-5, 5, 1000)
        activations = self._activation_table()
        n_funcs = len(activations)

        # One column per activation, 3 inches wide each.
        _, axes = plt.subplots(2, n_funcs, figsize=(3 * n_funcs, 8))
        for idx, (name, funcs) in enumerate(activations.items()):
            panels = (
                (axes[0, idx], funcs['func'](x), 'blue', name, 'f(x)', (-2, 2)),
                (axes[1, idx], funcs['deriv'](x), 'red', f"{name} Derivative",
                 "f'(x)", (-0.5, 1.5)),
            )
            for ax, values, color, title, ylabel, (low, high) in panels:
                ax.plot(x, values, linewidth=2, color=color)
                # Faint axes through the origin for orientation.
                ax.axhline(y=0, color='k', linewidth=0.5, alpha=0.3)
                ax.axvline(x=0, color='k', linewidth=0.5, alpha=0.3)
                ax.set_title(title)
                ax.set_xlabel('x')
                ax.set_ylabel(ylabel)
                ax.grid(True, alpha=0.3)
                ax.set_ylim(low, high)

        plt.suptitle('Activation Functions and Their Derivatives', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

    def compare_activations_performance(self):
        """Train the same network with sigmoid, tanh and ReLU on two moons.

        For every activation a ``[2, 16, 8, 1]`` network is trained for 100
        epochs; its learning curve (top row) and decision boundary over the
        test samples (bottom row) are plotted.

        Returns:
            Dict mapping activation name to ``{'losses': [...], 'accuracy': float}``.
        """
        # Generate non-linear dataset
        X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # The evaluation mesh is the same for every activation: build it once.
        h = 0.02
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        grid_points = np.c_[xx.ravel(), yy.ravel()]

        activations = ['sigmoid', 'tanh', 'relu']
        results = {}
        _, axes = plt.subplots(2, len(activations),
                               figsize=(5 * len(activations), 10))
        for idx, activation in enumerate(activations):
            curve_ax, boundary_ax = axes[0, idx], axes[1, idx]

            # Train network
            nn = NeuralNetworkFromScratch([2, 16, 8, 1],
                                          activation=activation,
                                          learning_rate=0.1)
            losses = nn.train(X_train, y_train, epochs=100, verbose=False)

            # Evaluate
            accuracy = accuracy_score(y_test, nn.predict(X_test))
            results[activation] = {'losses': losses, 'accuracy': accuracy}

            # Plot learning curve
            curve_ax.plot(losses, linewidth=2)
            curve_ax.set_xlabel('Epoch')
            curve_ax.set_ylabel('Loss')
            curve_ax.set_title(f'{activation.capitalize()}\nFinal Acc: {accuracy:.3f}')
            curve_ax.grid(True, alpha=0.3)

            # Plot decision boundary
            Z = nn.predict(grid_points).reshape(xx.shape)
            boundary_ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
            boundary_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                                cmap='viridis', edgecolor='black', linewidth=0.5, s=30)
            boundary_ax.set_title('Decision Boundary')
            boundary_ax.set_xlabel('Feature 1')
            boundary_ax.set_ylabel('Feature 2')
            boundary_ax.grid(True, alpha=0.3)

        plt.suptitle('Activation Function Performance Comparison', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
        return results
# Activation functions analysis: plot the functions, then compare how they train.
activation_analyzer = ActivationFunctionsAnalysis()
print("\n" + "=" * 60, "ACTIVATION FUNCTIONS ANALYSIS", "=" * 60, sep="\n")

print("\n1. Visualizing Activation Functions:")
activation_analyzer.visualize_activations()

print("\n2. Comparing Activation Performance:")
# Keep the per-activation losses/accuracy available at module level.
activation_results = activation_analyzer.compare_activations_performance()
class BackpropagationVisualization:
    """Walk through backpropagation by hand and plot gradient magnitudes."""

    def __init__(self):
        # Placeholder container; the methods below do not write to it.
        self.gradients = {}

    def simple_backprop_example(self):
        """Print one forward/backward pass of a 2-2-1 sigmoid network.

        The global NumPy RNG is seeded with 42 so the printed numbers are the
        same on every run.  A single sample ``[0.5, 0.3]`` with target 1 is
        pushed through the network, the binary cross-entropy is computed, the
        gradients are derived layer by layer with the chain rule, and one
        gradient-descent step (learning rate 0.1) is shown.

        Returns:
            Dict with ``'forward'`` (``z1``, ``a1``, ``z2``, ``a2``, ``loss``)
            and ``'backward'`` (``dW1``, ``db1``, ``dW2``, ``db2``).
        """
        rule = "=" * 40

        def sigmoid(z):
            return 1 / (1 + np.exp(-z))

        print("\nSimple Backpropagation Example:")
        print("-" * 40)

        # Simple 2-2-1 network; reseeding the global RNG is intentional so
        # that everything drawn after this point is reproducible too.
        np.random.seed(42)

        # Input
        X = np.array([[0.5, 0.3]])
        y = np.array([[1]])

        # Initialize weights
        W1 = np.random.randn(2, 2) * 0.5
        b1 = np.zeros((1, 2))
        W2 = np.random.randn(2, 1) * 0.5
        b2 = np.zeros((1, 1))

        print("Network Architecture: 2 -> 2 -> 1")
        print(f"\nInput: {X}")
        print(f"Target: {y}")
        print("\nInitial Weights:")
        print(f"W1:\n{W1}")
        print(f"b1: {b1}")
        print(f"W2:\n{W2}")
        print(f"b2: {b2}")

        # Forward pass
        print("\n" + rule)
        print("FORWARD PASS")
        print(rule)

        # Layer 1
        z1 = np.dot(X, W1) + b1
        a1 = sigmoid(z1)
        print("\nLayer 1:")
        print(f" z1 = X @ W1 + b1 = {z1}")
        print(f" a1 = sigmoid(z1) = {a1}")

        # Layer 2
        z2 = np.dot(a1, W2) + b2
        a2 = sigmoid(z2)
        print("\nLayer 2:")
        print(f" z2 = a1 @ W2 + b2 = {z2}")
        print(f" a2 = sigmoid(z2) = {a2}")

        # Loss
        loss = -np.mean(y * np.log(a2) + (1 - y) * np.log(1 - a2))
        print(f"\nLoss (Binary Cross-Entropy): {loss:.4f}")

        # Backward pass
        print("\n" + rule)
        print("BACKWARD PASS")
        print(rule)

        # Output layer gradients: dL/da2, then through the sigmoid to dL/dz2.
        da2 = -(y / a2 - (1 - y) / (1 - a2))
        dz2 = da2 * a2 * (1 - a2)  # Sigmoid derivative
        dW2 = np.dot(a1.T, dz2)
        db2 = np.sum(dz2, axis=0, keepdims=True)
        print("\nOutput Layer Gradients:")
        print(f" da2 = ∂L/∂a2 = {da2}")
        print(f" dz2 = da2 * sigmoid'(z2) = {dz2}")
        print(f" dW2 = a1.T @ dz2 = {dW2}")
        print(f" db2 = sum(dz2) = {db2}")

        # Hidden layer gradients: propagate dz2 back through W2.
        da1 = np.dot(dz2, W2.T)
        dz1 = da1 * a1 * (1 - a1)  # Sigmoid derivative
        dW1 = np.dot(X.T, dz1)
        db1 = np.sum(dz1, axis=0, keepdims=True)
        print("\nHidden Layer Gradients:")
        print(f" da1 = dz2 @ W2.T = {da1}")
        print(f" dz1 = da1 * sigmoid'(z1) = {dz1}")
        print(f" dW1 = X.T @ dz1 = {dW1}")
        print(f" db1 = sum(dz1) = {db1}")

        # Update weights
        learning_rate = 0.1
        W1_new = W1 - learning_rate * dW1
        b1_new = b1 - learning_rate * db1
        W2_new = W2 - learning_rate * dW2
        b2_new = b2 - learning_rate * db2
        print("\n" + rule)
        print(f"WEIGHT UPDATE (learning_rate={learning_rate})")
        print(rule)
        print(f"\nUpdated W1:\n{W1_new}")
        print(f"Updated b1: {b1_new}")
        print(f"Updated W2:\n{W2_new}")
        print(f"Updated b2: {b2_new}")

        return {
            'forward': {'z1': z1, 'a1': a1, 'z2': z2, 'a2': a2, 'loss': loss},
            'backward': {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2}
        }

    def gradient_flow_visualization(self):
        """Plot how large each layer's weight gradient is during training.

        A ``[2, 4, 3, 1]`` ReLU network is trained with 50 full-batch steps on
        100 random points labelled by ``x0 + x1 > 0``.  The left panel shows
        the gradient norm of every layer per step, the right panel the norms
        after the last step.  Both panels name layers ``Layer 1``, ``Layer 2``,
        and so on.  The figure is shown with ``plt.show()``.
        """
        # Create sample network (before drawing data, so the order in which
        # random numbers are consumed stays fixed).
        nn = NeuralNetworkFromScratch([2, 4, 3, 1], activation='relu')

        # Generate sample data
        X = np.random.randn(100, 2)
        y = (X[:, 0] + X[:, 1] > 0).astype(int)

        # One history of gradient norms per weight matrix.
        gradient_norms = [[] for _ in nn.weights]
        for _ in range(50):
            nn.forward_propagation(X)
            dW, db = nn.backward_propagation(X, y)
            for history, grad in zip(gradient_norms, dW):
                history.append(np.linalg.norm(grad))
            nn.update_parameters(dW, db)

        layer_labels = [f'Layer {i + 1}' for i in range(len(gradient_norms))]

        # Visualize gradient flow
        _, (history_ax, final_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # Gradient norms over time
        for label, history in zip(layer_labels, gradient_norms):
            history_ax.plot(history, label=label, linewidth=2)
        history_ax.set_xlabel('Epoch')
        history_ax.set_ylabel('Gradient Norm')
        history_ax.set_title('Gradient Flow During Training')
        history_ax.legend()
        history_ax.grid(True, alpha=0.3)

        # Final gradient magnitudes, labelled like the legend on the left.
        final_gradients = [history[-1] for history in gradient_norms]
        positions = list(range(len(final_gradients)))
        final_ax.bar(positions, final_gradients, color='steelblue')
        final_ax.set_xticks(positions)
        final_ax.set_xticklabels(layer_labels)
        final_ax.set_xlabel('Layer')
        final_ax.set_ylabel('Final Gradient Norm')
        final_ax.set_title('Final Gradient Magnitudes by Layer')
        final_ax.grid(True, alpha=0.3, axis='y')

        plt.suptitle('Gradient Flow Analysis', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()
# Backpropagation visualization: hand-worked example first, then gradient plots.
backprop_viz = BackpropagationVisualization()
print("\n" + "=" * 60, "UNDERSTANDING BACKPROPAGATION", "=" * 60, sep="\n")

# Step-by-step pass through a tiny network; the returned values stay
# available at module level.
backprop_results = backprop_viz.simple_backprop_example()

print("\n3. Gradient Flow Visualization:")
backprop_viz.gradient_flow_visualization()
# Closing section: print a banner, then two reference texts (guidelines and
# common pitfalls). Both strings are runtime output and are emitted verbatim.
print("\n" + "="*60)
print("NEURAL NETWORK BEST PRACTICES")
print("="*60)
# Practical guidelines grouped by topic (initialization, activations,
# architecture, training, regularization, troubleshooting, tuning).
best_practices = """
KEY GUIDELINES:
1. INITIALIZATION:
• Xavier/Glorot: For sigmoid/tanh
• He: For ReLU variants
• Avoid zero initialization
• Consider batch normalization
2. ACTIVATION FUNCTIONS:
• Hidden layers: ReLU, Leaky ReLU, ELU
• Output: Sigmoid (binary), Softmax (multi-class)
• Avoid sigmoid/tanh in deep networks (vanishing gradient)
3. ARCHITECTURE DESIGN:
• Start simple, increase complexity
• Pyramid shape often works (decreasing width)
• Depth vs width trade-off
• Consider skip connections for very deep networks
4. TRAINING TIPS:
• Normalize/standardize inputs
• Use appropriate loss function
• Monitor training and validation loss
• Early stopping to prevent overfitting
• Learning rate scheduling
5. REGULARIZATION:
• L1/L2 weight regularization
• Dropout layers
• Data augmentation
• Batch normalization
6. COMMON PROBLEMS & SOLUTIONS:
Vanishing Gradient:
• Use ReLU activation
• Batch normalization
• Residual connections
Exploding Gradient:
• Gradient clipping
• Proper weight initialization
• Lower learning rate
Overfitting:
• More data
• Regularization
• Simpler architecture
• Dropout
Underfitting:
• More complex model
• Train longer
• Better features
• Reduce regularization
7. HYPERPARAMETER TUNING:
• Learning rate: Most important
• Architecture: Layers, neurons
• Batch size: Memory vs convergence
• Regularization strength
"""
print(best_practices)
# Common pitfalls: a ten-item checklist of frequent mistakes, printed last.
pitfalls = """
COMMON PITFALLS TO AVOID:
1. Not normalizing inputs
2. Using wrong loss function
3. Learning rate too high/low
4. Not checking for NaN/Inf
5. Forgetting to shuffle data
6. Testing on training data
7. Ignoring class imbalance
8. Not setting random seeds
9. Improper train/val/test split
10. Not monitoring gradients
"""
print(pitfalls)
Extend the neural network with advanced optimizers:
Add regularization to prevent overfitting:
Build a network for multi-class problems: