⏱️ 50 min

Gradient Descent Optimization

Master the fundamental optimization algorithm for ML

Understanding Gradient Descent

Gradient Descent is the core optimization algorithm in machine learning. It iteratively adjusts parameters to minimize a cost function by moving in the direction of steepest descent, i.e. against the gradient.

**Update Rule:** θ = θ - α * ∇J(θ)

Where:
- θ = parameters (weights/bias)
- α = learning rate
- ∇J(θ) = gradient of the cost function with respect to θ
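
To see the update rule in isolation, here is a minimal sketch on a one-dimensional quadratic cost J(θ) = (θ - 5)², whose gradient is 2(θ - 5). The cost, the target value 5, and the helper name `gradient_step` are illustrative choices, not part of the lesson's dataset:

```python
# Minimal illustration of the update rule: theta = theta - alpha * dJ/dtheta
# Hypothetical cost: J(theta) = (theta - 5)**2, so dJ/dtheta = 2 * (theta - 5)

def gradient_step(theta, grad, learning_rate):
    """Apply one gradient-descent update."""
    return theta - learning_rate * grad

theta = 0.0          # initial parameter
learning_rate = 0.1  # alpha

for step in range(50):
    grad = 2 * (theta - 5)   # gradient of the cost at the current theta
    theta = gradient_step(theta, grad, learning_rate)

print(theta)  # converges toward 5, the minimizer of J
```

Each step shrinks the distance to the minimizer by a constant factor (1 - 2α here), which is the same behavior the full implementations below exhibit on the mean-squared-error cost.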

Implementing Gradient Descent Variants

Compare different GD approaches:

```python
import numpy as np
import matplotlib.pyplot as plt

# Batch Gradient Descent
def batch_gd(X, y, learning_rate=0.01, n_iterations=100):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    costs = []
    
    for i in range(n_iterations):
        y_pred = np.dot(X, w) + b
        cost = np.mean((y_pred - y) ** 2)
        costs.append(cost)
        
        # Gradients using all samples
        dw = (2/n_samples) * np.dot(X.T, (y_pred - y))
        db = (2/n_samples) * np.sum(y_pred - y)
        
        w -= learning_rate * dw
        b -= learning_rate * db
    
    return w, b, costs

# Stochastic Gradient Descent
def stochastic_gd(X, y, learning_rate=0.01, n_iterations=100):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    costs = []
    
    for i in range(n_iterations):
        for idx in range(n_samples):
            xi = X[idx:idx+1]
            yi = y[idx:idx+1]
            
            y_pred = np.dot(xi, w) + b
            dw = 2 * np.dot(xi.T, (y_pred - yi))
            db = 2 * (y_pred - yi)
            
            w -= learning_rate * dw.flatten()
            b -= learning_rate * db[0]
        
        y_pred_all = np.dot(X, w) + b
        cost = np.mean((y_pred_all - y) ** 2)
        costs.append(cost)
    
    return w, b, costs

# Mini-batch Gradient Descent
def minibatch_gd(X, y, learning_rate=0.01, n_iterations=100, batch_size=32):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    costs = []
    
    for i in range(n_iterations):
        indices = np.random.permutation(n_samples)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        
        for start_idx in range(0, n_samples, batch_size):
            end_idx = min(start_idx + batch_size, n_samples)
            X_batch = X_shuffled[start_idx:end_idx]
            y_batch = y_shuffled[start_idx:end_idx]
            
            y_pred = np.dot(X_batch, w) + b
            batch_size_actual = end_idx - start_idx
            
            dw = (2/batch_size_actual) * np.dot(X_batch.T, (y_pred - y_batch))
            db = (2/batch_size_actual) * np.sum(y_pred - y_batch)
            
            w -= learning_rate * dw
            b -= learning_rate * db
        
        y_pred_all = np.dot(X, w) + b
        cost = np.mean((y_pred_all - y) ** 2)
        costs.append(cost)
    
    return w, b, costs

# Test with synthetic data
np.random.seed(42)
X = np.random.randn(1000, 1)
y = 3 * X.squeeze() + 2 + np.random.randn(1000) * 0.5

# Compare all methods
w1, b1, costs1 = batch_gd(X, y)
w2, b2, costs2 = stochastic_gd(X, y, learning_rate=0.001)
w3, b3, costs3 = minibatch_gd(X, y, batch_size=32)

print(f"Batch GD - w: {w1[0]:.4f}, b: {b1:.4f}, Final Cost: {costs1[-1]:.4f}")
print(f"Stochastic GD - w: {w2[0]:.4f}, b: {b2:.4f}, Final Cost: {costs2[-1]:.4f}")
print(f"Mini-batch GD - w: {w3[0]:.4f}, b: {b3:.4f}, Final Cost: {costs3[-1]:.4f}")
```

Output:

```
Batch GD - w: 2.9987, b: 2.0034, Final Cost: 0.2489
Stochastic GD - w: 2.9912, b: 1.9967, Final Cost: 0.2501
Mini-batch GD - w: 2.9976, b: 2.0018, Final Cost: 0.2492
```
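
The `matplotlib` import above suggests visualizing convergence. A minimal sketch, assuming the `costs1`, `costs2`, and `costs3` lists from the comparison above are still in scope, might plot the per-epoch cost of each variant:

```python
# Compare convergence of the three variants (cost recorded once per epoch)
plt.figure(figsize=(8, 5))
plt.plot(costs1, label="Batch GD")
plt.plot(costs2, label="Stochastic GD")
plt.plot(costs3, label="Mini-batch GD")
plt.xlabel("Epoch")
plt.ylabel("MSE cost")
plt.yscale("log")   # log scale makes the differing convergence speeds easier to see
plt.legend()
plt.title("Gradient descent variants: cost per epoch")
plt.show()
```

All three curves flatten near the noise floor of the synthetic data (variance of the 0.5-std noise, roughly 0.25), which is why the final costs printed above are so close to each other.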