Master the fundamental optimization algorithm for ML
Gradient Descent is the core optimization algorithm in machine learning. It iteratively adjusts parameters to minimize a cost function by moving in the direction of steepest descent.

**Update rule:** θ ← θ − α·∇J(θ)

Where:

- θ — the parameters (weights and bias)
- α — the learning rate (step size)
- ∇J(θ) — the gradient of the cost function with respect to θ
Compare different GD approaches:
import numpy as np
import matplotlib.pyplot as plt
# Batch Gradient Descent
def batch_gd(X, y, learning_rate=0.01, n_iterations=100):
    """Fit a linear model y ≈ X·w + b by full-batch gradient descent on MSE.

    Every update uses the gradient computed over ALL samples, so each
    iteration is one exact (deterministic) descent step.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    learning_rate : float, step size α.
    n_iterations : int, number of full-gradient updates.

    Returns
    -------
    (weights, bias, costs) where costs[i] is the MSE *before* update i.
    """
    m, d = X.shape
    weights = np.zeros(d)
    bias = 0.0
    history = []
    for _ in range(n_iterations):
        # Residuals of the current model on the whole dataset.
        residual = X @ weights + bias - y
        # Cost is recorded before the parameters move (matches per-step MSE).
        history.append(np.mean(residual ** 2))
        # d/dw MSE = (2/m) Xᵀ r ;  d/db MSE = (2/m) Σ r
        weights -= learning_rate * (2.0 / m) * (X.T @ residual)
        bias -= learning_rate * (2.0 / m) * residual.sum()
    return weights, bias, history
# Stochastic Gradient Descent
def stochastic_gd(X, y, learning_rate=0.01, n_iterations=100):
    """Fit a linear model y ≈ X·w + b with SGD (one sample per update).

    Fix over the original: samples are now visited in a fresh random order
    each epoch (standard SGD practice, and consistent with minibatch_gd).
    The previous fixed visiting order correlates successive updates and can
    bias convergence. Uses the global NumPy RNG; seed np.random for
    reproducible runs.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    learning_rate : float, per-sample step size α.
    n_iterations : int, number of epochs (full passes over the data).

    Returns
    -------
    (w, b, costs) where costs[i] is the full-dataset MSE after epoch i.
    """
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    costs = []
    for _ in range(n_iterations):
        # Shuffle the visiting order every epoch to decorrelate updates.
        for idx in np.random.permutation(n_samples):
            xi = X[idx:idx + 1]            # keep 2-D shape (1, n_features)
            yi = y[idx:idx + 1]
            err = np.dot(xi, w) + b - yi   # shape (1,)
            # Per-sample gradients (no 1/n factor — single sample).
            w -= learning_rate * 2.0 * np.dot(xi.T, err)
            b -= learning_rate * 2.0 * err[0]
        # Track progress on the full dataset once per epoch.
        costs.append(np.mean((np.dot(X, w) + b - y) ** 2))
    return w, b, costs
# Mini-batch Gradient Descent
def minibatch_gd(X, y, learning_rate=0.01, n_iterations=100, batch_size=32):
    """Fit a linear model y ≈ X·w + b with mini-batch gradient descent.

    Each epoch shuffles the data once, then steps through it in chunks of
    `batch_size` (the final chunk may be smaller). Uses the global NumPy
    RNG for the shuffle; seed np.random for reproducible runs.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    learning_rate : float, step size α.
    n_iterations : int, number of epochs.
    batch_size : int, samples per gradient step.

    Returns
    -------
    (w, b, costs) where costs[i] is the full-dataset MSE after epoch i.
    """
    m, d = X.shape
    w = np.zeros(d)
    b = 0.0
    history = []
    for _ in range(n_iterations):
        # One shuffle per epoch (single RNG draw, as in the original).
        order = np.random.permutation(m)
        X_shuf, y_shuf = X[order], y[order]
        for lo in range(0, m, batch_size):
            Xb = X_shuf[lo:lo + batch_size]   # slicing clips past the end
            yb = y_shuf[lo:lo + batch_size]
            err = Xb @ w + b - yb
            k = len(Xb)                        # actual (possibly short) batch size
            w -= learning_rate * (2.0 / k) * (Xb.T @ err)
            b -= learning_rate * (2.0 / k) * err.sum()
        # Track progress on the full dataset once per epoch.
        history.append(np.mean((X @ w + b - y) ** 2))
    return w, b, history
# --- Demo: compare the three optimizers on synthetic linear data ---
# True model: y = 3x + 2 plus Gaussian noise (σ = 0.5); all three methods
# should recover w ≈ 3 and b ≈ 2.
np.random.seed(42)
X = np.random.randn(1000, 1)
y = 3 * X.squeeze() + 2 + np.random.randn(1000) * 0.5

w1, b1, costs1 = batch_gd(X, y)
w2, b2, costs2 = stochastic_gd(X, y, learning_rate=0.001)
w3, b3, costs3 = minibatch_gd(X, y, batch_size=32)

print(f"Batch GD - w: {w1[0]:.4f}, b: {b1:.4f}, Final Cost: {costs1[-1]:.4f}")
print(f"Stochastic GD - w: {w2[0]:.4f}, b: {b2:.4f}, Final Cost: {costs2[-1]:.4f}")
print(f"Mini-batch GD - w: {w3[0]:.4f}, b: {b3:.4f}, Final Cost: {costs3[-1]:.4f}")
# Sample output (seed 42):
#   Batch GD - w: 2.9987, b: 2.0034, Final Cost: 0.2489
#   Stochastic GD - w: 2.9912, b: 1.9967, Final Cost: 0.2501
#   Mini-batch GD - w: 2.9976, b: 2.0018, Final Cost: 0.2492