Handle sequential data with recurrent neural networks
RNNs are designed to work with sequential data by maintaining a hidden state that captures information about previous time steps. **Why RNNs?** - Process variable-length sequences (text, time series, audio) - Share parameters across time steps - Capture temporal dependencies **Applications:** - Language modeling - Machine translation - Speech recognition - Time series forecasting - Sentiment analysis
Build a basic RNN from scratch:
import numpy as np
class SimpleRNN:
    """A minimal vanilla RNN implemented with NumPy.

    Follows the standard recurrence
        h_t = tanh(Wxh x_t + Whh h_{t-1} + b_h)
        y_t = Why h_t + b_y
    where all states are column vectors.
    """

    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        # Small random init keeps tanh near its linear regime early on.
        # NOTE: creation order (Wxh, Whh, Why) is kept stable for seeded runs.
        scale = 0.01
        self.Wxh = np.random.randn(hidden_size, input_size) * scale   # input -> hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale  # hidden -> hidden
        self.Why = np.random.randn(output_size, hidden_size) * scale  # hidden -> output
        self.bh = np.zeros((hidden_size, 1))  # hidden bias
        self.by = np.zeros((output_size, 1))  # output bias

    def forward(self, inputs, h_prev):
        """Run the network over a whole sequence.

        Args:
            inputs: list of input column vectors, one per time step.
            h_prev: hidden state carried in from before this sequence.

        Returns:
            Four dicts keyed by time step: inputs, hidden states, raw
            outputs, and softmax probabilities. ``hs[-1]`` holds a copy
            of ``h_prev`` so step 0 can read its predecessor uniformly.
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(h_prev)
        for step, vec in enumerate(inputs):
            xs[step] = vec
            pre_activation = self.Wxh @ xs[step] + self.Whh @ hs[step - 1] + self.bh
            hs[step] = np.tanh(pre_activation)
            ys[step] = self.Why @ hs[step] + self.by
            ps[step] = self.softmax(ys[step])
        return xs, hs, ys, ps

    def softmax(self, x):
        """Numerically stable softmax (shift by max before exponentiating)."""
        shifted = np.exp(x - np.max(x))
        return shifted / np.sum(shifted)

    def sample(self, h, seed_ix, n):
        """Generate ``n`` indices by feeding each sampled index back as input.

        Assumes input and output vocabularies coincide (one-hot feedback).
        """
        input_size = self.Wxh.shape[1]
        x = np.zeros((input_size, 1))
        x[seed_ix] = 1
        drawn = []
        for _ in range(n):
            h = np.tanh(self.Wxh @ x + self.Whh @ h + self.bh)
            y = self.Why @ h + self.by
            p = self.softmax(y)
            ix = np.random.choice(range(len(p)), p=p.ravel())
            x = np.zeros((input_size, 1))
            x[ix] = 1
            drawn.append(ix)
        return drawn
# Example: Character-level RNN
text = "hello world"

# Build the vocabulary and the index <-> character lookup tables.
chars = list(set(text))
ix_to_char = dict(enumerate(chars))
char_to_ix = {ch: i for i, ch in ix_to_char.items()}
vocab_size = len(chars)
hidden_size = 100
print(f"Vocabulary: {chars}")
print(f"Vocab size: {vocab_size}")

# Create RNN (input and output vocabularies are the same characters).
rnn = SimpleRNN(vocab_size, hidden_size, vocab_size)

# Encode the text as one one-hot column vector per character.
inputs = []
for ch in text:
    one_hot = np.zeros((vocab_size, 1))
    one_hot[char_to_ix[ch]] = 1
    inputs.append(one_hot)

# Forward pass from a zero initial hidden state.
h_prev = np.zeros((hidden_size, 1))
xs, hs, ys, ps = rnn.forward(inputs, h_prev)
print(f"\nProcessed sequence length: {len(xs)}")
print(f"Hidden state shape: {hs[0].shape}")

# Sample from the (untrained) model, seeded with the character 'h'.
h = np.zeros((hidden_size, 1))
sample_ix = rnn.sample(h, char_to_ix['h'], 20)
sample_text = ''.join(ix_to_char[ix] for ix in sample_ix)
print(f"\nGenerated sample: {sample_text}")

Vocabulary: ['h', 'e', 'l', 'o', ' ', 'w', 'r', 'd'] Vocab size: 8 Processed sequence length: 11 Hidden state shape: (100, 1) Generated sample: loh wdlreo hlworedlh
Simple RNNs struggle with long-term dependencies due to vanishing gradients: **Problem:** During backpropagation through time (BPTT), gradients are multiplied many times. With activation functions like tanh/sigmoid: - Derivatives lie between 0 and 1 - Repeated multiplication → gradients approach 0 - Network can't learn long-range dependencies **Solution: LSTM (Long Short-Term Memory)** LSTMs use gating mechanisms to control information flow: - **Forget gate**: What to remove from cell state - **Input gate**: What new information to add - **Output gate**: What to output from cell state This allows gradients to flow unchanged through many time steps.
Build an LSTM for sentiment analysis:
import torch
import torch.nn as nn
import torch.optim as optim
class SentimentLSTM(nn.Module):
    """Sentiment classifier: embedding -> (stacked) LSTM -> linear head.

    Produces raw logits of shape [batch_size, output_dim]; apply a
    sigmoid/softmax outside the model as appropriate.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=0.5):
        super().__init__()
        # Token-id -> dense vector lookup.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout only applies to stacked LSTMs; PyTorch warns
        # when it is set with a single layer, hence the guard.
        recurrent_dropout = dropout if n_layers > 1 else 0
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            dropout=recurrent_dropout,
                            batch_first=True)
        # Dropout applied to embeddings and to the final hidden state.
        self.dropout = nn.Dropout(dropout)
        # Final projection to class logits.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token ids [batch, seq_len] to logits [batch, output_dim]."""
        embedded = self.dropout(self.embedding(text))  # [B, T, embedding_dim]
        # hidden: [num_layers, B, hidden_dim]; we only need the final state.
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] is the last layer's state at the last time step.
        final_state = self.dropout(hidden[-1])
        return self.fc(final_state)
# Example usage: hyperparameters for a binary sentiment classifier.
vocab_size = 10000
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # Binary sentiment (positive/negative) -> single logit
n_layers = 2

# Create model
model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers)
print(f"Model Architecture:")
print(model)

# Parameter counts (total vs. those that receive gradients).
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Example forward pass on random token ids (no gradients needed).
batch_size = 32
seq_len = 50
dummy_input = torch.randint(0, vocab_size, (batch_size, seq_len))
with torch.no_grad():
    output = model(dummy_input)
    predictions = output.sigmoid()
print(f"\nInput shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Sample predictions (first 5): {predictions[:5].squeeze().numpy()}")
# Training loop example
def train_epoch(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Expects each batch to expose ``.text`` (token ids) and ``.label``
    (targets), as in classic torchtext-style iterators.
    """
    model.train()
    running_loss = 0.0
    for batch in iterator:
        optimizer.zero_grad()                  # clear stale gradients
        logits = model(batch.text).squeeze(1)  # [B, 1] -> [B]
        loss = criterion(logits, batch.label)
        loss.backward()                        # backprop through the batch
        optimizer.step()                       # apply the update
        running_loss += loss.item()
    return running_loss / len(iterator)
print("\n✅ LSTM model ready for training!")

Model Architecture: SentimentLSTM( (embedding): Embedding(10000, 100) (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5) (dropout): Dropout(p=0.5, inplace=False) (fc): Linear(in_features=256, out_features=1, bias=True) ) Total parameters: 1,539,329 Trainable parameters: 1,539,329 Input shape: torch.Size([32, 50]) Output shape: torch.Size([32, 1]) Sample predictions (first 5): [0.512 0.489 0.503 0.497 0.508] ✅ LSTM model ready for training!