ā±ļø 75 min

RNNs and LSTMs

Handle sequential data with recurrent neural networks

Understanding Recurrent Neural Networks

RNNs are designed to work with sequential data by maintaining a hidden state that captures information about previous time steps. **Why RNNs?** - Process variable-length sequences (text, time series, audio) - Share parameters across time steps - Capture temporal dependencies **Applications:** - Language modeling - Machine translation - Speech recognition - Time series forecasting - Sentiment analysis

Simple RNN Implementation

Build a basic RNN from scratch:

python
import numpy as np

class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights
        self.hidden_size = hidden_size
        
        # Input to hidden
        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01
        # Hidden to hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        # Hidden to output
        self.Why = np.random.randn(output_size, hidden_size) * 0.01
        # Biases
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
    
    def forward(self, inputs, h_prev):
        """
        inputs: list of input vectors (one per time step)
        h_prev: previous hidden state
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(h_prev)
        
        # Forward pass through time
        for t, x in enumerate(inputs):
            xs[t] = x
            # Hidden state: h_t = tanh(W_xh * x_t + W_hh * h_{t-1} + b_h)
            hs[t] = np.tanh(np.dot(self.Wxh, xs[t]) + 
                           np.dot(self.Whh, hs[t-1]) + self.bh)
            # Output: y_t = W_hy * h_t + b_y
            ys[t] = np.dot(self.Why, hs[t]) + self.by
            # Softmax probabilities
            ps[t] = self.softmax(ys[t])
        
        return xs, hs, ys, ps
    
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x)
    
    def sample(self, h, seed_ix, n):
        """Generate a sequence by sampling from the network"""
        x = np.zeros((self.Wxh.shape[1], 1))
        x[seed_ix] = 1
        ixes = []
        
        for t in range(n):
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            y = np.dot(self.Why, h) + self.by
            p = self.softmax(y)
            ix = np.random.choice(range(len(p)), p=p.ravel())
            x = np.zeros((self.Wxh.shape[1], 1))
            x[ix] = 1
            ixes.append(ix)
        
        return ixes

# Example: Character-level RNN
text = "hello world"
chars = list(set(text))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

vocab_size = len(chars)
hidden_size = 100

print(f"Vocabulary: {chars}")
print(f"Vocab size: {vocab_size}")

# Create RNN
rnn = SimpleRNN(vocab_size, hidden_size, vocab_size)

# Convert text to input format
inputs = [np.zeros((vocab_size, 1)) for _ in range(len(text))]
for i, ch in enumerate(text):
    inputs[i][char_to_ix[ch]] = 1

# Forward pass
h_prev = np.zeros((hidden_size, 1))
xs, hs, ys, ps = rnn.forward(inputs, h_prev)

print(f"\nProcessed sequence length: {len(xs)}")
print(f"Hidden state shape: {hs[0].shape}")

# Sample from the model
h = np.zeros((hidden_size, 1))
sample_ix = rnn.sample(h, char_to_ix['h'], 20)
sample_text = ''.join([ix_to_char[ix] for ix in sample_ix])
print(f"\nGenerated sample: {sample_text}")
Output:
Vocabulary: ['h', 'e', 'l', 'o', ' ', 'w', 'r', 'd']
Vocab size: 8

Processed sequence length: 11
Hidden state shape: (100, 1)

Generated sample: loh wdlreo hlworedlh

The Vanishing Gradient Problem

Simple RNNs struggle with long-term dependencies due to vanishing gradients: **Problem:** During backpropagation through time (BPTT), gradients are multiplied many times. With activation functions like tanh/sigmoid: - Values between 0 and 1 - Repeated multiplication → gradients approach 0 - Network can't learn long-range dependencies **Solution: LSTM (Long Short-Term Memory)** LSTMs use gating mechanisms to control information flow: - **Forget gate**: What to remove from cell state - **Input gate**: What new information to add - **Output gate**: What to output from cell state This allows gradients to flow unchanged through many time steps.

LSTM with PyTorch

Build an LSTM for sentiment analysis:

python
import torch
import torch.nn as nn
import torch.optim as optim

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, 
                 n_layers=2, dropout=0.5):
        super().__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer(s)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers,
                           dropout=dropout if n_layers > 1 else 0,
                           batch_first=True)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Output layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        # text shape: [batch_size, seq_len]
        
        # Embedding: [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(text))
        
        # LSTM: output [batch_size, seq_len, hidden_dim]
        # hidden/cell: [num_layers, batch_size, hidden_dim]
        output, (hidden, cell) = self.lstm(embedded)
        
        # Use the last hidden state
        # hidden[-1] is the last layer's hidden state
        hidden = self.dropout(hidden[-1])
        
        # Fully connected: [batch_size, output_dim]
        return self.fc(hidden)

# Example usage
vocab_size = 10000
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # Binary sentiment (positive/negative)
n_layers = 2

# Create model
model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers)

print(f"Model Architecture:")
print(model)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Example forward pass
batch_size = 32
seq_len = 50
dummy_input = torch.randint(0, vocab_size, (batch_size, seq_len))

with torch.no_grad():
    output = model(dummy_input)
    predictions = torch.sigmoid(output)

print(f"\nInput shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Sample predictions (first 5): {predictions[:5].squeeze().numpy()}")

# Training loop example
def train_epoch(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
    for batch in iterator:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(batch.text).squeeze(1)
        
        # Calculate loss
        loss = criterion(predictions, batch.label)
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

print("\nāœ“ LSTM model ready for training!")
Output:
Model Architecture:
SentimentLSTM(
  (embedding): Embedding(10000, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

Total parameters: 1,539,329
Trainable parameters: 1,539,329

Input shape: torch.Size([32, 50])
Output shape: torch.Size([32, 1])
Sample predictions (first 5): [0.512 0.489 0.503 0.497 0.508]

āœ“ LSTM model ready for training!
Sharan Initiatives - Making a Difference Together