Master the architecture powering computer vision
Convolutional Neural Networks are specialized for processing grid-like data such as images. They use convolution operations to automatically learn spatial hierarchies of features.

**Key Components:**

1. **Convolutional Layers**: Extract features using learnable filters
2. **Pooling Layers**: Downsample to reduce dimensions
3. **Fully Connected Layers**: Classification at the end

**Why CNNs for Images?**

- Parameter sharing reduces model size
- Translation invariance
- Hierarchical feature learning
Implement CNN in PyTorch:
import torch
import torch.nn as nn
import torch.nn.functional as F
class SimpleCNN(nn.Module):
    """A small 3-block CNN for 32x32 RGB images (e.g. CIFAR-10 sized input).

    Architecture: three conv -> ReLU -> max-pool blocks (channels
    3 -> 32 -> 64 -> 128, each pool halving the spatial size
    32 -> 16 -> 8 -> 4), followed by a two-layer fully connected
    classifier head with dropout.

    Args:
        num_classes: Size of the final logit vector (default 10).
    """

    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        # Convolutional layers: 3x3 kernels with padding=1 preserve H and W,
        # so only the pooling steps change the spatial size.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        # One shared 2x2 max-pool (stride 2) reused after every conv block.
        self.pool = nn.MaxPool2d(2, 2)
        # FC head: 128 channels * 4 * 4 spatial positions after three pools
        # applied to a 32x32 input.
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)
        # Dropout regularizes the FC head (active only in train() mode).
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Input batch of shape [batch, 3, 32, 32].

        Returns:
            Logit tensor of shape [batch, num_classes].
        """
        # Conv block 1
        x = self.pool(F.relu(self.conv1(x)))  # [batch, 32, 16, 16]
        # Conv block 2
        x = self.pool(F.relu(self.conv2(x)))  # [batch, 64, 8, 8]
        # Conv block 3
        x = self.pool(F.relu(self.conv3(x)))  # [batch, 128, 4, 4]
        # Flatten all dims except batch. torch.flatten(x, 1) is safer than
        # x.view(-1, 128 * 4 * 4): with an unexpected input size it raises
        # in fc1 instead of silently mangling the batch dimension.
        x = torch.flatten(x, 1)  # [batch, 2048]
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
# Build the network and inspect its layout.
model = SimpleCNN(num_classes=10)

print("CNN Architecture:")
print(model)

# Parameter bookkeeping: total count vs. the subset the optimizer updates.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad
)
print(f"\nTotal Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")

# Smoke-test the forward pass on one random CIFAR-sized image.
dummy_input = torch.randn(1, 3, 32, 32)
output = model(dummy_input)
print(f"\nInput shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")

Output:

CNN Architecture:
SimpleCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=2048, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

Total Parameters: 1,147,466
Trainable Parameters: 1,147,466

Input shape: torch.Size([1, 3, 32, 32])
Output shape: torch.Size([1, 10])
Visualize how convolution works:
import numpy as np
def convolve2d(image, kernel):
    """Slide `kernel` over `image` and return the valid-mode response map.

    NOTE: like the "convolution" in deep-learning frameworks, this is
    actually cross-correlation — the kernel is not flipped. No padding is
    applied, so a (H, W) image and (kh, kw) kernel yield an output of
    shape (H - kh + 1, W - kw + 1).
    """
    kh, kw = kernel.shape
    out_h = image.shape[0] - kh + 1
    out_w = image.shape[1] - kw + 1
    output = np.zeros((out_h, out_w))
    # Visit every valid anchor position; multiply the window elementwise
    # by the kernel and accumulate the sum.
    for row, col in np.ndindex(out_h, out_w):
        window = image[row:row + kh, col:col + kw]
        output[row, col] = np.sum(window * kernel)
    return output
# 5x5 demo image: a constant left-to-right intensity gradient (+1 per column).
image = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25],
])

# Sobel kernel that responds to vertical edges (horizontal intensity change).
kernel_vertical = np.array([
    [-1, 0, 1],
    [-2, 0, 2],
    [-1, 0, 1],
])

# Run the convolution and show each step of the computation.
result = convolve2d(image, kernel_vertical)

print("Original Image (5x5):")
print(image)
print("\nVertical Edge Detection Kernel (Sobel):")
print(kernel_vertical)
print("\nResult after Convolution (3x3):")
print(result.astype(int))

Output:

Original Image (5x5):
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]
 [21 22 23 24 25]]

Vertical Edge Detection Kernel (Sobel):
[[-1  0  1]
 [-2  0  2]
 [-1  0  1]]

Result after Convolution (3x3):
[[8 8 8]
 [8 8 8]
 [8 8 8]]