# Word Embeddings: Learning Dense Vector Representations of Words
Word embeddings map words to dense vectors where semantic similarity is captured by vector distance.

**Popular Methods:**
- Word2Vec (Skip-gram, CBOW)
- GloVe (Global Vectors)
- FastText

**Key Ideas:**
- Words with similar meanings have similar vectors
- Arithmetic on vectors (king - man + woman ≈ queen)
- Pre-trained embeddings transfer across tasks
Build a skip-gram model:
import numpy as np
from collections import defaultdict
class SimpleWord2Vec:
    """Minimal skip-gram Word2Vec with a full-softmax output layer.

    Learns one input embedding per word (the rows of ``W1``) by predicting
    each word's context words within a fixed window, trained with plain SGD.

    Parameters
    ----------
    sentences : list[list[str]]
        Tokenized corpus used to build the vocabulary.
    embedding_dim : int
        Size of each word vector.
    window : int
        Context radius on each side of the target word.
    lr : float
        SGD learning rate.
    """

    def __init__(self, sentences, embedding_dim=50, window=2, lr=0.025):
        self.embedding_dim = embedding_dim
        self.window = window
        self.lr = lr
        # Build vocabulary: indices assigned in first-seen order, so the
        # mapping is reproducible for a fixed corpus.
        self.vocab = {}
        self.inv_vocab = {}
        for sentence in sentences:
            for word in sentence:
                if word not in self.vocab:
                    idx = len(self.vocab)
                    self.vocab[word] = idx
                    self.inv_vocab[idx] = word
        self.vocab_size = len(self.vocab)
        # Small random init keeps the initial softmax near-uniform.
        self.W1 = np.random.randn(self.vocab_size, embedding_dim) * 0.01  # input embeddings
        self.W2 = np.random.randn(embedding_dim, self.vocab_size) * 0.01  # output weights

    def get_training_pairs(self, sentences):
        """Return all (target, context) word pairs within the window."""
        pairs = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                # Context words within `window` positions, clipped at the
                # sentence boundaries; the target itself is excluded.
                start = max(0, i - self.window)
                end = min(len(sentence), i + self.window + 1)
                for j in range(start, end):
                    if i != j:
                        pairs.append((target_word, sentence[j]))
        return pairs

    def train(self, sentences, epochs=100):
        """Run SGD on the full-softmax skip-gram objective.

        Prints the mean cross-entropy loss every 20 epochs.
        """
        pairs = self.get_training_pairs(sentences)
        for epoch in range(epochs):
            loss = 0.0
            for target, context in pairs:
                target_idx = self.vocab[target]
                context_idx = self.vocab[context]
                # Forward pass: the hidden layer is simply the target's
                # embedding row (one-hot input times W1).
                h = self.W1[target_idx]
                u = np.dot(h, self.W2)
                y_pred = self.softmax(u)
                # Cross-entropy loss for the true context word.
                loss -= np.log(y_pred[context_idx])
                # Backward pass: softmax + cross-entropy gradient is
                # (y_pred - one_hot). Copy so the forward output isn't
                # mutated in place.
                e = y_pred.copy()
                e[context_idx] -= 1
                # BUG FIX: the gradient w.r.t. the embedding must use the
                # *pre-update* W2, so compute it before W2 is modified.
                # (The original updated W2 first, then backpropagated
                # through the already-updated weights.)
                grad_h = np.dot(self.W2, e)
                self.W2 -= self.lr * np.outer(h, e)
                self.W1[target_idx] -= self.lr * grad_h
            if epoch % 20 == 0:
                print(f"Epoch {epoch}, Loss: {loss / len(pairs):.4f}")

    def softmax(self, x):
        """Numerically stable softmax over a 1-D score vector."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def get_vector(self, word):
        """Return the learned embedding (row of W1) for `word`."""
        return self.W1[self.vocab[word]]

    def similarity(self, word1, word2):
        """Cosine similarity between the embeddings of two words."""
        v1 = self.get_vector(word1)
        v2 = self.get_vector(word2)
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
# Example corpus: a few short, tokenized sentences.
sentences = [
    ['the', 'cat', 'sits', 'on', 'the', 'mat'],
    ['the', 'dog', 'plays', 'in', 'the', 'park'],
    ['cats', 'and', 'dogs', 'are', 'pets'],
    ['the', 'quick', 'brown', 'cat', 'jumps'],
]

# Train Word2Vec on the toy corpus.
print("Training Word2Vec...")
w2v = SimpleWord2Vec(sentences, embedding_dim=10, window=2)
w2v.train(sentences, epochs=100)

# Probe a few pairwise cosine similarities.
print("\nWord Similarities:")
for a, b in (('cat', 'dog'), ('cat', 'mat'), ('the', 'and')):
    print(f" {a} - {b}: {w2v.similarity(a, b):.3f}")

# Sample output (exact values vary with the random initialization):
# Training Word2Vec...
# Epoch 0, Loss: 2.9847
# Epoch 20, Loss: 2.5634
# Epoch 40, Loss: 2.3123
# Epoch 60, Loss: 2.1456
# Epoch 80, Loss: 2.0234
#
# Word Similarities:
#  cat - dog: 0.423
#  cat - mat: 0.267
#  the - and: 0.512