# Word Embeddings: Learning Dense Vector Representations of Words
Word embeddings map words to dense vectors where semantic similarity is captured by vector distance.

**Popular Methods:**
- Word2Vec (Skip-gram, CBOW)
- GloVe (Global Vectors)
- FastText

**Key Ideas:**
- Words with similar meanings have similar vectors
- Arithmetic on vectors (king - man + woman ≈ queen)
- Pre-trained embeddings transfer across tasks
Build a skip-gram model:
import numpy as np
from collections import defaultdict
class SimpleWord2Vec:
    """Minimal skip-gram Word2Vec with a full-softmax output layer.

    Learns one input embedding per word (the rows of ``W1``) by predicting
    each word's context words within a fixed window, trained with plain SGD.

    Parameters
    ----------
    sentences : list[list[str]]
        Tokenized corpus used to build the vocabulary.
    embedding_dim : int
        Size of each word vector.
    window : int
        Context radius on each side of the target word.
    lr : float
        SGD learning rate.
    """

    def __init__(self, sentences, embedding_dim=50, window=2, lr=0.025):
        self.embedding_dim = embedding_dim
        self.window = window
        self.lr = lr
        # Build vocabulary: indices assigned in first-seen order, so the
        # mapping is reproducible for a fixed corpus.
        self.vocab = {}
        self.inv_vocab = {}
        for sentence in sentences:
            for word in sentence:
                if word not in self.vocab:
                    idx = len(self.vocab)
                    self.vocab[word] = idx
                    self.inv_vocab[idx] = word
        self.vocab_size = len(self.vocab)
        # Small random init keeps the initial softmax near-uniform.
        self.W1 = np.random.randn(self.vocab_size, embedding_dim) * 0.01  # input embeddings
        self.W2 = np.random.randn(embedding_dim, self.vocab_size) * 0.01  # output weights

    def get_training_pairs(self, sentences):
        """Return all (target, context) word pairs within the window."""
        pairs = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                # Context words within `window` positions, clipped at the
                # sentence boundaries; the target itself is excluded.
                start = max(0, i - self.window)
                end = min(len(sentence), i + self.window + 1)
                for j in range(start, end):
                    if i != j:
                        pairs.append((target_word, sentence[j]))
        return pairs

    def train(self, sentences, epochs=100):
        """Run SGD on the full-softmax skip-gram objective.

        Prints the mean cross-entropy loss every 20 epochs.
        """
        pairs = self.get_training_pairs(sentences)
        for epoch in range(epochs):
            loss = 0.0
            for target, context in pairs:
                target_idx = self.vocab[target]
                context_idx = self.vocab[context]
                # Forward pass: the hidden layer is simply the target's
                # embedding row (one-hot input times W1).
                h = self.W1[target_idx]
                u = np.dot(h, self.W2)
                y_pred = self.softmax(u)
                # Cross-entropy loss for the true context word.
                loss -= np.log(y_pred[context_idx])
                # Backward pass: softmax + cross-entropy gradient is
                # (y_pred - one_hot). Copy so the forward output isn't
                # mutated in place.
                e = y_pred.copy()
                e[context_idx] -= 1
                # BUG FIX: the gradient w.r.t. the embedding must use the
                # *pre-update* W2, so compute it before W2 is modified.
                # (The original updated W2 first, then backpropagated
                # through the already-updated weights.)
                grad_h = np.dot(self.W2, e)
                self.W2 -= self.lr * np.outer(h, e)
                self.W1[target_idx] -= self.lr * grad_h
            if epoch % 20 == 0:
                print(f"Epoch {epoch}, Loss: {loss / len(pairs):.4f}")

    def softmax(self, x):
        """Numerically stable softmax over a 1-D score vector."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def get_vector(self, word):
        """Return the learned embedding (row of W1) for `word`."""
        return self.W1[self.vocab[word]]

    def similarity(self, word1, word2):
        """Cosine similarity between the embeddings of two words."""
        v1 = self.get_vector(word1)
        v2 = self.get_vector(word2)
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
# Example corpus: a few short, tokenized sentences.
sentences = [
    ['the', 'cat', 'sits', 'on', 'the', 'mat'],
    ['the', 'dog', 'plays', 'in', 'the', 'park'],
    ['cats', 'and', 'dogs', 'are', 'pets'],
    ['the', 'quick', 'brown', 'cat', 'jumps'],
]

# Train Word2Vec on the toy corpus.
print("Training Word2Vec...")
w2v = SimpleWord2Vec(sentences, embedding_dim=10, window=2)
w2v.train(sentences, epochs=100)

# Probe a few pairwise cosine similarities.
print("\nWord Similarities:")
for a, b in (('cat', 'dog'), ('cat', 'mat'), ('the', 'and')):
    print(f" {a} - {b}: {w2v.similarity(a, b):.3f}")

# Sample output (exact values vary with the random initialization):
# Training Word2Vec...
# Epoch 0, Loss: 2.9847
# Epoch 20, Loss: 2.5634
# Epoch 40, Loss: 2.3123
# Epoch 60, Loss: 2.1456
# Epoch 80, Loss: 2.0234
#
# Word Similarities:
#  cat - dog: 0.423
#  cat - mat: 0.267
#  the - and: 0.512