⏱️ 45 min

Text Preprocessing for NLP

Prepare text data for machine learning

Text Preprocessing Steps

Proper text preprocessing is crucial for NLP tasks.

**Common Steps:**

1. Lowercasing
2. Tokenization
3. Removing punctuation/special characters
4. Removing stop words
5. Stemming/Lemmatization
6. Handling contractions

Implementing Text Preprocessing

Build preprocessing pipeline:

python
import re
from collections import Counter

class TextPreprocessor:
    """Minimal text-preprocessing pipeline.

    Steps (see preprocess): lowercase, strip punctuation, whitespace
    tokenization, optional stop-word removal, optional naive suffix stemming.
    """

    def __init__(self):
        # Small built-in English stop-word list (intentionally tiny;
        # real pipelines would use a fuller list, e.g. NLTK's).
        self.stop_words = {'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for',
                          'of', 'and', 'is', 'are', 'was', 'were', 'be'}
        # Compiled once so repeated preprocess() calls don't re-look-up
        # the pattern on every invocation.
        self._punct_re = re.compile(r'[^\w\s]')

    def lowercase(self, text):
        """Return *text* lowercased."""
        return text.lower()

    def remove_punctuation(self, text):
        """Strip every character that is neither a word char nor whitespace.

        Note: this deletes (rather than spaces out) punctuation, so
        "it's" becomes "its".
        """
        return self._punct_re.sub('', text)

    def tokenize(self, text):
        """Split on runs of whitespace (no punctuation-aware tokenizing)."""
        return text.split()

    def remove_stopwords(self, tokens):
        """Drop tokens in the stop-word set (assumes lowercased tokens)."""
        return [token for token in tokens if token not in self.stop_words]

    def simple_stem(self, word):
        """Simple stemmer: strip the first matching common suffix.

        Bug fix: only strip when the word is strictly longer than the
        suffix, so short words ('s', 'es', 'ing') are never reduced to
        the empty string as they were before.
        """
        suffixes = ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
        for suffix in suffixes:
            if word.endswith(suffix) and len(word) > len(suffix):
                return word[:-len(suffix)]
        return word

    def preprocess(self, text, remove_stops=True, stem=True):
        """Run the full pipeline and return the list of processed tokens.

        Args:
            text: raw input string.
            remove_stops: if True, drop stop words after tokenizing.
            stem: if True, apply simple_stem to each remaining token.

        Returns:
            list[str] of lowercased, punctuation-free tokens.
        """
        # Order matters: lowercase before the stop-word check,
        # punctuation removal before tokenizing.
        text = self.lowercase(text)
        text = self.remove_punctuation(text)
        tokens = self.tokenize(text)

        if remove_stops:
            tokens = self.remove_stopwords(tokens)

        if stem:
            tokens = [self.simple_stem(token) for token in tokens]

        return tokens

# Example usage of the pipeline
tp = TextPreprocessor()

text = """Machine Learning is amazing! It's revolutionizing industries 
and creating new opportunities. The applications are endless."""

print("Original text:")
print(text)

# Full pipeline: stop-word removal and stemming are on by default
processed = tp.preprocess(text)
print("\nAfter preprocessing:")
print(processed)

# Frequency of each processed token
counts = Counter(processed)
print("\nMost common words:")
for token, count in counts.most_common(5):
    print(f"  '{token}': {count}")

# Example: Building a vocabulary (token -> integer id, alphabetical)
token_to_id = {tok: i for i, tok in enumerate(sorted(set(processed)))}
print("\nVocabulary (first 10):")
for tok, i in list(token_to_id.items())[:10]:
    print(f"  '{tok}': {i}")
Output:
Original text:
Machine Learning is amazing! It's revolutionizing industries 
and creating new opportunities. The applications are endless.

After preprocessing:
['machine', 'learn', 'amaz', 'it', 'revolutioniz', 'industr', 'creat', 'new', 'opportunit', 'application', 'endles']

Most common words:
  'machine': 1
  'learn': 1
  'amaz': 1
  'it': 1
  'revolutioniz': 1

Vocabulary (first 10):
  'amaz': 0
  'application': 1
  'creat': 2
  'endles': 3
  'industr': 4
  'it': 5
  'learn': 6
  'machine': 7
  'new': 8
  'opportunit': 9
Sharan Initiatives - Making a Difference Together