Prepare text data for machine learning
Proper text preprocessing is crucial for NLP tasks.

**Common steps:**

1. Lowercasing
2. Tokenization
3. Removing punctuation/special characters
4. Removing stop words
5. Stemming/Lemmatization
6. Handling contractions
Build a preprocessing pipeline:
import re
from collections import Counter
class TextPreprocessor:
    """Minimal text-preprocessing pipeline for NLP examples.

    The pipeline lowercases the text, deletes punctuation, splits on
    whitespace, optionally drops stop words and optionally applies a naive
    suffix-stripping stemmer.

    Args:
        stop_words: Optional iterable of words to drop in
            ``remove_stopwords``. ``preprocess`` lowercases the text before
            filtering, so the words should be lowercase. When omitted,
            ``DEFAULT_STOP_WORDS`` is used.
    """

    # Stop words used when the caller does not supply any.
    DEFAULT_STOP_WORDS = frozenset({
        'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for',
        'of', 'and', 'is', 'are', 'was', 'were', 'be',
    })

    # Suffixes tried in order by simple_stem; the first match wins, so a
    # longer suffix must come before any shorter suffix it ends with
    # ('ies' before 'es' before 's').
    SUFFIXES = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')

    # A suffix is only stripped when at least this many characters remain;
    # this keeps 'ing' -> '' and 'is' -> 'i' from happening.
    MIN_STEM_LENGTH = 2

    # Matches every character that is neither a word character nor
    # whitespace. Compiled once instead of on every call.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')

    def __init__(self, stop_words=None):
        if stop_words is None:
            stop_words = self.DEFAULT_STOP_WORDS
        # Copy into a fresh set so each instance can be customised without
        # affecting the shared default or the caller's collection.
        self.stop_words = set(stop_words)

    def lowercase(self, text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_punctuation(self, text):
        """Return ``text`` with punctuation characters deleted.

        Characters are removed, not replaced by a space, so "it's" becomes
        "its". Underscores count as word characters and are kept.
        """
        return self._PUNCTUATION_RE.sub('', text)

    def tokenize(self, text):
        """Split ``text`` on runs of whitespace and return the tokens."""
        return text.split()

    def remove_stopwords(self, tokens):
        """Return the tokens that are not in ``self.stop_words``."""
        return [token for token in tokens if token not in self.stop_words]

    def simple_stem(self, word):
        """Strip the first matching suffix from ``word``.

        Only the first suffix in ``SUFFIXES`` that matches is considered.
        If removing it would leave fewer than ``MIN_STEM_LENGTH``
        characters, the word is returned unchanged, so the result is never
        an empty string for a non-empty word.
        """
        for suffix in self.SUFFIXES:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                return stem if len(stem) >= self.MIN_STEM_LENGTH else word
        return word

    def preprocess(self, text, remove_stops=True, stem=True):
        """Run the full pipeline on ``text`` and return a list of tokens.

        Args:
            text: The raw input string.
            remove_stops: When true, drop tokens found in ``self.stop_words``.
            stem: When true, apply ``simple_stem`` to every remaining token.

        Returns:
            The processed tokens, in their original order.
        """
        cleaned = self.remove_punctuation(self.lowercase(text))
        tokens = self.tokenize(cleaned)
        if remove_stops:
            tokens = self.remove_stopwords(tokens)
        if stem:
            tokens = [self.simple_stem(token) for token in tokens]
        return tokens
# Example usage: run the preprocessor on a short sample and inspect the result.
preprocessor = TextPreprocessor()
text = """Machine Learning is amazing! It's revolutionizing industries
and creating new opportunities. The applications are endless."""
print("Original text:")
print(text)

# Preprocess with the default settings (stop-word removal and stemming on).
tokens = preprocessor.preprocess(text)
print("\nAfter preprocessing:")
print(tokens)

# Word frequency: every token occurs once here, so most_common() returns
# them in the order they were first seen.
word_freq = Counter(tokens)
print("\nMost common words:")
for word, freq in word_freq.most_common(5):
    print(f" '{word}': {freq}")

# Example: building a vocabulary that maps each distinct token to an index,
# assigned in alphabetical order.
vocab = {word: idx for idx, word in enumerate(sorted(set(tokens)))}
print("\nVocabulary (first 10):")
for word, idx in list(vocab.items())[:10]:
    print(f" '{word}': {idx}")

# Output of the script above. The suffix stripper is deliberately naive, so
# several stems ('revolutioniz', 'endles') are not real words.
#
# Original text:
# Machine Learning is amazing! It's revolutionizing industries
# and creating new opportunities. The applications are endless.
#
# After preprocessing:
# ['machine', 'learn', 'amaz', 'it', 'revolutioniz', 'industr', 'creat', 'new', 'opportunit', 'application', 'endles']
#
# Most common words:
#  'machine': 1
#  'learn': 1
#  'amaz': 1
#  'it': 1
#  'revolutioniz': 1
#
# Vocabulary (first 10):
#  'amaz': 0
#  'application': 1
#  'creat': 2
#  'endles': 3
#  'industr': 4
#  'it': 5
#  'learn': 6
#  'machine': 7
#  'new': 8
#  'opportunit': 9