Implementing Naive Bayes for Email Spam Classification

Reading the Dataset

The first step in our spam classification task is to load the dataset. Although this post is framed around email spam, we'll use the classic SMSSpamCollection corpus as a stand-in: a tab-separated file in which each line holds a label (ham or spam), a tab character, and the message text. The same pipeline applies to email bodies. We'll use Python's csv module to read it.


import csv

def load_sms_dataset(file_path):
    """
    Load SMS dataset from a tab-separated file.
    Returns: tuple of (labels, messages)
    """
    labels = []
    messages = []

    with open(file_path, 'r', encoding='utf-8') as file:
        # QUOTE_NONE: some messages contain double quotes, which the
        # default csv quoting rules would otherwise mangle
        csv_reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            if len(row) < 2:  # skip blank or malformed lines
                continue
            labels.append(row[0])
            messages.append(row[1])

    return labels, messages

# Usage example
labels, messages = load_sms_dataset('../data/SMSSpamCollection')
print(f"Loaded {len(labels)} messages")

Text Preprocessing

Before feeding our text data to the Naive Bayes classifier, we need to preprocess it. This involves several steps:

  • Converting text to lowercase
  • Sentence tokenization (done before punctuation removal, so boundary marks survive)
  • Removing punctuation
  • Word tokenization and stop word removal
  • Part-of-speech tagging
  • Lemmatization

Using NLTK for Text Processing

The Natural Language Toolkit (NLTK) provides excellent resources for text preprocessing. First, ensure NLTK is installed:


pip install nltk

After installation, download the required NLTK data packages:


import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Newer NLTK releases split some resources; if you hit a LookupError,
# also download 'punkt_tab' and 'averaged_perceptron_tagger_eng'.
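
An optional sanity check confirms the downloads worked; this is just a sketch with a made-up sentence:

from nltk.tokenize import word_tokenize

# Should print a token list if the 'punkt' data is available
print(word_tokenize("Free entry in a weekly competition!"))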

Preprocessing Function

Here's a comprehensive preprocessing function that handles all the necessary steps:


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import string

def preprocess_text(text):
    """
    Preprocess text for spam classification:
    1. Convert to lowercase
    2. Tokenize sentences
    3. Remove punctuation
    4. Tokenize words and remove stop words
    5. Perform POS tagging
    6. Lemmatize tokens
    """
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize sentences first: sent_tokenize relies on punctuation
    # to find boundaries, so it must run before punctuation removal
    sentences = sent_tokenize(text)
    
    # Tokenize words and remove stop words
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)
    tokens = []
    
    for sentence in sentences:
        # Strip punctuation sentence by sentence, then split into words
        words = word_tokenize(sentence.translate(punct_table))
        for word in words:
            if word not in stop_words and len(word) > 2:  # Also remove very short words
                tokens.append(word)
    
    # POS tagging
    tagged_tokens = nltk.pos_tag(tokens)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    
    for word, pos in tagged_tokens:
        # Map Treebank POS tags to the lemmatizer's format
        pos_tag = get_wordnet_pos(pos)
        if pos_tag:
            lemma = lemmatizer.lemmatize(word, pos=pos_tag)
        else:
            lemma = lemmatizer.lemmatize(word)
        lemmatized_tokens.append(lemma)
    
    return lemmatized_tokens

def get_wordnet_pos(treebank_tag):
    """
    Convert treebank POS tags to wordnet format
    """
    if treebank_tag.startswith('J'):
        return 'a'  # adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # verb
    elif treebank_tag.startswith('N'):
        return 'n'  # noun
    elif treebank_tag.startswith('R'):
        return 'r'  # adverb
    else:
        return None
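
To see the whole preprocessing pipeline in action, here's a quick usage sketch on a made-up spammy message (exact output can vary with your NLTK version and data files):

sample = "Congratulations! You have won a free prize. Call now!"
print(preprocess_text(sample))
# Expect something along the lines of:
# ['congratulation', 'win', 'free', 'prize', 'call']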

Preparing Training and Testing Data

After preprocessing all messages, we need to split our dataset into training and testing sets. Because spam is the minority class, we also stratify on the labels so both splits keep the same spam/ham ratio:


from sklearn.model_selection import train_test_split

# Preprocess all messages
processed_messages = [preprocess_text(msg) for msg in messages]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    processed_messages, 
    labels, 
    test_size=0.2, 
    random_state=42,
    stratify=labels  # preserve the spam/ham ratio in both splits
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Feature Vectorization

For Naive Bayes classification, we need to convert our text data into numerical features. We'll use the Bag-of-Words approach with TF-IDF weighting:


from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_tokenizer(text):
    """
    Dummy tokenizer since our text is already preprocessed
    """
    return text.split()

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=dummy_tokenizer,
    lowercase=False,
    token_pattern=None,  # silence the warning about the unused token_pattern
    max_features=5000    # Use top 5000 features
)

# Fit and transform training data
X_train_tfidf = tfidf_vectorizer.fit_transform([' '.join(msg) for msg in X_train])

# Transform test data
X_test_tfidf = tfidf_vectorizer.transform([' '.join(msg) for msg in X_test])

print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Testing data shape: {X_test_tfidf.shape}")

Training the Naive Bayes Model

Now we'll train a Multinomial Naive Bayes classifier on our preprocessed and vectorized data:


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Make predictions on test data
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
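
With the model trained, classifying a brand-new message reuses the same preprocessing function and fitted vectorizer. A minimal sketch (the sample message is made up):

def classify_message(message):
    """Run one raw message through the full pipeline and predict its label"""
    tokens = preprocess_text(message)
    features = tfidf_vectorizer.transform([' '.join(tokens)])
    return nb_classifier.predict(features)[0]

print(classify_message("You have won a free prize! Call now to claim."))  # likely 'spam'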

Complete Implementation

Here's the complete implementation combining all the steps:


import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download required NLTK data (newer NLTK releases may also need
# 'punkt_tab' and 'averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def load_sms_dataset(file_path):
    """Load SMS dataset from a tab-separated file"""
    labels = []
    messages = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        csv_reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            if len(row) < 2:  # skip blank or malformed lines
                continue
            labels.append(row[0])
            messages.append(row[1])
            
    return labels, messages

def get_wordnet_pos(treebank_tag):
    """Convert treebank POS tags to wordnet format"""
    if treebank_tag.startswith('J'):
        return 'a'  # adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # verb
    elif treebank_tag.startswith('N'):
        return 'n'  # noun
    elif treebank_tag.startswith('R'):
        return 'r'  # adverb
    else:
        return None

def preprocess_text(text):
    """Preprocess text for spam classification"""
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize sentences before removing punctuation, so that
    # sentence boundaries are still detectable
    sentences = sent_tokenize(text)
    tokens = []
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)
    
    for sentence in sentences:
        words = word_tokenize(sentence.translate(punct_table))
        for word in words:
            if word not in stop_words and len(word) > 2:
                tokens.append(word)
    
    # POS tagging and lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    
    for word, pos in nltk.pos_tag(tokens):
        pos_tag = get_wordnet_pos(pos)
        if pos_tag:
            lemma = lemmatizer.lemmatize(word, pos=pos_tag)
        else:
            lemma = lemmatizer.lemmatize(word)
        lemmatized_tokens.append(lemma)
    
    return lemmatized_tokens

def dummy_tokenizer(text):
    """Dummy tokenizer for preprocessed text"""
    return text.split()

def main():
    # Load dataset
    labels, messages = load_sms_dataset('../data/SMSSpamCollection')
    
    # Preprocess messages
    print("Preprocessing messages...")
    processed_messages = [preprocess_text(msg) for msg in messages]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        processed_messages, 
        labels, 
        test_size=0.2, 
        random_state=42,
        stratify=labels  # preserve the spam/ham ratio in both splits
    )
    
    # Vectorize text
    print("Creating feature vectors...")
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=dummy_tokenizer,
        lowercase=False,
        token_pattern=None,  # silence the unused token_pattern warning
        max_features=5000
    )
    
    X_train_tfidf = tfidf_vectorizer.fit_transform([' '.join(msg) for msg in X_train])
    X_test_tfidf = tfidf_vectorizer.transform([' '.join(msg) for msg in X_test])
    
    # Train classifier
    print("Training Naive Bayes classifier...")
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate
    y_pred = nb_classifier.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\nModel Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

if __name__ == '__main__':
    main()
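
If you want to reuse the trained model without rerunning the whole pipeline, you could persist the fitted vectorizer and classifier at the end of main(). A minimal sketch with joblib (file names are arbitrary):

import joblib

# The vectorizer and classifier must be saved and reloaded together:
# the classifier only understands the vectorizer's feature space
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(nb_classifier, 'spam_classifier.joblib')

# Later, in another script or session:
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
nb_classifier = joblib.load('spam_classifier.joblib')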

Tags: Machine Learning, Naive Bayes, Text Classification, Spam Detection, NLP

Posted on Sun, 10 May 2026 14:09:15 +0000 by Saphod