Spam filtering systems often use a bag-of-words model, which counts each word's frequency in a document; unlike a model that merely records whether a word appears, it accounts for multiple occurrences of the same word.
1. Data Preparation: Text Segmentation
Previous examples used pre-defined word vectors. Here's how to build a word list from raw text documents.
Consider the following Python session:
>>> sample_text = 'This book is the best book on python or M.L. I have ever laid eyes upon.'
>>> sample_text.split()
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
>>> import re
>>> pattern = re.compile(r'\W+')
>>> tokens = pattern.split(sample_text)
>>> tokens
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
>>> [token for token in tokens if len(token) > 0]
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']
>>> [token.lower() for token in tokens if len(token) > 0]
['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
This example uses a dataset of 50 emails in the email/ham and email/spam folders; each folder contains 25 text files numbered 1.txt through 25.txt. Ten emails are randomly held out as a test set.
Create a Python module bayes_module.py with the following foundational functions:
#!/usr/bin/python
# -*- coding:utf-8 -*-

# Generates a list of unique words from a collection of documents.
def build_vocabulary(doc_collection):
    vocab = set()
    for doc in doc_collection:
        vocab = vocab | set(doc)  # union with the words of this document
    return list(vocab)

# Converts a document into a count vector based on a given vocabulary.
def doc_to_vector(vocab, doc_words):
    vec = [0] * len(vocab)
    for word in doc_words:
        if word in vocab:
            vec[vocab.index(word)] += 1  # bag-of-words: count every occurrence
        else:
            print(f"Word not in vocabulary: {word}")
    return vec
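A quick interactive check of these helpers, assuming the functions above have been saved in bayes_module.py (the two mini-documents are made up for illustration; the ordering of the returned vocabulary varies between runs because sets are unordered, so only order-independent quantities are shown):

>>> import bayes_module
>>> docs = [['my', 'dog', 'has', 'flea', 'problems'],
...         ['stop', 'posting', 'stupid', 'garbage']]
>>> vocab = bayes_module.build_vocabulary(docs)
>>> len(vocab)
9
>>> sum(bayes_module.doc_to_vector(vocab, ['dog', 'dog', 'stop']))
3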
2. Training the Algorithm: Calculating Probabilities from Vectors
Given the numerical representation of documents and their class labels, we can compute the quantities Bayes' rule requires: under the naive independence assumption, P(class | document) is proportional to P(class) multiplied by P(word | class) for every word occurrence in the document. First, calculate the prior probability P(spam) as the fraction of training documents labeled spam. Then estimate each conditional probability P(word | class) from smoothed word counts: initializing the counts to 1 and the class totals to 2 (Laplace smoothing) keeps a single unseen word from zeroing out the whole product, and taking logarithms prevents numerical underflow when many small probabilities are multiplied.
import numpy as np

def train_naive_bayes(feature_matrix, labels):
    num_docs = len(feature_matrix)
    num_features = len(feature_matrix[0])
    p_spam = sum(labels) / float(num_docs)  # prior probability P(spam)
    # Initialize counts for Laplace smoothing: each word starts with a
    # pseudo-count of 1, each class total with 2.
    spam_word_count = np.ones(num_features)
    ham_word_count = np.ones(num_features)
    spam_total_words = 2.0
    ham_total_words = 2.0
    for i in range(num_docs):
        if labels[i] == 1:  # spam document
            spam_word_count += feature_matrix[i]
            spam_total_words += sum(feature_matrix[i])
        else:  # ham document
            ham_word_count += feature_matrix[i]
            ham_total_words += sum(feature_matrix[i])
    # Take logarithms of the conditional probabilities to prevent underflow
    p_words_given_spam = np.log(spam_word_count / spam_total_words)
    p_words_given_ham = np.log(ham_word_count / ham_total_words)
    return p_words_given_ham, p_words_given_spam, p_spam
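As a sanity check on toy numbers (invented for illustration, not from the email dataset), train on one ham and one spam document over a three-word vocabulary:

>>> import numpy as np
>>> features = np.array([[2, 0, 1],    # ham: word 0 twice, word 2 once
...                      [0, 3, 0]])   # spam: word 1 three times
>>> labels = np.array([0, 1])
>>> p_ham_v, p_spam_v, p_spam = bayes_module.train_naive_bayes(features, labels)
>>> print(p_spam)
0.5
>>> np.allclose(np.exp(p_spam_v), [1/5, 4/5, 1/5])   # smoothed (count + 1) / (3 + 2)
True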
3. Testing the Algorithm: Cross-Validation with Naive Bayes
def classify_email(feature_vec, p_ham_vec, p_spam_vec, p_spam_prior):
    # Work with log probabilities for numerical stability
    spam_score = sum(feature_vec * p_spam_vec) + np.log(p_spam_prior)
    ham_score = sum(feature_vec * p_ham_vec) + np.log(1.0 - p_spam_prior)
    return 1 if spam_score > ham_score else 0
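Because the per-word probabilities are stored as logarithms, multiplying P(class) by P(word | class) once per word occurrence turns into adding count-weighted log probabilities, which is exactly what the element-wise product with feature_vec computes. Continuing the toy session above:

>>> bayes_module.classify_email(np.array([0, 2, 0]), p_ham_v, p_spam_v, p_spam)
1
>>> bayes_module.classify_email(np.array([1, 0, 1]), p_ham_v, p_spam_v, p_spam)
0

A document dominated by word 1 (frequent in the spam training document) is labeled spam; one containing words 0 and 2 is labeled ham.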
def parse_text(raw_string):
    import re
    # Split on runs of non-word characters. Note \W+ rather than \W*:
    # zero-length matches would make re.split break the string apart
    # character by character in Python 3.
    token_list = re.split(r'\W+', raw_string)
    # Keep only tokens longer than two characters, lowercased
    return [tok.lower() for tok in token_list if len(tok) > 2]
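Applied to the sample sentence from earlier, the length filter also discards short tokens such as 'is' and 'or' and the single-letter fragments of 'M.L.':

>>> bayes_module.parse_text('This book is the best book on python or M.L. I have ever laid eyes upon.')
['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']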
def evaluate_spam_filter():
    import random
    docs = []
    classes = []
    all_words = []
    # Load spam emails
    for i in range(1, 26):
        word_stream = parse_text(open(f'email/spam/{i}.txt').read())
        docs.append(word_stream)
        all_words.extend(word_stream)
        classes.append(1)
    # Load ham emails
    for i in range(1, 26):
        word_stream = parse_text(open(f'email/ham/{i}.txt').read())
        docs.append(word_stream)
        all_words.extend(word_stream)
        classes.append(0)
    vocabulary = build_vocabulary(docs)
    training_indices = list(range(50))
    test_indices = []
    # Randomly select 10 documents for testing
    for _ in range(10):
        rand_idx = int(random.uniform(0, len(training_indices)))
        test_indices.append(training_indices[rand_idx])
        del training_indices[rand_idx]
    # Prepare training data
    train_features = []
    train_labels = []
    for idx in training_indices:
        train_features.append(doc_to_vector(vocabulary, docs[idx]))
        train_labels.append(classes[idx])
    # Train the classifier
    p_ham_v, p_spam_v, p_spam = train_naive_bayes(np.array(train_features), np.array(train_labels))
    # Test the classifier on the held-out documents
    errors = 0
    for idx in test_indices:
        feature_vector = doc_to_vector(vocabulary, docs[idx])
        prediction = classify_email(np.array(feature_vector), p_ham_v, p_spam_v, p_spam)
        if prediction != classes[idx]:
            errors += 1
    error_rate = float(errors) / len(test_indices)
    print(f'Classification error rate: {error_rate}')
    return error_rate
Executing the test function yields varying results due to random sampling:
>>> import bayes_module
>>> bayes_module.evaluate_spam_filter()
Classification error rate: 0.1
0.1
>>> bayes_module.evaluate_spam_filter()
Classification error rate: 0.0
0.0
Repeated runs show an asymmetry in the errors: spam messages are misclassified as ham more often than ham is flagged as spam. This is generally the preferable direction, since letting the occasional spam message through is less costly than sending a legitimate email to the spam folder.
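Since every run draws a different random test set, a steadier estimate comes from averaging the error rate over repeated splits. A minimal sketch (average_error_rate and the count of 10 runs are illustrative additions, not part of the module above):

def average_error_rate(num_runs=10):
    # Repeat the random hold-out split and average the resulting error rates
    total = 0.0
    for _ in range(num_runs):
        total += evaluate_spam_filter()
    return total / num_runs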