Reading the SMS Spam Dataset
The first step in our spam-classification task is to load the dataset. We'll use Python to read the SMSSpamCollection file, a tab-separated corpus of SMS messages, each labeled as "ham" or "spam".
import csv
def load_sms_dataset(file_path):
    """
    Load the SMS spam dataset from a tab-separated file.

    Each line is expected to look like '<label>TAB<message>'. Blank lines and
    lines without a tab separator are skipped instead of raising (the original
    indexed row[1] unconditionally, so any short/blank row crashed the loader).
    Splitting on the *first* tab only also preserves tab characters inside the
    message text, which csv.reader would have split into extra, silently
    dropped columns.

    Args:
        file_path: Path to the dataset file (e.g. SMSSpamCollection).

    Returns:
        tuple[list[str], list[str]]: (labels, messages), index-aligned.
    """
    labels = []
    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line:
                continue  # skip blank lines
            parts = line.split('\t', 1)
            if len(parts) != 2:
                continue  # skip malformed lines with no tab separator
            labels.append(parts[0])
            messages.append(parts[1])
    return labels, messages
# Usage example
# NOTE: path is relative to the current working directory — run from the
# project's source/notebook directory so ../data resolves correctly.
labels, messages = load_sms_dataset('../data/SMSSpamCollection')
print(f"Loaded {len(labels)} messages")
SMS Text Preprocessing
Before feeding our text data to the Naive Bayes classifier, we need to preprocess it. This involves several steps:
- Converting text to lowercase
- Removing punctuation
- Sentence and word tokenization
- Removing stop words
- Part-of-speech tagging
- Lemmatization
Using NLTK for Text Processing
Natural Language Toolkit (NLTK) provides excellent resources for text preprocessing. First, ensure NLTK is installed:
pip install nltk
After installation, download the required NLTK data packages:
import nltk

# Download the NLTK resources the preprocessing pipeline needs.
# Newer NLTK releases renamed some resources ('punkt' -> 'punkt_tab',
# 'averaged_perceptron_tagger' -> 'averaged_perceptron_tagger_eng'), so we
# request both spellings to work across NLTK versions. nltk.download() just
# reports failure (returns False) for names a given version doesn't know,
# rather than raising.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(_resource)
Preprocessing Function
Here's a comprehensive preprocessing function that handles all the necessary steps:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import string
def preprocess_text(text):
    """
    Preprocess a raw message for spam classification.

    Steps:
    1. Convert to lowercase
    2. Tokenize into sentences, then words
    3. Strip punctuation and drop stop words / very short tokens
    4. POS-tag the remaining tokens
    5. Lemmatize each token using its mapped WordNet POS

    Args:
        text: Raw message string.

    Returns:
        list[str]: Lemmatized tokens.
    """
    # Convert to lowercase
    text = text.lower()

    # BUG FIX: sentence-tokenize BEFORE removing punctuation. The original
    # stripped punctuation first, which deletes the sentence-final '.'/'?'/'!'
    # that sent_tokenize relies on, collapsing the whole text into a single
    # "sentence" and making the sentence-tokenization step a no-op.
    sentences = sent_tokenize(text)

    stop_words = set(stopwords.words('english'))
    # Translation table that deletes all ASCII punctuation characters.
    punct_table = str.maketrans('', '', string.punctuation)

    tokens = []
    for sentence in sentences:
        for word in word_tokenize(sentence):
            # Strip punctuation characters left inside/around the token;
            # pure-punctuation tokens become empty and are dropped below.
            word = word.translate(punct_table)
            # Drop stop words and very short (<= 2 char) tokens.
            if word and word not in stop_words and len(word) > 2:
                tokens.append(word)

    # POS tagging, then lemmatization with the mapped WordNet POS.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word, pos in nltk.pos_tag(tokens):
        # WordNetLemmatizer defaults to noun ('n') when no pos is given, so
        # mapping unknown tags to 'n' matches the original else-branch.
        lemmatized_tokens.append(
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos) or 'n')
        )
    return lemmatized_tokens
# Penn Treebank tag prefix -> WordNet POS character.
_TREEBANK_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def get_wordnet_pos(treebank_tag):
    """
    Map a Penn Treebank POS tag to the WordNet POS character.

    Only the first letter of the tag matters: 'J*' -> adjective, 'V*' -> verb,
    'N*' -> noun, 'R*' -> adverb. Any other tag (or an empty string) maps to
    None, signalling the caller to fall back to the lemmatizer default.
    """
    return _TREEBANK_TO_WORDNET.get(treebank_tag[:1])
Preparing Training and Testing Data
After preprocessing all messages, we need to split our dataset into training and testing sets:
from sklearn.model_selection import train_test_split
# Preprocess all messages
# NOTE: this runs the full NLTK pipeline (tokenize/tag/lemmatize) once per
# message, so it is the slowest step of the script.
processed_messages = [preprocess_text(msg) for msg in messages]
# Split data into training and testing sets
# NOTE(review): if the ham/spam classes are imbalanced, consider adding
# stratify=labels so both splits keep the class ratio — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    processed_messages,  # list of token lists (output of preprocess_text)
    labels,              # parallel list of label strings
    test_size=0.2,       # hold out 20% of messages for evaluation
    random_state=42      # fixed seed for a reproducible split
)
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
Feature Vectorization
For Naive Bayes classification, we need to convert our text data into numerical features. We'll use the Bag-of-Words approach with TF-IDF weighting:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
def dummy_tokenizer(text):
    """
    Pass-through tokenizer for already-preprocessed text.

    The vectorizer receives space-joined token lists, so splitting on
    whitespace simply recovers the original tokens.
    """
    return [token for token in text.split()]
# Create TF-IDF Vectorizer
# lowercase=False because preprocess_text has already lowercased everything.
# NOTE(review): supplying a custom tokenizer makes sklearn ignore its
# token_pattern (it may warn about this) — presumably intentional here.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=dummy_tokenizer,  # input is pre-tokenized; just split on spaces
    lowercase=False,
    max_features=5000  # Use top 5000 features
)
# Fit and transform training data
# Token lists are re-joined with spaces because TfidfVectorizer expects raw
# strings; dummy_tokenizer then splits them back apart.
X_train_tfidf = tfidf_vectorizer.fit_transform([' '.join(msg) for msg in X_train])
# Transform test data (reuses vocabulary/IDF weights learned from training)
X_test_tfidf = tfidf_vectorizer.transform([' '.join(msg) for msg in X_test])
print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Testing data shape: {X_test_tfidf.shape}")
Training the Naive Bayes Model
Now we'll train a Multinomial Naive Bayes classifier on our preprocessed and vectorized data:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Initialize and train the model
# MultinomialNB accepts the non-negative TF-IDF features directly.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
# Make predictions on test data
y_pred = nb_classifier.predict(X_test_tfidf)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
# Detailed classification report (per-class precision/recall/F1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Confusion matrix (rows = true labels, columns = predicted labels)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Complete Implementation
Here's the complete implementation combining all the steps:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Download required NLTK data.
# Newer NLTK releases renamed some resources ('punkt' -> 'punkt_tab',
# 'averaged_perceptron_tagger' -> 'averaged_perceptron_tagger_eng'); request
# both spellings so this works across versions — nltk.download() only reports
# failure (returns False) for unknown names, it does not raise.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(_resource)
def load_sms_dataset(file_path):
    """
    Load the SMS spam dataset ('<label>TAB<message>' lines).

    Blank lines and lines without a tab are skipped instead of raising (the
    original indexed row[1] unconditionally and crashed on such lines), and
    splitting on the first tab only preserves tabs inside message text.

    Returns:
        tuple[list[str], list[str]]: (labels, messages), index-aligned.
    """
    labels = []
    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line:
                continue  # blank line
            parts = line.split('\t', 1)
            if len(parts) != 2:
                continue  # malformed line with no tab separator
            labels.append(parts[0])
            messages.append(parts[1])
    return labels, messages
def get_wordnet_pos(treebank_tag):
    """
    Translate a Penn Treebank POS tag into WordNet's POS character.

    Decided by the tag's first letter: J/V/N/R map to adjective/verb/noun/
    adverb; anything else yields None (caller falls back to the default).
    """
    for prefix, wordnet_pos in (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r')):
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None
def preprocess_text(text):
    """
    Preprocess a raw message for spam classification.

    Lowercases, sentence- and word-tokenizes, strips punctuation, removes
    stop words and very short tokens, then POS-tags and lemmatizes.

    Returns:
        list[str]: Lemmatized tokens.
    """
    # Convert to lowercase
    text = text.lower()
    # BUG FIX: sentence-tokenize BEFORE stripping punctuation — removing the
    # sentence-final '.'/'?'/'!' first leaves sent_tokenize nothing to split
    # on, so the whole text became one "sentence" in the original.
    sentences = sent_tokenize(text)
    tokens = []
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)
    for sentence in sentences:
        for word in word_tokenize(sentence):
            # Strip punctuation from the token; empty results are dropped.
            word = word.translate(punct_table)
            if word and word not in stop_words and len(word) > 2:
                tokens.append(word)
    # POS tagging and lemmatization. WordNetLemmatizer defaults to 'n'
    # (noun), so 'or "n"' reproduces the original's no-tag branch.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word, pos in nltk.pos_tag(tokens):
        lemmatized_tokens.append(
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos) or 'n')
        )
    return lemmatized_tokens
def dummy_tokenizer(text):
    """Recover pre-computed tokens by splitting on whitespace."""
    tokens = text.split()
    return tokens
def main():
    """Run the full spam-classification pipeline end to end.

    Loads the labeled SMS corpus, preprocesses each message with the NLTK
    pipeline, splits into train/test sets, builds TF-IDF features, trains a
    Multinomial Naive Bayes classifier, and prints evaluation metrics.
    """
    # Load dataset (path relative to the working directory).
    labels, messages = load_sms_dataset('../data/SMSSpamCollection')

    print("Preprocessing messages...")
    processed = [preprocess_text(message) for message in messages]

    # Reproducible 80/20 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        processed, labels, test_size=0.2, random_state=42
    )

    print("Creating feature vectors...")
    vectorizer = TfidfVectorizer(
        tokenizer=dummy_tokenizer, lowercase=False, max_features=5000
    )
    # Re-join token lists: TfidfVectorizer expects raw strings, which
    # dummy_tokenizer then splits back into the same tokens.
    train_docs = [' '.join(tokens) for tokens in X_train]
    test_docs = [' '.join(tokens) for tokens in X_test]
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)

    print("Training Naive Bayes classifier...")
    model = MultinomialNB()
    model.fit(train_matrix, y_train)

    # Evaluate on the held-out split.
    predictions = model.predict(test_matrix)
    accuracy = accuracy_score(y_test, predictions)
    print(f"\nModel Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))


if __name__ == '__main__':
    main()