Task Overview
Sentiment classification is a fundamental task in Natural Language Processing (NLP) that involves categorizing text (such as reviews or tweets) based on emotional sentiment (e.g., binary classification: positive/negative).
In this tutorial, we'll use the IMDB movie review dataset to implement three different models using PyTorch. We'll start by preparing the data using the torchtext library, which helps create a vocabulary. Then we'll implement and train three distinct models for sentiment classification.
Package Imports and Random Seed
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from torchtext import datasets
# One fixed seed shared by every source of randomness in this tutorial
SEED = 1234

# Reproducibility: request deterministic cuDNN kernels, then seed the
# CPU generator and the GPU generator with the same value
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
Dataset Import
# Using torchtext: declare how raw text and labels are turned into tensors
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
# Load the IMDB reviews as train/test datasets processed by the two fields.
# Without this step `train_data` and `test_data` would be undefined below.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
# Inspect a sample
print(vars(train_data.examples[0]))
- `Field` is a torchtext class for text preprocessing
- `tokenize='spacy'` specifies the spaCy tokenizer for word segmentation
- `LabelField` standardizes labels and converts them to tensor format
- `dtype=torch.float` sets the label data type to float (0/1 for binary classification)
Dataset Processing: Train-Validation Split
# Split training data into training and validation sets.
# random.seed() returns None, so passing its result as random_state only
# worked through the seeding side effect. Seed first, then hand the resulting
# generator state to split() explicitly so the split is reproducible.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
- `train_data.split()` divides the data into training and validation sets
- Seeding Python's `random` module with `SEED` and passing `random_state` ensures consistent splits across runs
Benefits of this approach:
- Helps detect overfitting by monitoring performance on held-out data
- Enables hyperparameter tuning without data leakage
Vocabulary Creation
When creating a vocabulary, we need to:
- Convert text and labels to numerical format
- Utilize pre-trained word vectors for better semantic understanding
- Control vocabulary size to balance efficiency and coverage
# Vocabularies are built from the training split only.
# The text vocabulary is capped in size and paired with pre-trained GloVe vectors.
TEXT.build_vocab(
    train_data,
    max_size=25000,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

# Report how many distinct entries each vocabulary holds
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
- `max_size=25000` limits the vocabulary to the most frequent 25,000 words
- `vectors="glove.6B.100d"` loads pre-trained GloVe embeddings
- `unk_init=torch.Tensor.normal_` initializes the vectors of words missing from GloVe from a normal distribution
Creating Iterators
BATCH_SIZE = 64

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One batching iterator per dataset split, all placing tensors on `device`
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Example of mapping token ids from a batch back to words.
# NOTE(review): assuming text is [sent_len, batch_size] as the model comments
# state, row 1 is the token at position 1 of every example - confirm intent.
[
    TEXT.vocab.itos[i]
    for i in next(iter(train_iterator)).text[1, :]
]
- `BucketIterator` creates batches with similar lengths to minimize padding
- `device` specifies whether to use GPU or CPU
Word Averaging Model
The Word Averaging model takes the mean of all word vectors in a sentence and passes it through a classifier:
- Input sentence → tokenization → word vectors
- Average pooling → sentence vector
- Fully connected layer → classification output
class TextClassifierWordAvg(nn.Module):
    """Word-averaging classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word vector.
        output_dim: Number of output units of the final linear layer.
        pad_idx: Index of the padding token, passed to ``nn.Embedding`` as
            ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Return raw scores of shape [batch_size, output_dim].

        Args:
            text: Token-index tensor of shape [sent_len, batch_size].
        """
        # embedded shape: [sent_len, batch_size, emb_dim]
        embedded = self.embedding(text)
        # Put the batch first for pooling: [batch_size, sent_len, emb_dim]
        embedded = embedded.permute(1, 0, 2)
        # A (sent_len, 1) window averages over every position of a sentence
        # (padding positions included) and leaves the embedding axis untouched:
        # [batch_size, 1, emb_dim] -> [batch_size, emb_dim]
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)
Model initialization:
# Model parameters
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_DIM = 100  # must match the dimensionality of the pre-trained vectors
OUTPUT_DIM = 1  # a single logit for binary classification
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# Initialize model
sentiment_model = TextClassifierWordAvg(VOCAB_SIZE, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
# The embedding-loading and training code that follows refers to the network
# as `model`; bind that name so those steps do not raise NameError.
model = sentiment_model
Training Process
First, let's load pre-trained embeddings:
# Overwrite the embedding table with the vectors attached to the vocabulary
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# The unknown and padding tokens start from all-zero vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
Now, let's define training and evaluation functions:
def train_model(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss and accuracy.

    Args:
        model: Network called on ``batch.text``; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(loss, accuracy)``, each averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # enable training-mode behaviour such as dropout
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = calculate_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate_model(model, iterator, criterion):
    """Evaluate the model without updating it; return mean per-batch loss and accuracy.

    Args:
        model: Network called on ``batch.text``; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: Loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(loss, accuracy)``, each averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # switch off training-only behaviour such as dropout
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = calculate_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def calculate_accuracy(predictions, labels):
    """Return the fraction of predictions that match the labels, as a tensor.

    Args:
        predictions: Raw model outputs (logits); they are passed through a
            sigmoid and rounded to 0 or 1 before comparison.
        labels: Target values compared element-wise with the rounded predictions.
    """
    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy
def time_elapsed(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; fractional seconds are truncated.
    """
    elapsed_time = end_time - start_time
    minutes = int(elapsed_time / 60)
    seconds = int(elapsed_time - (minutes * 60))
    return minutes, seconds
Now let's train the model:
# Initialize training components.
# Move the model (and loss) to the target device before building the
# optimizer, as the torch.optim documentation recommends, so the optimizer is
# constructed over the parameters in their final location.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters())
# Training parameters
NUM_EPOCHS = 10
best_validation_loss = float('inf')  # any real validation loss will beat this
# Training loop: one pass over the training data, then one validation pass
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_model(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate_model(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = time_elapsed(start_time, end_time)
    # Save best model: keep the weights with the lowest validation loss so far
    if valid_loss < best_validation_loss:
        best_validation_loss = valid_loss
        torch.save(model.state_dict(), 'best-sentiment-model.pt')
    # Print progress
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')
Recurrent Neural Network (RNN) Model
Let's implement an LSTM-based model for sentiment classification:
class SentimentRNN(nn.Module):
    """LSTM sentiment classifier: embedding -> LSTM -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word vector.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output units of the final linear layer.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout_rate: Dropout probability used on the embeddings, between LSTM
            layers, and on the final hidden state.
        pad_idx: Index of the padding token, passed to ``nn.Embedding`` as
            ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers,
                 bidirectional, dropout_rate, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers; a non-zero rate
        # with a single layer has no effect and only triggers a warning.
        lstm_dropout = dropout_rate if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=bidirectional, dropout=lstm_dropout)
        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text):
        """Return raw scores of shape [batch_size, output_dim].

        Args:
            text: Token-index tensor of shape [sent_len, batch_size].
        """
        # embedded shape: [sent_len, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(text))
        # hidden shape: [num_layers * num_directions, batch_size, hid_dim]
        _, (hidden, _) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # Last layer's final forward and backward hidden states
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Last layer's final hidden state
            final_hidden = hidden[-1, :, :]
        # No squeeze here: squeezing dim 0 would drop the batch axis whenever a
        # batch holds a single example and break the [batch, output] contract.
        return self.fc(self.dropout(final_hidden))
Model initialization:
# Hyperparameters of the recurrent model
HIDDEN_DIM = 256
NUM_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT_RATE = 0.5

# Build the LSTM classifier, spelling out which value feeds which argument
rnn_model = SentimentRNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=NUM_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout_rate=DROPOUT_RATE,
    pad_idx=PAD_IDX,
)
Convolutional Neural Network (CNN) Model
CNNs can also be effective for text classification. Here's a simple implementation:
class TextCNN(nn.Module):
    """Convolutional text classifier with one max-pooled conv branch per filter size.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word vector.
        num_filters: Output channels of every convolution.
        filter_sizes: Sequence of window heights (in tokens), one per branch.
        output_dim: Number of output units of the final linear layer.
        dropout: Dropout probability applied to the concatenated features.
        pad_idx: Index of the padding token, passed to ``nn.Embedding`` as
            ``padding_idx`` and used to lengthen too-short batches.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=num_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Remembered so forward() can pad batches shorter than the largest window.
        self.pad_idx = pad_idx
        self.min_len = max(filter_sizes, default=0)

    def forward(self, text):
        """Return raw scores of shape [batch_size, output_dim].

        Args:
            text: Token-index tensor of shape [sent_len, batch_size].
        """
        text = text.permute(1, 0)  # [batch_size, sent_len]
        # A convolution cannot slide a window taller than its input, so pad
        # short batches with the padding token up to the largest filter size.
        shortfall = self.min_len - text.shape[1]
        if shortfall > 0 and self.pad_idx is not None:
            filler = text.new_full((text.shape[0], shortfall), self.pad_idx)
            text = torch.cat((text, filler), dim=1)
        embedded = self.embedding(text)  # [batch_size, sent_len, emb_dim]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, sent_len, emb_dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # conved_n shape: [batch_size, num_filters, sent_len-filter_size[n]+1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n shape: [batch_size, num_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat shape: [batch_size, num_filters * len(filter_sizes)]
        return self.fc(cat)
Model initialization:
# Hyperparameters of the convolutional model
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.5

# Build the CNN classifier, spelling out which value feeds which argument
cnn_model = TextCNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
Next Steps
After implementing these models, you can:
- Participate in Kaggle competitions related to sentiment analysis
- Reproduce and improve upon existing projects on GitHub
- Experiment with different hyperparameters and architectures
- Try more advanced techniques like attention mechanisms or transformers