Library Imports
Import the modules needed for data handling, dimensionality reduction (PCA), modeling (SVM), and visualization.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.decomposition import PCA

Data Loading and Exploration
Fetch the Labeled Faces in the Wild (LFW) dataset, filtering for individuals with a minimum of 70 images, and resize them. Display a sample image to verify the dataset structure.
# Fetch the LFW dataset, keeping only people with at least 70 images and
# scaling each image to 40% of its original size.
face_dataset = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
# Per-image dimensions; needed later to reshape flat feature vectors back
# into 2-D images for display.
img_height, img_width = face_dataset.images[0].shape
identity_labels = face_dataset.target_names  # human-readable class names
# Show one sample image to sanity-check that the data loaded correctly.
plt.imshow(face_dataset.images[0], cmap='gray')
plt.show()
# Initial Model Training and Evaluation
Divide the data into training and testing subsets. Train a baseline Support Vector Classifier with an RBF kernel and balanced class weights, then evaluate its performance.
# Hold out 25% of the data for testing; fixed random_state keeps the split
# reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    face_dataset.data, face_dataset.target, test_size=0.25, random_state=42
)
# Baseline: RBF-kernel SVM on the raw pixel features. class_weight='balanced'
# compensates for the unequal number of images per person in LFW.
svm_classifier = SVC(kernel='rbf', class_weight='balanced')
svm_classifier.fit(train_features, train_labels)
baseline_predictions = svm_classifier.predict(test_features)
print(classification_report(test_labels, baseline_predictions, target_names=identity_labels))
# Dimensionality Reduction with PCA
Principal Component Analysis (PCA) is applied to reduce the feature space while preserving variance, which helps improve SVM efficiency and accuracy.
# Project the pixel features onto the top principal components ("eigenfaces").
# whiten=True rescales components to unit variance, which tends to help the
# RBF kernel.
num_components = 100
# Fit PCA on the TRAINING split only. The original fit on face_dataset.data
# leaked information from the test set into the transform, inflating the
# reported test scores.
dim_reducer = PCA(n_components=num_components, whiten=True).fit(train_features)
train_features_reduced = dim_reducer.transform(train_features)
test_features_reduced = dim_reducer.transform(test_features)
# Same SVM configuration as the baseline, now on the reduced features.
pca_classifier = SVC(kernel='rbf', class_weight='balanced')
pca_classifier.fit(train_features_reduced, train_labels)
pca_predictions = pca_classifier.predict(test_features_reduced)
print(classification_report(test_labels, pca_predictions, target_names=identity_labels))
# Hyperparameter Optimization
Use cross-validated grid search to find the optimal penalty parameter (C) and kernel coefficient (gamma) for the SVM operating on the reduced feature set.
# Search over the SVM penalty (C) and RBF kernel width (gamma) with
# cross-validation on the PCA-reduced training data.
hyperparameter_options = {
    'C': [1e-1, 1e0, 5e0, 1e1, 1e2],
    'gamma': [5e-4, 1e-3, 5e-3, 1e-2],
}
grid_search = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), hyperparameter_options
)
grid_search.fit(train_features_reduced, train_labels)
# best_estimator_ is already refit on the full training split by GridSearchCV.
optimal_model = grid_search.best_estimator_
final_predictions = optimal_model.predict(test_features_reduced)
print(optimal_model)
print(classification_report(test_labels, final_predictions, target_names=identity_labels))
# Prediction Visualization
Define helper functions to format the predicted and actual names, then plot a gallery of test images with their corresponding prediction results.
def format_prediction_label(preds, true_vals, labels, idx):
    """Return a two-line caption comparing predicted vs. actual identity.

    Only the last name (the text after the final space in the full name)
    is shown, so subplot titles stay short.
    """
    def last_name(class_id):
        # rsplit with maxsplit=1 keeps everything after the final space.
        return labels[class_id].rsplit(' ', 1)[-1]

    predicted_identity = last_name(preds[idx])
    actual_identity = last_name(true_vals[idx])
    return f'Predicted: {predicted_identity}\nActual: {actual_identity}'
def display_prediction_grid(images, titles, h, w, rows=3, cols=5):
    """Plot the first rows*cols images in a captioned grayscale grid.

    Each entry of ``images`` is a flat pixel vector that is reshaped to
    (h, w) before display; axis ticks are suppressed.
    """
    plt.figure(figsize=(1.8 * cols, 2.4 * rows))
    plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.9, hspace=0.35)
    n_cells = rows * cols
    for cell in range(n_cells):
        plt.subplot(rows, cols, cell + 1)
        plt.imshow(images[cell].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[cell], size=12)
        # Tick marks carry no meaning for image data; hide them.
        plt.xticks(())
        plt.yticks(())
# Build one "Predicted vs Actual" caption per test sample, then render the
# gallery of test faces with their prediction results.
result_titles = [
    format_prediction_label(final_predictions, test_labels, identity_labels, sample_idx)
    for sample_idx in range(len(final_predictions))
]
display_prediction_grid(test_features, result_titles, img_height, img_width)
plt.show()