Manual Clustering with Playing Cards
Draw 30 cards randomly and select three initial cluster centers with face values 10, 4, and 2. Assign the remaining cards to the nearest center based on absolute difference. Compute the means of the three groups; suppose they become 11, 5, and 2. Use these new centers to reassign the cards, then recompute the means. Iterate until the cluster centers stop changing.
K‑Means from Scratch on Iris Petal Length
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
# Use SimHei as matplotlib's sans-serif font for the plots below.
plt.rcParams['font.sans-serif'] = 'SimHei'
dataset = load_iris()
# NOTE(review): column 0 of the Iris feature matrix is sepal length; petal
# length is column 2 (confirm against the load_iris documentation). Switching
# to column 2 also needs a different center seeding: the first three
# petal-length values are not distinct, so seeding from the first k points
# would start with duplicate centers and leave one cluster empty.
samples = dataset.data[:, 0]
# One label slot per sample; every sample starts in cluster 0.
labels = np.zeros(samples.shape[0])
def initialize_centers(data, k):
    """Seed the cluster centers with the first ``k`` samples.

    Args:
        data: NumPy array of sample values; the first ``k`` entries are used.
        k: Number of clusters.

    Returns:
        A new float array of shape ``(k,)``. It is an independent copy, so
        changing the centers never writes back into ``data``.

    Raises:
        ValueError: If ``data`` holds fewer than ``k`` samples.
    """
    data = np.asarray(data)
    if k > data.shape[0]:
        raise ValueError(
            f"cannot pick {k} initial centers from {data.shape[0]} samples"
        )
    # np.array copies; slicing alone would hand back a view of the samples.
    return np.array(data[:k], dtype=float).reshape(k)
def closest_center_index(centers, value):
    """Return the index of the center nearest to ``value``.

    Distance is the absolute difference; when several centers are equally
    close, the lowest index wins.
    """
    gaps = np.absolute(centers - value)
    return gaps.argmin()
def assign_clusters(data, labels, centers):
    """Assign every sample to its nearest center.

    Args:
        data: Array of scalar sample values (one value per sample).
        labels: Array with at least one slot per sample; it is overwritten
            in place with the index of the nearest center.
        centers: Current cluster centers.

    Returns:
        The same ``labels`` object that was passed in, after the update.
    """
    points = np.asarray(data)
    centers = np.asarray(centers)
    # (n, 1) minus (1, k) broadcasts to an (n, k) table of distances, which
    # replaces the per-sample Python loop.
    distances = np.abs(points.reshape(-1, 1) - centers.reshape(1, -1))
    # argmin returns the first minimum, so ties go to the lowest center index.
    labels[:points.shape[0]] = np.argmin(distances, axis=1)
    return labels
def update_centers(data, labels, centers, k):
    """Recompute each cluster center as the mean of its assigned samples.

    Args:
        data: Array of sample values.
        labels: Array of the same length as ``data`` with each sample's
            cluster index.
        centers: Current cluster centers (length ``k``).
        k: Number of clusters.

    Returns:
        A tuple ``(new_centers, moved)``: ``new_centers`` is a new array of
        centers and ``moved`` is True if any center changed.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    updated = list(centers)
    moved = False
    for cluster_id in range(k):
        members = data[labels == cluster_id]
        if members.size == 0:
            # The mean of an empty cluster is NaN, and NaN never compares
            # equal, so the caller's convergence loop would never stop.
            # Keep the previous center instead.
            continue
        new_center = members.mean()
        if updated[cluster_id] != new_center:
            updated[cluster_id] = new_center
            moved = True
    return np.array(updated), moved
# Run k-means: alternate assignment and center updates until no center moves.
k_clusters = 3
centers = initialize_centers(samples, k_clusters)
# Cap the number of passes so a run that never reports "no change"
# (for example, a center that becomes NaN) cannot loop forever.
max_iterations = 300
changed = True
for _ in range(max_iterations):
    labels = assign_clusters(samples, labels, centers)
    centers, changed = update_centers(samples, labels, centers, k_clusters)
    if not changed:
        break
else:
    print("Warning: k-means did not converge within", max_iterations, "iterations")
print("Cluster assignments:", labels)
print("Final centers:", centers)
# Visualize the clustering result: sample value on x, assigned cluster on y.
# The y axis is labelled "Cluster Result", so plot the cluster index there
# rather than the sample values a second time.
plt.scatter(samples, labels, c=labels, s=50, cmap='rainbow', alpha=0.5)
plt.title("Clustering of Iris Petal Length")
plt.xlabel("Petal Length")
plt.ylabel("Cluster Result")
plt.show()
Using scikit‑learn KMeans on Iris Petal Length
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Use SimHei as matplotlib's sans-serif font for the plot below.
plt.rcParams['font.sans-serif'] = 'SimHei'
iris = load_iris()
# Petal length is the third feature column (index 2); index 0 is sepal length.
# KMeans expects a 2-D (n_samples, n_features) array, hence the reshape.
petal_len = iris.data[:, 2].reshape(-1, 1)
# Set n_init and random_state explicitly so repeated runs print the same
# centers and labels regardless of the installed scikit-learn defaults.
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(petal_len)
cluster_centers = model.cluster_centers_
cluster_labels = model.labels_
print("Cluster centers:", cluster_centers)
print("Labels:", cluster_labels)
# Sample value on x, assigned cluster on y. The y axis is labelled
# "Cluster Result", so plot the cluster index there rather than the sample
# values a second time.
plt.scatter(petal_len[:, 0], cluster_labels, c=cluster_labels, s=50,
            cmap='rainbow', alpha=0.5)
plt.title("K‑Means on Iris Petal Length (sklearn)")
plt.xlabel("Petal Length")
plt.ylabel("Cluster Result")
plt.show()
Full Iris Dataset Clustering
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Use SimHei as matplotlib's sans-serif font for the plot below.
plt.rcParams['font.sans-serif'] = 'SimHei'
iris = load_iris()
# Cluster on all feature columns of the dataset.
features = iris.data
# Set n_init and random_state explicitly so repeated runs print the same
# centers and labels regardless of the installed scikit-learn defaults.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(features)
centers = kmeans.cluster_centers_
labels = kmeans.labels_
print("Centers:", centers)
print("Assignments:", labels)
# Show the clusters in the plane of feature columns 2 and 3
# (labelled petal length and petal width), colored by assigned cluster.
plt.scatter(
    x=features[:, 2],
    y=features[:, 3],
    s=50,
    c=labels,
    cmap='rainbow',
    alpha=0.5,
)
plt.title("Clustering Full Iris Data")
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
Practical Applications of K‑Means
- Document Grouping – Segment documents by topics, tags, and content similarity.
- Crime Hotspot Detection – Analyze location‑based crime reports to identify high‑risk urban areas.
- Customer Segmentation – Cluster users by purchase history, interests, or behavioral patterns for targeted marketing.
- Insurance Fraud Identification – Detect unusual claim patterns in automobile, health, and property insurance.
- Ride‑Sharing Trip Analysis – Mine publicly available ride data (e.g., Uber) to discover peak travel zones and traffic trends.
- Criminal Network Investigation – Uncover connections among individuals and groups through link analysis on criminal records.