Unveiling the power of unsupervised learning through a step-by-step implementation of the K-Means algorithm, transforming raw data into meaningful clusters.

1. implementation using numpy only

step 1: import numpy and matplotlib

import numpy as np
import matplotlib.pyplot as plt

step 2: generate sample data

# 1. Generate sample data
# Fix the random seed so every run draws exactly the same "random" points
# (like starting a game with the same dice roll every time).
np.random.seed(0)

# np.random.normal(mean, std, (rows, columns)):
# two blobs of 100 two-dimensional points each, both with std 1,
# the first centred at 0 and the second centred at 5.
blob_a = np.random.normal(0, 1, (100, 2))
blob_b = np.random.normal(5, 1, (100, 2))

# Stack the blobs row-wise into a single (200, 2) data set.
X = np.concatenate([blob_a, blob_b])

step 3:initialize centroids randomly

k = 2  # number of clusters

# Draw k (2 in this case) distinct row positions from 0 up to the number of
# data points (X.shape[0] is the number of rows). replace=False ensures the
# same index is never chosen twice, so the starting centroids are k
# different data points.
initial_rows = np.random.choice(X.shape[0], k, replace=False)
centroids = X[initial_rows]

step 4: iteration

# K-means iterations: alternate between assigning every point to its nearest
# centroid and moving each centroid to the mean of the points assigned to it,
# until the centroids stop moving or max_iterations is reached.
max_iterations = 100
for _ in range(max_iterations):
    # Assign points to nearest centroid.
    # X[:, np.newaxis, :] has shape (n, 1, d); subtracting the (k, d)
    # centroids broadcasts to (n, k, d), so distances[p, c] is the Euclidean
    # distance from point p to centroid c.
    distances = np.sqrt(np.sum((X[:, np.newaxis, :] - centroids)**2, axis=2))
    labels = np.argmin(distances, axis=1)

    # Update centroids to the mean of their assigned points.
    # A cluster that received no points keeps its previous centroid: the
    # mean of an empty selection is NaN, and a NaN centroid would corrupt
    # every distance computed in later iterations.
    new_centroids = np.array([
        X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
        for i in range(k)
    ])

    # Check for convergence: stop once no centroid moved (within tolerance).
    if np.allclose(centroids, new_centroids):
        break

    centroids = new_centroids

step 5:plot

# Data points, coloured by the cluster label assigned in the last iteration.
x_coords, y_coords = X[:, 0], X[:, 1]
plt.scatter(x_coords, y_coords, c=labels)

# Final centroids drawn on top as large red stars.
centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]
plt.scatter(centroid_x, centroid_y, marker='*', s=200, c='red')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('K-means Clustering')
plt.show()

2. implementation using scikit-learn

step 1: import libraries

from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

step 2: generate sample data (could be replaced with real-world data)

# Sample data (replace with your actual data)
# Sample data (replace with your actual data): six 2-D points, one per row.
X = np.array([
    [1, 2],
    [1.5, 1.8],
    [5, 8],
    [8, 8],
    [1, 0.6],
    [9, 11],
])

step 3: Choose the number of clusters (k) — picked by hand for this example

k = 2  # Number of clusters the model is asked to find (example value: 2)

step 4: Create a KMeans object

# Initialize the KMeans model with the specified number of clusters.
# random_state pins the random centroid initialisation so repeated runs give
# the same clusters and the same label numbering.
kmeans = KMeans(n_clusters=k, random_state=42)

step 5: fit the model to the data (X)

# Run K-means on X; the fitted results are stored on the `kmeans` object
# and read back in the following steps.
kmeans.fit(X)  # Train the model on the data

step 6: get the cluster centers

# Get the cluster centers found by the fitted model.
# The result is indexed below as a 2-D array: column 0 is plotted as
# Feature 1 and column 1 as Feature 2.
centroids = kmeans.cluster_centers_ # Coordinates of the cluster centers

step 7: Predict cluster labels for each data point

# Ask the fitted model which cluster each row of X belongs to; `labels`
# is used as the per-point colour in the plot.
labels = kmeans.predict(X)

step 8: visualization of clusters

# Data points, coloured by predicted cluster.
feature_1, feature_2 = X[:, 0], X[:, 1]
plt.scatter(feature_1, feature_2, c=labels, cmap='viridis')

# Cluster centers as bold red crosses on top of the points.
center_x, center_y = centroids[:, 0], centroids[:, 1]
plt.scatter(center_x, center_y, marker='x', s=200, linewidths=3, color='r')

# Title and axis labels, then render the figure.
plt.title('KMeans Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

K-means clustering using a real-world dataset (wine quality dataset)

The dataset for our small implementation can be obtained from
archive.ics.uci.edu/ml/datasets/wine+quality

i) import necessary libraries

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

ii)load the data

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"  # URL of the dataset
try:
    # The file is read with ';' as the column separator.
    wine_data = pd.read_csv(url, sep=";")
except Exception as e:  # Network, HTTP and parsing failures raise different exception types
    # Nothing below can run without the data, so stop here. Raising
    # SystemExit reports the message on stderr with a non-zero exit status;
    # the bare exit() helper is only injected by the `site` module and would
    # report success (status 0) after a failed download.
    raise SystemExit(f"Error loading data from URL: {e}") from e

iii)Select relevant features for clustering (example)

from sklearn.preprocessing import StandardScaler

# Columns used as clustering inputs.
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# K-means groups points by Euclidean distance, so a column measured on a
# large numeric scale would dominate columns measured on a small one.
# Standardise every feature to zero mean and unit variance so each one
# contributes comparably. X stays a DataFrame with the original column
# names and row index.
X = pd.DataFrame(
    StandardScaler().fit_transform(wine_data[features]),
    columns=features,
    index=wine_data.index,
)

iv)Determine the optimal number of clusters using the Elbow method (optional)

# Within-cluster sum of squares (the model's inertia_) for each candidate
# number of clusters from 1 to 10.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    wcss.append(kmeans.fit(X).inertia_)

# Plot WCSS against the number of clusters; the "elbow" is where adding
# more clusters stops reducing WCSS by much.
plt.plot(cluster_counts, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

v)apply KMeans clustering

# Number of clusters for the final model. Defined here so this section does
# not depend on a `k` left over from an earlier example; adjust it to the
# elbow read off the Elbow Method plot.
k = 2

# Fit the final model; init='k-means++' and random_state=42 match the
# settings used for the elbow-method runs.
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
kmeans.fit(X)

vi)Add cluster labels to the DataFrame

# Store the fitted model's per-row cluster assignment in a new 'cluster'
# column so it can be grouped and plotted alongside the other columns.
wine_data['cluster'] = kmeans.labels_

vii)Analyze the clusters (example: calculate the mean of each feature for each cluster)

# Group the rows by their assigned cluster, then average every column
# within each group to get one summary row per cluster.
rows_by_cluster = wine_data.groupby('cluster')
cluster_means = rows_by_cluster.mean()
print(cluster_means)

viii)Visualize the clusters (example: using the first two features)

# Scatter the first two features against each other, one point per row,
# coloured by the cluster stored in the 'cluster' column.
plt.scatter(
    wine_data['fixed acidity'],
    wine_data['volatile acidity'],
    c=wine_data['cluster'],
    cmap='viridis',
)
plt.title('Wine Quality Clusters')
plt.xlabel('Fixed Acidity')
plt.ylabel('Volatile Acidity')
plt.show()

If you found this helpful, please like and share :)
If I made any errors, I would be very grateful for corrections.