#nbi:hide_in
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import interact
from numpy.linalg import norm
from scipy.sparse import linalg
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons, make_circles, make_blobs
from IPython.display import display, Markdown, Latex
#nbi:hide_in
def spectral_clustering(A, K, seed):
    # Symmetrically normalize the affinity matrix:
    # L = D^(-1/2) A D^(-1/2), where D is the diagonal degree matrix
    D = np.zeros(A.shape)
    w = np.sum(A, axis=0)
    D.flat[::len(w) + 1] = w ** (-0.5)  # fill the diagonal with d_i^(-1/2)
    L = D.dot(A).dot(D)

    # Eigenvectors corresponding to the K largest eigenvalues of L
    e_val, e_vect = linalg.eigs(L, K)
    X = e_vect.real

    # Renormalize each row of the embedding to unit length
    X_norm = norm(X, axis=1, ord=2)
    Y = (X.T / X_norm).T

    # Cluster the rows of Y with k-means; row i is the embedding of point i
    kmeans = KMeans(n_clusters=K, random_state=seed)
    return kmeans.fit(Y).labels_
#nbi:hide_in
def gen_aff(X):
    # Locally scaled Gaussian affinity matrix:
    # A[i, j] = exp(-||x_i - x_j||^2 / (2 * sig_i * sig_j)),
    # where sig_i is the mean distance from x_i to its nearest neighbours
    N = X.shape[0]
    ans = np.zeros((N, N))
    sig = []
    for i in range(N):
        dists = sorted(norm(X[i] - X[j]) for j in range(N))
        # dists[0] is the zero self-distance, so this averages the
        # distances to x_i's four nearest neighbours (plus the self term)
        sig.append(np.mean(dists[:5]))

    for i in range(N):
        for j in range(N):
            dist = norm(X[i] - X[j]) ** 2
            ans[i][j] = np.exp(-dist / (2 * sig[i] * sig[j]))
    return ans
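
#nbi:hide_in
# Quick sanity check of the two helpers above (a minimal sketch; the
# two-moons data and the choice of K=2 here are illustrative, not part
# of the interactive demo below)
X_demo, _ = make_moons(n_samples=150, noise=0.1, random_state=0)
A_demo = gen_aff(X_demo)
labels_demo = spectral_clustering(A_demo, 2, seed=1111)

fig, ax = plt.subplots(figsize=(9, 7))
ax.scatter(X_demo[:, 0], X_demo[:, 1], c=labels_demo, cmap='Spectral')
ax.set_title("Spectral clustering on two interleaving half circles")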
#nbi:hide_in
data = "# Spectral Clustering \n -------------------------------------- \n ### Atishay Jain, Ritik Dutta, Shreyas Singh, Varun Gohil \n ---------------------------------------------------" 
display(Markdown(data))

#nbi:hide_in
def spectral_cluster(k, data_type, n_samples, centers, seed, noise):
    # Generate the requested toy dataset
    if data_type == "Blob":
        X_mn, y_mn = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1.0, random_state=seed)
    elif data_type == "Moon":
        X_mn, y_mn = make_moons(n_samples=n_samples, noise=noise,
                                random_state=seed)
    elif data_type == "Circle":
        # Two pairs of concentric circles; the second pair is scaled up 4x
        X_mn1, y_mn1 = make_circles(n_samples=n_samples, noise=noise,
                                    random_state=seed)
        X_mn2, y_mn2 = make_circles(n_samples=n_samples, noise=noise / 2,
                                    random_state=seed)
        X_mn = np.concatenate((X_mn1, X_mn2 * 4), axis=0)
        y_mn = np.append(y_mn1, y_mn2)
    elif data_type == "Random":
        X_mn = np.random.RandomState(seed).rand(n_samples, 2)

    # Cluster with both methods and plot them side by side
    A = gen_aff(X_mn)
    labels = spectral_clustering(A, k, seed=1111)
    kmeans = KMeans(n_clusters=k, random_state=seed).fit(X_mn)

    cmap = 'Spectral'
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
    ax[0].scatter(X_mn[:, 0], X_mn[:, 1], c=labels, cmap=cmap)
    ax[0].set(xlabel="X1", ylabel="X2")
    ax[0].set_title("Spectral Clustering")
    ax[1].scatter(X_mn[:, 0], X_mn[:, 1], c=kmeans.labels_, cmap=cmap)
    ax[1].set(xlabel="X1", ylabel="X2")
    ax[1].set_title("KMeans Clustering")

interact(spectral_cluster, k=(1, 10, 1), data_type=["Blob", "Moon", "Circle", "Random"], n_samples=(10, 200, 1), centers=(1, 10, 1), noise=(0, 0.1, 0.01), seed=(0, 500, 1));
#nbi:hide_in
display(Markdown("## Parameters for the Demo\n ------------------------------------ \n"))
display(Markdown("**K**: Choose the number of clusters to be made.\n"))
display(Markdown("**Data_Type**: Choose the structure of the data being clustered. You can choose from: \n - *Blob*: isotropic Gaussian blobs \n - *Moons*: two interleaving half circles \n - *Circle*: two concentric circles \n - *Random*: Random distribution of points \n"))
display(Markdown("**N_samples**: Choose the number of samples being clustered\n"))
display(Markdown("**Seed**: change the random state of the data being clustered.\n"))
display(Markdown("**Noise**: change the level of noise being introduced into the data being clustered\n"))

#nbi:hide_in
display(Markdown("## Why Spectral Clustering? \n -------------------------------"))
display(Markdown("Clustering is one of the most widely used techniques for exploratory data analysis. While k-means is popular due to its simplicity, it fails to give the expected clusters in cases where the aggregate shape of the points defines better clusters than naively checking for the nearest neighbors. Spectral clustering is a more efficient algorithm in these cases. The basic idea behind spectral clustering is to reduce the dimensionality of the data by using the most important eigenvectors, and then applying k-means on this transformed feature space.\n"))


#nbi:hide_in
display(Markdown("## The Algorithm \n -------------------------------"))
display(Markdown("The stages of the algorithm are, assuming k clusters are present: \n"))
display(Markdown(" - Construct a similarity graph \n"))
display(Markdown(" - Choose the eigenvectors corresponding to the lowest-k eigen values. \n"))
display(Markdown(" -  Embed the data points in low dimensional space (spectral embedding) in which the clusters are more obvious with the use of eigenvectors of the graph Laplacian. \n"))
display(Markdown(" -  Apply k-means on the transformed data, and return the labels obtained for the original data points. \n"))

#nbi:hide_in
display(Markdown("### Github Repository : https://github.com/Varun1299/Spectral-Clustering" ))