# Method of k means¶

## First cook up some clustered data to try it on¶

In [20]:
d = 2  # feature space dimension
k = 3  # number of clusters
npercluster = 15
n = k*npercluster
r = .1
# generate random points in unit square
from numpy import *
centers = random.rand(k,d)
print(centers)

[[ 0.67131427  0.34281636]
[ 0.08652525  0.83616029]
[ 0.30721968  0.98870376]]

In [21]:
F = empty((n,d))
for i in range(k):
# create a cluster
start =     i*npercluster
stop  = (i+1)*npercluster
F[start:stop,:] = centers[i] + r*(2*random.rand(npercluster,d)-1)

In [22]:
%matplotlib inline
import matplotlib.pyplot as pl

In [23]:
pl.subplot(111,aspect=1)
colors = 'rgb'
for i in range(k):
start =     i*npercluster
stop  = (i+1)*npercluster
pl.plot(F[start:stop,0],F[start:stop,1],'o',color=colors[i]);


## Forget that you know which cluster each point belongs to¶

Implement k means

In [24]:
# initial choice of means
#means = random.rand(k,d)
means = F[random.choice( range(n), k, replace=False )]
#print(means)

oldassignments = k*ones(n,dtype=int)
count = 0
maxcount = 10
while(True):
count += 1
if count>maxcount: break
# compute the cluster assignments
displacements = empty((n,d,k))
for i in range(k):
displacements[:,:,i] = F - means[i]
sqdistances = (displacements**2).sum(axis=1)
assignments = argmin( sqdistances, axis=1 )
print(assignments)
if all( assignments == oldassignments ): break
oldassignments[:] = assignments

# update the means as the centroids of the clusters

for i in range(k):
means[i] = F[assignments==i].mean(axis=0)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0
2 0 1 1 2 1 1 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
2 2 2 2 2 2 1 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2]

In [25]:
pl.subplot(111,aspect=1)
colors = 'rgb'
for i in range(k):
cluster = assignments==i
pl.plot(F[cluster,0],F[cluster,1],'o',color=colors[i],alpha=0.75);
pl.plot(means[i][0],means[i][1],'*',color=colors[i],markersize=10,alpha=0.75)

In [5]:
import matplotlib.pyplot as pl
from time import sleep

def doplot():
pl.clf()
pl.subplot(111,aspect=1)
colors = 'rgb'
for i in range(k):
cluster = assignments==i
pl.plot(F[cluster,0],F[cluster,1],'o',color=colors[i],alpha=0.75);
pl.plot(means[i][0],means[i][1],'*',color=colors[i],markersize=10,alpha=0.75)
pl.draw()
# initial choice of means
#means = random.rand(k,d)
means = F[random.choice( range(n), k, replace=False )]
#print(means)

pl.ion()

oldassignments = k*ones(n,dtype=int)
count = 0
maxcount = 10
while(True):
count += 1
if count>maxcount: break
# compute the cluster assignments
displacements = empty((n,d,k))
for i in range(k):
displacements[:,:,i] = F - means[i]
sqdistances = (displacements**2).sum(axis=1)
assignments = argmin( sqdistances, axis=1 )
pl.pause(1.e-6)
doplot()
sleep(3)
print(assignments)
if all( assignments == oldassignments ): break
oldassignments[:] = assignments

# update the means as the centroids of the clusters

for i in range(k):
means[i] = F[assignments==i].mean(axis=0)
pl.ioff()
pl.show()

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2]

In [ ]: