# Day 19

Thursday, April 13, 2017

# Unsupervised learning: method of k means, cont'd

Code from last time extended to record steps of iteration: mykmeans.py

# mykmeans.py
# April 11, 2017

import matplotlib.pyplot as pl
from time import sleep
import os
from numpy import *

# Folder where each iteration's scatter plot is saved as a numbered PNG.
imagefolder = 'kmeans_images'
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(imagefolder, exist_ok=True)

# First cook up some clustered data to try it on

d = 2               # feature space dimension
k = 5               # number of clusters
npercluster = 25    # points drawn around each center
n = k*npercluster   # total number of data points
r = .05             # half-width of the square of points around each center

# Pick k cluster centers in the unit square whose pairwise sup-norm distance
# is at least 2r, so the 2r x 2r squares of points cannot overlap.
# Rejection sampling: keep drawing candidates until k have been accepted.
centers = [random.rand(d)]
while len(centers) < k:
    trialcenter = random.rand(d)
    # accept only if the candidate is far enough from every accepted center
    farenough = all(linalg.norm(trialcenter - center, inf) >= 2*r
                    for center in centers)
    if farenough:
        centers.append(trialcenter)
centers = array(centers)
print(centers)

# Feature matrix: npercluster points uniform in a 2r x 2r square centered on
# each cluster center, stored contiguously cluster by cluster.
F = empty((n, d))
for i in range(k):
    # create a cluster
    start = i*npercluster
    stop = (i+1)*npercluster
    # 2*rand - 1 is uniform in [-1, 1), scaled by r and shifted to the center
    F[start:stop, :] = centers[i] + r*(2*random.rand(npercluster, d) - 1)

#############################################################################################

# Now try to recover clusters

def doplot():
    """Plot the current clustering state and save it as a numbered PNG.

    Reads module-level state: F, assignments, means, k, r, count, and
    imagefolder.  Each cluster's points are drawn in their own color, with
    a large translucent star (and a small black dot) at the cluster mean.
    Saves to imagefolder/NNN.png where NNN is the zero-padded iteration
    counter, so the frames sort in order.
    """
    pl.clf()
    pl.subplot(111, aspect=1)   # square axes so distances are not distorted
    colors = 'rgbmc'            # one color per cluster; assumes k <= 5
    for i in range(k):
        cluster = assignments == i   # boolean mask of the points assigned to cluster i
        pl.plot(F[cluster, 0], F[cluster, 1], 'o', color=colors[i], alpha=0.75)
        pl.plot(means[i][0], means[i][1], '*', color=colors[i], markersize=50, alpha=0.4)
        pl.plot(means[i][0], means[i][1], '.', color='k')   # dot pinpoints the exact mean
    pl.xlim(-r, 1+r)
    pl.ylim(-r, 1+r)
    # count is incremented before assignments are computed, so the human-readable
    # iteration number is count-1 while the filename uses count directly.
    pl.title('iteration ' + str(count-1))
    pl.savefig(imagefolder + '/' + str(count).zfill(3) + '.png')

# initial choice of means
#means = random.rand(k,d) # we saw we can get a mean with *no* associated points
# Seeding the means with k distinct data points guarantees each mean starts
# with at least one point (itself) nearby.
means = F[random.choice(range(n), k, replace=False)]

pl.subplot(111, aspect=1)

# Here is the actual k means algorithm:

# k is not a valid cluster label (labels are 0..k-1), so the first
# convergence test can never spuriously succeed.
oldassignments = k*ones(n, dtype=int)
count = 0
maxcount = 10   # safety cap in case the iteration fails to converge
while True:
    count += 1
    if count > maxcount:
        break
    # compute the cluster assignments: for each point, the index of the
    # nearest mean.  Broadcasting builds an (n, d, k) array of displacements
    # from every point to every mean.  (done after class)
    displacements = F[:, :, newaxis] - means.T
    sqdistances = (displacements**2).sum(axis=1)   # (n, k) squared distances
    assignments = argmin(sqdistances, axis=1)
    doplot()
    print(assignments)
    if all(assignments == oldassignments):
        break   # converged: the assignments stopped changing
    oldassignments[:] = assignments
    # update the means as the centroids of the clusters
    for i in range(k):
        means[i] = F[assignments == i].mean(axis=0)


# Other clustering algorithms

Scikit-learn package