# Day 18

Tuesday, April 11, 2017

# SVM quadratic programming problem: review

Putting our problem in the format used by cvxopt

# Unsupervised learning: method of k means

Attempt to identify intrinsic clusters in the training data, without anyone giving us information about which class each point lies in.

## Implement k means

Array shapes: feature array: n by d; means array: k by d; displacements from means: n by d by k; distances from means: n by k; assignments: n

# mykmeans.py
# April 11, 2017

import matplotlib.pyplot as pl
from time import sleep
import os
from numpy import *

# folder for per-iteration snapshot images written by doplot()
imagefolder = 'kmeans_images'
os.makedirs(imagefolder, exist_ok=True)  # exist_ok: no check-then-create race

# First cook up some clustered data to try it on

d = 2            # feature space dimension
k = 5            # number of clusters
npercluster = 25 # points per cluster
n = k*npercluster
r = .05          # half-width of each square cluster

# generate k random cluster centers in the unit square whose pairwise
# sup-norm distance is at least 2r, so the square clusters cannot overlap
# (rejection sampling: keep drawing until a trial center is far enough
# from every accepted center)
centers = [random.rand(d)]
while len(centers) < k:
    trialcenter = random.rand(d)
    farenough = True  # optimistic!
    for center in centers:
        if linalg.norm(trialcenter - center, inf) < 2*r:
            farenough = False
            break
    if farenough:
        centers.append(trialcenter)
centers = array(centers)
print(centers)

# n-by-d feature array: npercluster points drawn uniformly from the
# square of half-width r around each center
F = empty((n, d))
for i in range(k):
    # create a cluster
    start = i*npercluster
    stop = (i+1)*npercluster
    F[start:stop, :] = centers[i] + r*(2*random.rand(npercluster, d) - 1)

#############################################################################################

# Now try to recover clusters

def doplot():
    """Plot the data colored by current cluster assignment and save a frame.

    Each cluster's points are drawn in its own color, with a large
    translucent star (plus a black dot) marking the cluster mean.  The
    frame is written to imagefolder as a zero-padded PNG named by the
    iteration counter.

    Reads module globals: F, means, assignments, k, r, count, imagefolder.
    """
    pl.clf()
    pl.subplot(111, aspect=1)
    colors = 'rgbmc'  # one color per cluster; assumes k <= 5
    for i in range(k):
        cluster = assignments == i  # boolean mask of points assigned to cluster i
        pl.plot(F[cluster, 0], F[cluster, 1], 'o', color=colors[i], alpha=0.75)
        pl.plot(means[i][0], means[i][1], '*', color=colors[i], markersize=50, alpha=0.4)
        pl.plot(means[i][0], means[i][1], '.', color='k')
    # pad the axes by r so clusters hugging the unit square's edge stay visible
    pl.xlim(-r, 1+r)
    pl.ylim(-r, 1+r)
    # count was incremented before the assignment step, so iteration is count-1
    pl.title('iteration '+str(count-1))
    pl.savefig(imagefolder+'/'+str(count).zfill(3)+'.png')

# initial choice of means: sample k distinct training points.
# (initializing with purely random points in the unit square could
# leave a mean with *no* associated points, so draw from the data)
chosen = random.choice(range(n), k, replace=False)
means = F[chosen]

pl.subplot(111, aspect=1)

# Here is the actual k means algorithm:

# sentinel assignment vector: cluster index k never occurs, so the first
# convergence comparison cannot spuriously succeed
oldassignments = k*ones(n, dtype=int)
count = 0
maxcount = 10  # safety cap on the number of iterations
while True:
    count += 1
    if count > maxcount:
        break
    # assignment step: give each point to its nearest mean.
    # broadcasting (n,d,1) - (d,k) yields the (n,d,k) displacement array
    # without a Python loop over k
    displacements = F[:, :, newaxis] - means.T
    sqdistances = (displacements**2).sum(axis=1)  # (n,k) squared distances
    assignments = argmin(sqdistances, axis=1)
    doplot()
    print(assignments)
    # converged when no point changed cluster
    if all(assignments == oldassignments):
        break
    oldassignments[:] = assignments
    # update step: move each mean to the centroid of its cluster
    for i in range(k):
        members = F[assignments == i]
        if len(members):  # guard: an emptied cluster keeps its old mean (avoids NaN)
            means[i] = members.mean(axis=0)


After class, I figured out how to use numpy.newaxis to avoid looping over k in the creation of "displacements" (as seen above).