Day 19

Thursday, April 13, 2017

Unsupervised learning: method of k means, cont'd

The code from last time, extended to save an image of each step of the iteration: mykmeans.py

# mykmeans.py
# April 11, 2017

import matplotlib.pyplot as pl
from time import sleep
import os
from numpy import *

imagefolder = 'kmeans_images'
if not os.path.exists(imagefolder): os.makedirs(imagefolder)

# First cook up some clustered data to try it on

d = 2  # feature space dimension
k = 5  # number of clusters
npercluster = 25
n = k*npercluster
r = .05

# generate k random cluster centers in the unit square that are at least 2r apart (max-norm)
centers = [random.rand(d)]
while len(centers)<k:
    trialcenter = random.rand(d)
    farenough = True  # optimistic!
    for center in centers:
        if linalg.norm(trialcenter-center,inf) < 2*r:
            farenough = False
            break
    if farenough: centers.append(trialcenter)
centers = array(centers)
print(centers)

F = empty((n,d))
for i in range(k):
    # create a cluster
    start =     i*npercluster
    stop  = (i+1)*npercluster
    F[start:stop,:] = centers[i] + r*(2*random.rand(npercluster,d)-1)
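
# Added in these notes (not part of the original class script): a quick look at
# the raw synthetic data before any clustering is attempted.
pl.figure()
pl.subplot(111,aspect=1)
pl.plot(F[:,0],F[:,1],'k.')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
pl.title('raw data')
pl.savefig(imagefolder+'/raw_data.png')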

#############################################################################################

# Now try to recover clusters

def doplot():
    pl.clf()
    pl.subplot(111,aspect=1)
    colors = 'rgbmc'
    for i in range(k):
        cluster = assignments==i
        pl.plot(F[cluster,0],F[cluster,1],'o',color=colors[i],alpha=0.75);
        pl.plot(means[i][0],means[i][1],'*',color=colors[i],markersize=50,alpha=0.4)
        pl.plot(means[i][0],means[i][1],'.',color='k')
    pl.xlim(-r,1+r); pl.ylim(-r,1+r)
    pl.title('iteration '+str(count-1))
    pl.savefig(imagefolder+'/'+str(count).zfill(3)+'.png')

# initial choice of means
#means = random.rand(k,d) # we saw we can get a mean with *no* associated points
means = F[random.choice( range(n), k, replace=False )]


pl.subplot(111,aspect=1)

# Here is the actual k means algorithm:

oldassignments = k*ones(n,dtype=int)   # k is never a valid cluster label, so the first comparison below cannot match
count = 0
maxcount = 10
while(True):
    count += 1
    if count>maxcount: break
    # compute the cluster assignments
    displacements = F[:,:,newaxis] - means.T   # broadcast to a 3D array of shape (n,d,k)  (done after class)
    sqdistances = (displacements**2).sum(axis=1)   # shape (n,k): squared distance from each point to each mean
    assignments = argmin( sqdistances, axis=1 )    # index of the nearest mean for each point
    doplot()
    print(assignments)
    if all( assignments == oldassignments ): break
    oldassignments[:] = assignments
    # update the means as the centroids of the clusters
    for i in range(k):
        means[i] = F[assignments==i].mean(axis=0)
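
The broadcast trick above computes all n-by-k squared distances at once. For reference, here is an equivalent explicit-loop version (a sketch added in these notes, slower but easier to read):

# explicit-loop equivalent of the broadcasted distance computation (sketch)
sqdistances_loop = empty((n,k))
for j in range(n):
    for i in range(k):
        sqdistances_loop[j,i] = ((F[j]-means[i])**2).sum()
assignments_loop = argmin(sqdistances_loop, axis=1)   # same result as assignments above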

[Images: all_kmeans_images_2.png, all_kmeans_images_3.png, 20170413_105647.jpg]
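
Because every iteration is saved as a numbered PNG in kmeans_images, the frames can be stitched together into an animation. A minimal sketch using the imageio package (an extra dependency, not used in class):

# combine the saved frames into an animated GIF (sketch)
import os
import imageio
frames = sorted(f for f in os.listdir('kmeans_images') if f.endswith('.png'))
images = [imageio.imread('kmeans_images/'+f) for f in frames]
imageio.mimsave('kmeans_iterations.gif', images, duration=0.8)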

Other clustering algorithms

Scikit-learn package

[Image: sklearn_clustering_overview.png, overview of scikit-learn's clustering algorithms]
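
Scikit-learn provides k-means and many other clustering algorithms behind a common fit/predict interface. A short sketch applying it to the data F generated above (assumes scikit-learn is installed):

# k-means via scikit-learn on the same data F (sketch)
from sklearn.cluster import KMeans, DBSCAN

km = KMeans(n_clusters=k).fit(F)
print(km.labels_)            # cluster assignment of each point
print(km.cluster_centers_)   # the k recovered means

# other algorithms share the interface, e.g. density-based DBSCAN
# (the eps value here is just a guess tied to the cluster radius r)
db_labels = DBSCAN(eps=2*r).fit_predict(F)
print(db_labels)             # label -1 marks points DBSCAN treats as noise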

Deep learning (convolutional neural networks)

Let us install Google TensorFlow
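
TensorFlow installs with pip (pip install tensorflow, or pip install tensorflow-gpu for the GPU build). A minimal sanity check, written against the TensorFlow 1.x API that was current at the time:

# minimal TensorFlow 1.x sanity check (sketch)
import tensorflow as tf
print(tf.__version__)
hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))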

Handwriting project

Assistance