In [28]:
# code from last time

import numpy as np

def softmax(S):
    #print( S.max(), S.min())
    expS = np.exp(S)
    return expS/expS.sum(axis=0)

def loss(WT, X, y):
    
    n = X.shape[1]
    
    # forward pass for the loss itself
    S = WT @ X
    P = softmax(S)
    Py = P[ y , np.arange(n) ] # need entry y[i] of each column i of P
    l = -np.log(Py)
    L = l.sum()

    # back-propagation of the gradient
    
    dLdl = np.ones_like( l )

    dLdP = np.zeros_like( P )
    dLdP[ y , np.arange(n) ] = -1/Py

    # softmax Jacobian per column: dP[k]/dS[j] = P[k]*((k==j) - P[j]),
    # so dL/dS = dLdP*P - P*sum_k( dLdP[k]*P[k] )   (c = number of classes, a global set below)
    dLdS = np.zeros_like( S )
    for k in range( c ):
        dLdS += dLdP[k]*(-P[k]*P)
    dLdS += dLdP*P

    dLdWT = dLdS @ X.T
    
    return L, dLdWT  # loss itself and its gradient

from PIL import Image
import glob

def charclass(png):     # extract character class name from file name
    return png.split('__')[1][:-4]   # the part after '__', with the '.png' stripped

pngs = sorted( glob.glob('handwriting_f19/pngs/*.png') )
h,w = np.array(Image.open(pngs[0]))[:,:,0].shape #125,100
selection = sorted( {charclass(png) for png in pngs} )

# Load flattened images as columns of big array X
X = np.empty((h*w,len(pngs)))
for i,png in enumerate(pngs):
    X[:,i] = 255 - np.array(Image.open(png))[:,:,0].reshape(h*w)
X /= X.sum()  # to avoid having huge numbers in our network

# Get the true class numbers of all the images
y = [selection.index(charclass(png)) for png in pngs]  # true classes of images

n = len(pngs)
c = len(selection)
hw = h*w

# Suppose we have no idea what the weights should be, and start at all zeros:
WT = np.zeros( (c, hw) ) 

# get loss and its gradient
L,gradL = loss( WT, X, y )

# change the weights by a multiple "stepsize" of the negative gradient
stepsize = 1000000 # 100   # 1
WT -= stepsize*gradL
L,gradL = loss( WT, X, y )
L

import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,1))
for j in range(c):
    plt.subplot(1,c,j+1)
    plt.imshow( WT[j].reshape(h,w), cmap='seismic' )
    plt.axis('off')
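
A quick finite-difference sanity check of the gradient returned by loss (a sketch: eps and the entries tested are arbitrary choices):

In [ ]:
# compare a few entries of the analytic gradient with centered finite differences
eps = 1e-4
_, gradL = loss( WT, X, y )
for j, i in [(0, hw//2), (1, hw//2 + 1), (2, hw//2 + 2)]:
    WTp = WT.copy(); WTp[j,i] += eps
    WTm = WT.copy(); WTm[j,i] -= eps
    Lp, _ = loss( WTp, X, y )
    Lm, _ = loss( WTm, X, y )
    print( gradL[j,i], (Lp - Lm)/(2*eps) )   # the two columns should agree to several digits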
In [29]:
# change the weights by a multiple "stepsize" of the negative gradient
stepsize = 10000000 # 100   # 1
WT = np.zeros( (c, hw) ) 
for k in range(1000):
    L,gradL = loss( WT, X, y )
    if k==0: print(L,'initially')
    print(L,'\r',end='')
    WT -= stepsize*gradL
L,gradL = loss( WT, X, y )
print(L)
plt.figure(figsize=(10,1))
for j in range(c):
    plt.subplot(1,c,j+1)
    plt.imshow( WT[j].reshape(h,w), cmap='seismic' )
    plt.axis('off')
3486.1138307929855 initially
211.6370183393639  
In [30]:
from os.path import join
def loadimages(folder):
    
    pngs = sorted( glob.glob( join(folder,'*.png') ) )
    h,w = np.array(Image.open(pngs[0]))[:,:,0].shape #125,100
    selection = sorted( {charclass(png) for png in pngs} )

    # Load flattened images as columns of big array X
    X = np.empty((h*w,len(pngs)))
    for i,png in enumerate(pngs):
        X[:,i] = 255 - np.array(Image.open(png))[:,:,0].reshape(h*w)
    #X /= X.sum()  # this should be done outside

    # Get the true class numbers of all the images
    y = [selection.index(charclass(png)) for png in pngs]  # true classes of images
    return X,y,(h,w)
In [31]:
Xtrain,ytrain,(_,_) = loadimages('handwriting_f19/pngs')
Xnorm = Xtrain.sum()
Xtrain = Xtrain/Xnorm
Xtest,ytest,(_,_) = loadimages('handwriting_f19_test/testpngs')
Xtest = Xtest/Xnorm  # should normalize Xtest with the same factor as for Xtrain !!!!

stepsize = 1e7 # 100   # 1
WT = np.zeros( (c, hw) ) 
Ltrainlist, Ltestlist = [],[]
nsteps = 50
for k in range(nsteps):
    L,gradL = loss( WT, Xtrain, ytrain )
    if k==0: print(L,'initially')
    print(k,'of',nsteps,L,' '*40,'\r',end='')
    Ltest,_ = loss( WT, Xtest, ytest )
    Ltrainlist.append(L)
    Ltestlist.append(Ltest)
    
    WT -= stepsize*gradL
L,gradL = loss( WT, Xtrain, ytrain )   # final training loss
print(L)

plt.figure()
npts = len(Ltrainlist)
plt.semilogy( range(npts), Ltrainlist, label='train' )
plt.semilogy( range(npts), Ltestlist , label='test' )

plt.figure(figsize=(10,1))
for j in range(c):
    plt.subplot(1,c,j+1)
    plt.imshow( WT[j].reshape(h,w), cmap='seismic' )
    plt.axis('off')
3486.1138307929855 initially
905.7839843944075973833626                                             

Add classification to the loss function

In [32]:
def loss(WT, X, y):
    
    n = X.shape[1]
    
    # forward pass for the loss itself
    S = WT @ X
    P = softmax(S)
    ypredicted = np.argmax(P,axis=0)
    
    Py = P[ y , np.arange(n) ] # need entry y[i] of each column i of P
    l = -np.log(Py)
    L = l.sum()

    # back-propagation of the gradient
    
    dLdl = np.ones_like( l )

    dLdP = np.zeros_like( P )
    dLdP[ y , np.arange(n) ] = -1/Py

    dLdS = np.zeros_like( S )
    for k in range( c ):
        dLdS += dLdP[k]*(-P[k]*P)
    dLdS += dLdP*P

    dLdWT = dLdS @ X.T
    
    return L, dLdWT, ypredicted  # loss, its gradient, and the predicted classes
In [35]:
Xtrain,ytrain,(_,_) = loadimages('handwriting_f19/pngs')
Xnorm = Xtrain.sum()
Xtrain = Xtrain/Xnorm
Xtest,ytest,(_,_) = loadimages('handwriting_f19_test/testpngs')
Xtest = Xtest/Xnorm  # should normalize Xtest with the same factor as for Xtrain !!!!

WT = np.zeros( (c, hw) ) 

Ltrainlist, Ltestlist,trainaccuracylist,testaccuracylist = [],[],[],[]

schedule = [(100,1e7),(100,1e8),(100,2e8),(100,4e8),(400,1e9),(100,2e9)]  # a way of setting a schedule for learning rate

for nsteps, stepsize in schedule:
    print()
    for k in range(nsteps):
        L,gradL,ypredicted = loss( WT, Xtrain, ytrain )
        accuracy = 100*(ytrain==ypredicted).sum()/len(ytrain)
        trainaccuracylist.append(accuracy)
        #if k==0: print(L,'initially')
        Ltest,_,ypredicted = loss( WT, Xtest, ytest )
        accuracy = 100*(ytest==ypredicted).sum()/len(ytest)
        testaccuracylist.append(accuracy)
        Ltrainlist.append(L)
        Ltestlist.append(Ltest)
        # L is the training loss; 'accuracy' was last computed on the test set just above
        print('{:>5} of {:>5} steps: loss = {:>10.3f}, accuracy = {:>10.3}%\r'.format(k,nsteps,L,accuracy),end='')
        WT -= stepsize*gradL
L,gradL,_ = loss( WT, Xtrain, ytrain )   # final training loss
print()
print(L)

plt.figure()
npts = len(Ltrainlist)
plt.semilogy( range(npts), Ltrainlist, label='train' )
plt.semilogy( range(npts), Ltestlist , label='test' )
plt.legend()
plt.title('loss')

plt.figure()
plt.semilogy( range(npts), 100-np.array(trainaccuracylist), label='train')
plt.semilogy( range(npts), 100-np.array(testaccuracylist) , label='test' )
plt.title('percent inaccuracy')
plt.legend()

plt.figure(figsize=(15,8))
for j in range(c):
    plt.subplot(2,c//2,j+1)
    plt.imshow( WT[j].reshape(h,w), cmap='seismic' )
    plt.axis('off')
    plt.title(selection[j])
   99 of   100 steps: loss =    631.072, accuracy =       91.9%
   99 of   100 steps: loss =    202.084, accuracy =       94.6%
   99 of   100 steps: loss =    111.954, accuracy =       94.6%
   99 of   100 steps: loss =     67.119, accuracy =       94.5%
  399 of   400 steps: loss =     24.398, accuracy =       94.1%
   99 of   100 steps: loss =     21.349, accuracy =       93.7%
21.32676530784488
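
Note that the accuracy printed during training is the test-set accuracy: it reaches about 94.6% and then slips back to about 93.7% while the training loss keeps falling, a mild sign of overfitting that the percent-inaccuracy plot makes visible.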

Compare with the class averages of the training set

In [34]:
plt.figure(figsize=(15,8))
ytrain = np.array(ytrain)
for j in range(c):
    plt.subplot(2,c//2,j+1)
    plt.imshow( Xtrain[:,ytrain==j].mean(axis=1).reshape(h,w), cmap='seismic' )
    plt.axis('off')
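
A rough way to quantify the comparison (cosine similarity is an arbitrary choice of measure): compute the similarity between each learned weight row and the corresponding class-average image.

In [ ]:
# cosine similarity between each row of WT and the mean training image of that class
means = np.stack( [ Xtrain[:, ytrain==j].mean(axis=1) for j in range(c) ] )   # class averages, shape (c, h*w)
cosines = (WT*means).sum(axis=1) / ( np.linalg.norm(WT, axis=1)*np.linalg.norm(means, axis=1) )
for j in range(c):
    print( selection[j], round(cosines[j], 3) )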

Let's replicate the above with TensorFlow

In [32]:
import tensorflow
tensorflow.__version__
Out[32]:
'2.0.0'
In [ ]:
def preparedata( folder ): ############## needs some mods
    
    pngs = sorted( glob.glob( join(folder,'*.png') ) )
    h,w = np.array(Image.open(pngs[0]))[:,:,0].shape #125,100
    selection = sorted( {charclass(png) for png in pngs} )

    # Load flattened images as columns of big array X
    X = np.empty((h*w,len(pngs)))
    for i,png in enumerate(pngs):
        X[:,i] = 255 - np.array(Image.open(png))[:,:,0].reshape(h*w)
    X /= X.sum()  # to avoid having huge numbers in our network

    # Get the true class numbers of all the images
    y = [selection.index(charclass(png)) for png in pngs]  # true classes of images
    return X,y,(h,w)
In [ ]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense   # fully connected layer 

model = Sequential()
model.add( Dense( 10, input_dim=125*100, activation='softmax' ) )    # 10 is the number of nodes i.e. the # outputs

from tensorflow.keras.optimizers import SGD
sgd = SGD( learning_rate = 1  )
model.compile( optimizer=sgd, loss='categorical_crossentropy' )

# load the data and generate the 1-hot encoding of the label data
from tensorflow.keras.utils import to_categorical
X, y, (h, w) = preparedata( 'handwriting_f19/pngs' )
Y = to_categorical( y )          # one row per image, with a 1 in the true class column

model.fit( X.T, Y, verbose=2 )   # Keras expects one sample per row, hence the transpose
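
Once the model has been fit, a sketch of checking it on the test set (the folder names and the reuse of the training normalization factor mirror the NumPy version above):

In [ ]:
# evaluate on the test set, normalizing with the training-set factor as emphasized earlier
Xtrain_raw, _, _ = loadimages('handwriting_f19/pngs')          # reloaded only to recover the normalization factor
Xtest, ytest, _  = loadimages('handwriting_f19_test/testpngs')
Xtest = Xtest/Xtrain_raw.sum()

Ptest = model.predict( Xtest.T )                 # one row of class probabilities per test image
ypredicted = np.argmax( Ptest, axis=1 )
accuracy = 100*(np.array(ytest)==ypredicted).sum()/len(ytest)
print( 'test accuracy: {:.1f}%'.format(accuracy) )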