R scripts for the lecture course "Introduction to machine learning and
pattern recognition", MPIA Feb./March 2008
  Coryn Bailer-Jones

Lecture 1


########## Linear regression in R (from MASS section 1.3)
########## plus some general introduction to R

library(MASS)
x <- seq(1, 20, 0.5)
x
w <- 1 + x/2
set.seed(30)
y <- x + w*rnorm(x, mean=0, sd=1)       # rnorm(x) is same as rnorm(length(x))
dum <-  data.frame(x, y, w)
dum
rm(x, y, w)

fm <- lm(y ~ x,  data=dum)
#?lm
summary(fm)
attributes(fm)
fm$fitted.values
fitted(fm)
plot(x,y)                          # doesn't work as x,y are not on path
plot(dum$x,dum$y)
attach(dum)                     # put components of dum on path
lines(x,fitted(fm),col=2, lw=2)
fm1 <- lm(y ~ x,  data = dum, weight = 1/w^2)
summary(fm1)
abline(fm1,col=3, lw=2)                            # a simpler way to plot model fits
lrf <-  loess(y ~ x, dum)                            # local polynomial regression
lines(spline(x, fitted(lrf)), col = 4, lw=2)    # loess only predicts at points: spline 
				                                  # interpolation is used to plot 13

par(mfrow=c(1,2))
plot(x,y)
abline(fm,col=2,lw=2) ; abline(fm1, col=3,lw=3) ; lines(spline(x, fitted(lrf)), col=4,lw=2)    
plot(y, resid(fm), col=2, xlab = "y", ylab = "Residuals")
abline(h=0, lty=2, lw=2)
points(y, resid(lrf), col=4, pch=25)

detach()
rm(fm,fm1,lrf,dum)
par(mfrow=c(1,1))


########## Learning, generalization and regularization

# After Bishop ch. 9
x <- seq(0,1,0.05)
h <- function(x){0.5 + 0.4*sin(2*pi*x)}             # define a function
set.seed(10)
h_noisy <- h(x) + rnorm(n=x, sd=0.05)
plot(x, h_noisy, pch=16)
x_temp <- seq(0,1,0.01)
lines(x_temp, h(x_temp), lty=2, lwd=2)

# Predict using locpoly
# With locpoly we cannot define a prediction grid (only number of points spread uniformly)
library(KernSmooth)
kernelwidth <- 0.05
locpoly(x, h_noisy, bandwidth=kernelwidth)
lines(locpoly(x, h_noisy, bandwidth=kernelwidth), col='blue', lwd=2)
# Repeat with bandwith values:
# 0.15  too smooth
# 0.05  overfit

# Predict using ksmooth
library(stats)
# replot training data and true curve
plot(x, h_noisy, pch=16) ; lines(x_temp, h(x_temp), lty=2, lwd=2)
kernelwidth <- 0.05
ksmooth(x, h_noisy, x.points=x, bandwidth=0.2, kernel='normal')
lines(ksmooth(x, h_noisy, x.points=x, bandwidth=kernelwidth, kernel='normal'), col='red', lwd=2)
# Repeat with bandwith values:
# 0.5   too smooth
# 0.1   okay 
# 0.0   overfit

# Define a test set
xtest <- seq(0.025,1,0.05)
set.seed(20)
htest_noisy <- h(xtest) + rnorm(n=xtest, sd=0.05)
# plot and compare with the fits from the training data
points(xtest, htest_noisy, col='blue', pch=17)
# predict at test data points
ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')
resid <- ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')$y - htest_noisy
# define rmse function
rmse <- function(x){ sqrt( (1/length(x))*sum(x^2) ) }
rmse(resid)
# training function error
rmse( ksmooth(x, h_noisy, x.points=x, bandwidth=kernelwidth, kernel='normal')$y - h_noisy )
# test function error
rmse( ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')$y - htest_noisy )

Note that the application of locpoly is using a linear fit over the bandwidth
region about each point, yet appears to produce a nonlinear
function. Actually, it doesn't produce a function at all: it is just doing
point estimations at 401 points equally spread over the x-axis (this value is
fixed by the parameter gridsize). That is, at each of these points it does a
linear fit over the data lying within the box of size +/- bandwidth/2 and this
is the predicted y-value.

# With neural network. Stubbornly refuses to overfit
predict(nnet(x,h_noisy,size=5,decay=0,maxits=1000,linout=TRUE), as.matrix(x))

A better fit would be obtained using a sine function, of course, or at least
something which is periodic. Domain knowledge helps, but only if it is
correct!


########## K-means clustering

library(cluster)
# swiss.x contains 5 measurements of socio-economic indicators in each of 46 swiss 
# provinces from 1888
swiss.x <- as.matrix(swiss[,-1])

#select Nclust cluster centres as being a random selection of Nclust vectors
Nclust <- 3
Imax <- 5
set.seed(100)
init <- sample(dim(swiss.x)[1], Nclust) # this is the initial Nclust prototypes
km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
# plot data using these two columns. This is just a projection so the cluster may not be at
# all obvious, as is the case with j <- 2 ; j <- 5
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=0.75)
points(km$centers[,i], km$centers[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')

# Use the following to produce plots for different Imax
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'initial cluster centres'), cex.main=1.0)
points(swiss.x[init,i], swiss.x[init,j], pch=3, cex=1.4,)
Nclust <- 3
for (Imax in 1:3) {
  set.seed(45)
  init <- sample(dim(swiss.x)[1], Nclust)
  km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
  plot(swiss.x[,i], swiss.x[,j], pch=20)
  title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=1.0)
  points(km$centers[,i], km$centers[,j], pch=3, cex=1.8, col=c('blue', 'green', 'red'))
  sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
  sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
  sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')
}

# Now repeat, using different initial cluster centres. Using seeds 100 and 45 we have quite 
# different starting conditions, yet converge quickly to the same solution. The final cluster 
# labels (colours) are, of course, arbitrary