R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007


Unsupervised learning
---------------------

Data

library(cluster)
# swiss.x contains 5 measurements of socio-economic indicators in each of 46 swiss 
# provinces from 1888
swiss.x <- as.matrix(swiss[,-1])


K-means clustering

#select Nclust cluster centres as being a random selection of Nclust vectors
Nclust <- 3
Imax <- 5
set.seed(100)
init <- sample(dim(swiss.x)[1], Nclust) # this is the initial Nclust prototypes
km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
# plot data using these two columns. This is just a projection so the cluster may not be at
# all obvious, as is the case with j <- 2 ; j <- 5
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=0.75)
points(km$centers[,i], km$centers[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')

# Use the following to produce plots for different Imax
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'initial cluster centres'), cex.main=1.0)
points(swiss.x[init,i], swiss.x[init,j], pch=3, cex=1.4,)
Nclust <- 3
for (Imax in 1:3) {
  set.seed(45)
  init <- sample(dim(swiss.x)[1], Nclust)
  km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
  plot(swiss.x[,i], swiss.x[,j], pch=20)
  title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=1.0)
  points(km$centers[,i], km$centers[,j], pch=3, cex=1.8, col=c('blue', 'green', 'red'))
  sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
  sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
  sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')
}

# Now repeat, using different initial cluster centres. Using seeds 100 and 45 we have quite 
# different starting conditions, yet converge quickly to the same solution. The final cluster 
# labels (colours) are, of course, arbitrary

K-medoids clustering

library(cluster)
Nclust <- 3
km <- pam(swiss.x, k=Nclust)
# plot data using these two columns. This is just a projection so the cluster may not be at
# all obvious, as is the case with j <- 2 ; j <- 5
i <- 1 ; j <- 2
#par(mfrow=c(1,1))
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with k-medoids'), cex.main=0.75)
points(km$medoids[,i], km$medoids[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$clustering==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$clustering==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$clustering==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')


Hierarchical clustering: agglomerative and divisive

# dist measures the N(N-1)/2 Euclidean distances between all pairs in the data set
# hclust performs a hierarchical clustering
h <- hclust(dist(swiss.x), method = "single")
# plclust also plots the tree. The "height" is the value of the dissimilarity metric at which 
# the agglomeration took place. Thus "height" is a non-decreasing set of values
plot(h)
# cuttree cuts the three at the level where we get the specified number of groups and
# returns cluster number which each vector (province) lies in
cutree(h, 3)

# Compare two agglomeration methods
h_single <- hclust(dist(swiss.x), method = "single")
h_complete <- hclust(dist(swiss.x), method = "complete")
par(mfrow=c(1,2), cex=0.75)
plot(h_single);  plot(h_complete)

# Euclidean is the 2-norm. Manhattan is the 1-norm. Compare:
h_2norm <- hclust(dist(swiss.x, method='euclidean'), method = "single")
h_1norm <- hclust(dist(swiss.x, method='manhattan'), method = "single")
par(mfrow=c(1,2), cex=0.75)
plot(h_2norm);  plot(h_1norm)

# Divisive clustering. I specify a dissimilarity vector rather than give the data
d <- diana(dist(swiss.x))
par(mfrow=c(1,2), cex=0.75)
plot(d, which.plots=2)  # height is the diameter of the clusters prior to splitting
plot(h_1norm) # for comparison


Hierarchical clustering: hybrid approach

library(hybridHclust)
mc <- mutualCluster(swiss.x)
# hhc is in hclust format, so we can use tools in package {cluster}
hhc <- hybridHclust(swiss.x, themc=mc, trace=TRUE)
plot(hhc)


Self-organizing maps

# must pre-define grid size and topology
sgrid <- somgrid(xdim=5, ydim=3, topo='hexagonal')
som.swiss <- batchSOM(swiss.x, grid=sgrid, 1)
plot(som.swiss) 
# this uses the 'stars' representation: each vector from the centre represents one of the 
# original data dimensions. It's length is proportional to the (mean ?) of the vectors
# assigned to that node/prototype


MDS

library(MASS)
swiss.mds <- isoMDS(dist(swiss.x))
plot(swiss.mds$points, type = "n")
text(swiss.mds$points, labels = as.character(1:nrow(swiss.x)))