R scripts for the lecture course "Introduction to machine learning and
pattern recognition", MPIA Feb./March 2008
  Coryn Bailer-Jones

Lecture 2


########## 1D density estimation - grass diagram

png("figures/1dexample.png")
x <- c(rnorm(21,0.3,0.1), rnorm(9,0.8,0.1))
plot(x, rep(1,30), type="h", ylim=c(0,5), ylab="frequency", cex.axis=2, cex.lab=2, lwd=2, col="red")
dev.off()


########## Density estimation

# Figure 5.8 p. 127 in MASS4
geyser
attach(geyser)
plot(waiting, duration)
par(mfrow=c(2,3))
# histograms are sensitive to placing of bin centres
truehist(duration, h=0.5, x0=0.0, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.1, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.2, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.3, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.4, xlim=c(0, 6), ymax=0.7)
# Improve by averaging the histograms
breaks <- seq(0, 5.9, 0.1)
counts <- numeric(length(breaks))
for(i in (0:4)) counts[i+(1:55)] <- counts[i+(1:55)] +
    rep(hist(duration, breaks=0.1*i + seq(0, 5.5, 0.5),
    prob=TRUE, plot=FALSE)$intensities, rep(5,11))
plot(breaks+0.05, counts/5, type="l", xlab="duration",
    ylab="averaged", bty="n", xlim=c(0, 6), ylim=c(0, 0.7))
detach()

# So use Gaussian kernel density estimation instead

# Fig. 5.9 on p. 128 of MASS4
# nrd, SJ and SJ-dpi are different methods for choosing the fixed bandwidth
# do ?bw.nrd for more details. To get values used:
# bw.nrd(geyser$duration) = 0.389
# bw.SJ(geyser$duration) = 0.090
# bw.SJ(geyser$duration, method='dpi') = 0.144
attach(geyser)
par(mfrow=c(1,2))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "nrd"))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "SJ", n = 256), lty = 3)        # dotted line
lines(density(duration, n = 256, width = "SJ-dpi"), lty = 1)
detach()


########## Density estimation in 2D

# Fig. 5.11 on p. 131 of MASS4
geyser2 <- data.frame(as.data.frame(geyser)[-1, ], pduration = geyser$duration[-299])
attach(geyser2)
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
plot(pduration, waiting, xlim = c(0.5, 6), ylim = c(40, 110), xlab = "previous duration", ylab = "waiting")
f1 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110))
# on a linear scale
#image(f1, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
# to get on a log scale do:
 image(f1$x, f1$y, log(f1$z), zlim = c(-40, 0), xlab = "previous duration", ylab = "waiting")
f2 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110), h = c(width.SJ(duration), width.SJ(waiting)) )
image(f2, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
persp(f2,  phi = 30, theta = 20, d = 5, xlab = "previous duration", ylab = "waiting", zlab = "")
detach()


########## Classification via density modelling and using MLE

# Generate two clusters of Gaussian data
set.seed(20)
x1 <- rnorm(100, mean=0, sd=0.50)
y1 <- rnorm(100, mean=0, sd=0.50)
x2 <- rnorm(100, mean=1, sd=0.70)
y2 <- rnorm(100, mean=1, sd=0.30)
par(mfrow=c(1,1))
plot(x1, y1, type='n', xlim=c(-1.5,3.0), ylim=c(-1.5,3.0), xlab='x', ylab='y')
points(x1, y1, pch=19, col=2)
points(x2, y2, pch=23, col=3, bg=3)

# infer properties of PDF on assumption that x and y are independent
( mean1 <- c(mean(x1), mean(y1)) )
( mean2 <- c(mean(x2), mean(y2)) )
( sd1 <- c(sd(x1), sd(y1)) )
( sd2 <- c(sd(x2), sd(y2)) )
# build covariance matrices and work out eigenvalues and eigenvectors, i.e. without 
# assuming x and y are independent
covmat1 <- matrix( c(cov(x1,x1), cov(x1,y1), cov(y1,x1), cov(y1,y1)), nrow=2 )
covmat2 <- matrix( c(cov(x2,x2), cov(x2,y2), cov(y2,x2), cov(y2,y2)), nrow=2 )
eigen1 <- eigen(covmat1, symmetric=TRUE)
eigen2 <- eigen(covmat2, symmetric=TRUE)
# standard deviations in x and y are
sqrt(eigen1$values)
sqrt(eigen2$values)
# note the differences in the the principal directions (eigenvectors) for class 1. This is
# a symmetric Gaussian so the principal directions could happily lie in any direction.
# Alternatively use SVD, e.g.
svd(covmat2)
sqrt(svd(covmat2)$d)

# overplotted fitted Gaussians, using fact that x and y are independent
xgrid=seq(-2,3,0.01)
ygrid=seq(-2,3,0.01)
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x1), sd=sd(x1)) %o% dnorm(ygrid, mean=mean(y1), sd=sd(y1)), col=2, add=TRUE )
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x2), sd=sd(x2)) %o% dnorm(ygrid, mean=mean(y2), sd=sd(y2)), col=3, add=TRUE )  

Decision boundaries are quadratic in general. They are linear if the two class
covariance matrices are equal.


##########  LDA

Model class distributions as multivariate Gaussians. LDA is the
special case when we assume that the classes have equal covariance
matrices (Hastie et al. p. 86). If we drop this assumption, we have
QDA.  Let there be g classes in a p dimensional data space.  LDA
determines the means of each class, and uses the proximity of a new
data point to these to make a classification (taking into account the
covariance matrix and prior probabilities - if we sphere the data and
have equal priors, we just take the nearest centroid). Further, the g
centroids in the p-dimensional data space lie in an affine subspace of
dimension g-1. In locating the nearest centroid, directions orthogonal
to this subspace do not discriminate between the classes (within the
LDA assumptions), so we may as well project the data onto this
subspace. Thus LDA performs a dimension reduction: p -> g-1, i.e. for
g classes there are g-1 LDs.  The LDs are mutually orthogonal vectors
in the (sphered) data space which maximally discriminate between the
classes. Can also show that they maximise the between-class variance
relative to the within-class variance (MASS4 section 12.1). LDs are
ordered in terms of separation significance.

data(iris3)
Iris <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]), Sp = rep(c("s","c","v"), rep(50,3)))
ir.lda <- lda(Sp ~ ., Iris)

For dataframe Iris, perform an LDA where the groups are given by named column
Sp, and the explanatory variables are everything else (this is the meaning of
the "." on the RHS of the formula; at least, we get the same result if we use
"Sepal.L. + Sepal.W. + Petal.L. + Petal.W." on the RHS). To predict using this
we use

ir.ld <- predict(ir.lda)                    # as "newdata" not specified so predicts
                                                      # using the original data set
eqscplot(ir.ld$x, type="n", xlab="LD1", ylab="LD2")	# plot window
# plot data as coloured characters
text(ir.ld$x, labels=as.character(Iris$Sp), col=1+as.numeric(Iris$Sp), cex=1.0) 

Evaluate contingency table of classification results:

t(table(Iris$Sp, ir.ld$class))  # transpose so that a true class is a colum

In LDA class boundaries are linear hyperplanes: for g classes, there are
g(g-1) boundaries.  The separating hyperplanes (boundaries)between the groups
are NOT simply the perpendicular bisectors of the lines joining the class
centroids.  It is only this in the case that the covariance matrix is
spherical (= Identity matrix times constant) and the priors are equal. (The
former case could be satisfied by plotting with the sphered data.)  Boundaries
are plotted in crabs example in MASS at the bottom on p. 335 using the perp()
function.  See Figs. 4.1 and 4.5 of Hastie et al.. In practice, however, (esp. for QDA)
it might be easier to compute the boundaries exhaustively, i.e. compute the
decision rule on a finite lattice of points and then use a contouring method
to compute the boundaries (as in Hastie et al. - footnote on p. 88).

# Repeat but with separate train and test sets
train <- sample(1:150, 100)
ir.lda <- lda(Sp ~ ., Iris, subset=train)
ir.ld <- predict(ir.lda)
ir.ld.test <- predict(ir.lda, Iris[-train, ])$class
t(table(Iris$Sp[train], ir.ld$class))      # results on test set
t(table(Iris$Sp[-train], ir.ld.test))      # results on test set


##########  Splines

### First look at fitting with polynomials

attach(GAGurine)
#par(mfrow = c(3, 2))
plot(Age, GAG)
GAG.lm <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) + I(Age^7) + I(Age^8))
anova(GAG.lm)
# 7th and 8th powers are not significant
GAG.lm2 <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6))
xx <- seq(0, 17, len = 200)
lines(xx, predict(GAG.lm2, data.frame(Age = xx)))
detach(GAGurine)


### Now look at various types of splines

# Regression spline exactly fits all points
library(fields)    # needed for rat.diet data set
x.grid <- seq( 0, 120, 200)
fit <- splint(rat.diet$t, rat.diet$trt, x.grid)
plot( rat.diet$t, rat.diet$trt)
lines( x,y)

# Now use GAGurine data
attach(GAGurine)

# Exact spline (i.e. regression spline with knot at each point)
age.grid <- seq(min(Age), max(Age), 0.01)
fit <- splint(Age, GAG, age.grid)
plot(Age, GAG)
lines(age.grid, fit)

# Linear fit -  do this to inspect degrees of freedom 
lm1 <- lm(GAG ~ Age)
# Need to put in a named data frame so that predict.lm properly recognises it
age.grid <- data.frame(seq(min(Age), max(Age), 0.1)) 
colnames(age.grid) <- "Age" 
plot(Age, GAG)
lines(age.grid$Age, predict(lm1, age.grid), col='red', lw=2)
# We know that df=2. The following gives 312, i.e. the total number of d.o.f
# available is equal to the number of measurements, which is 314
lm1$df.residual

# For some reason sreg does not work on the GAGurine data, e.g. 
# predict(sreg(Age, GAG)
# gives errors, even though it is the same syntax as the example in the sreg help page

More on splines in next lecture...