R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007

Data exploration
----------------


*** Density estimation

# Figure 5.8 p. 127 in MASS4
geyser
attach(geyser)
par(mfrow=c(2,3))
# histograms are sensitive to placing of bin centres
truehist(duration, h=0.5, x0=0.0, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.1, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.2, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.3, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.4, xlim=c(0, 6), ymax=0.7)
# Improve by averaging the histograms
breaks <- seq(0, 5.9, 0.1)
counts <- numeric(length(breaks))
for(i in (0:4)) counts[i+(1:55)] <- counts[i+(1:55)] +
    rep(hist(duration, breaks=0.1*i + seq(0, 5.5, 0.5),
    prob=TRUE, plot=FALSE)$intensities, rep(5,11))
plot(breaks+0.05, counts/5, type="l", xlab="duration",
    ylab="averaged", bty="n", xlim=c(0, 6), ylim=c(0, 0.7))
detach()

So use Gaussian kernel density estimation instead

# Fig. 5.9 on p. 128 of MASS4
# nrd, SJ and SJ-dpi are different methods for choosing the fixed bandwidth
# do ?bw.nrd for more details. To get values used:
# bw.nrd(geyser$duration) = 0.389
# bw.SJ(geyser$duration) = 0.090
# bw.SJ(geyser$duration, method='dpi') = 0.144
attach(geyser)
par(mfrow=c(1,2))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "nrd"))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "SJ", n = 256), lty = 3)        # dotted line
lines(density(duration, n = 256, width = "SJ-dpi"), lty = 1)
detach()

Density estimation in 2D

# Fig. 5.11 on p. 131 of MASS4
geyser2 <- data.frame(as.data.frame(geyser)[-1, ],
                      pduration = geyser$duration[-299])
attach(geyser2)
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
plot(pduration, waiting, xlim = c(0.5, 6), ylim = c(40, 110), xlab = "previous duration", ylab = "waiting")
f1 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110))
# on a linear scale
image(f1, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
# to get on a log scale do:
# image(f1$x, f1$y, log(f1$z), zlim = c(-40, 0), xlab = "previous duration", ylab = "waiting")
f2 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110), h = c(width.SJ(duration), width.SJ(waiting)) )
image(f2, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
persp(f2,  phi = 30, theta = 20, d = 5, xlab = "previous duration", ylab = "waiting", zlab = "")
detach()



*** Classification via density modelling (and MLE)

# Generate two clusters of Gaussian data
set.seed(20)
x1 <- rnorm(100, mean=0, sd=0.50)
y1 <- rnorm(100, mean=0, sd=0.50)
x2 <- rnorm(100, mean=1, sd=0.70)
y2 <- rnorm(100, mean=1, sd=0.30)
plot(x1, y1, type='n', xlim=c(-1.5,3.0), ylim=c(-1.5,3.0), xlab='x', ylab='y')
points(x1, y1, pch=19, col=2)
points(x2, y2, pch=23, col=3, bg=3)

# infer properties of PDF on assumption that x and y are independent
( mean1 <- c(mean(x1), mean(y1)) )
( mean2 <- c(mean(x2), mean(y2)) )
( sd1 <- c(sd(x1), sd(y1)) )
( sd2 <- c(sd(x2), sd(y2)) )
# build covariance matrices and work out eigenvalues and eigenvectors, i.e. without 
# assuming x and y are independent
covmat1 <- matrix( c(cov(x1,x1), cov(x1,y1), cov(y1,x1), cov(y1,y1)), nrow=2 )
covmat2 <- matrix( c(cov(x2,x2), cov(x2,y2), cov(y2,x2), cov(y2,y2)), nrow=2 )
eigen1 <- eigen(covmat1, symmetric=TRUE)
eigen2 <- eigen(covmat2, symmetric=TRUE)
# standard deviations in x and y are
sqrt(eigen1$values)
sqrt(eigen2$values)
# note the differences in the the principal directions (eigenvectors) for class 1. This is
# a symmetric Gaussian so the principal directions could happily lie in any direction.
# Alternatively use SVD, e.g.
svd(covmat2)
sqrt(svd(covmat2)$d)

# overplotted fitted Gaussians, using fact that x and y are independent
xgrid=seq(-1,3,0.1)
ygrid=seq(-1,3,0.1)
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x1), sd=sd(x1)) %o% dnorm(ygrid, mean=mean(y1), sd=sd(y1)), col=2, add=TRUE )
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x2), sd=sd(x2)) %o% dnorm(ygrid, mean=mean(y2), sd=sd(y2)), col=3, add=TRUE )  

Decision boundaries are quadratic in general. They are linear if the two class
covariance matrices are equal.
