# C.A.L. Bailer-Jones
# Astrostats 2013
# This file: kfoldCV.R
# R code for calculating the K-fold cross validation likelihood (and for defining partitions)

source("monte_carlo.R") # provides metrop() needed by kfoldcv()

# Return the log 10 K-fold CV likelihood for data set obsdata with K=Npart.
# This requires sampling the posterior, which is done with metrop() using
#   the following parameters: sampleCov, thetaInit, Nburnin, Nsamp.
# logpost and loglike provide the log10 posterior and likelihood respectively.
# ... contains, if needed, (1) the prior PDF hyperparameters (alpha),
#   (2) fixed model parameters.
kfoldcv <- function(Npart, obsdata, logpost, loglike, sampleCov, thetaInit, Nburnin, Nsamp, ...) {
  partInd <- define.partitions(Nind=nrow(obsdata), Npart=Npart)
  partLike <- vector(length=Npart, mode="numeric") # initializes to zero
  for(k in 1:Npart) {
    # draw samples from posterior using all data except this partition
    cat("Sampling posterior and calculating likelihood for partition", k, "of", Npart, "\n")
    ind <- setdiff(1:nrow(obsdata), partInd[[k]])
    postSamp <- metrop(func=logpost, thetaInit=thetaInit, Nburnin=Nburnin, Nsamp=Nsamp,
                       verbose=Nsamp/10, sampleCov=sampleCov, obsdata=obsdata, ind=ind, ...)
    # calculate likelihood of those samples using just the partition
    # (may need to apply an "help factor" if get numerical problems here)
    for(n in 1:nrow(postSamp)) {
      partLike[k] <- partLike[k] + 10^loglike(theta=postSamp[n,3:ncol(postSamp)], obsdata=obsdata, ind=partInd[[k]], ...)
    }
    partLike[k] <- log10(partLike[k]/nrow(postSamp))
  }
  return(sum(partLike))
}

# Assign the elements of the vector 1:Nind randomly without
# replacement into Npart partitions once (only).  Each partition
# (except perhaps the last) has size partSize = Nind %/% Npart (the
# last has size partSize + Nind %% Npart, which is equal or larger).
# E.g. if Nind=22, Npart=4, then three partitions have size 5, and one
# has size 7.  The partitions are returned as a list of length Npart,
# with each element a vector of the indices (each vector sorted,
# smallest first).
define.partitions <- function(Nind, Npart) {
  ind <- sample.int(n=Nind)
  partSize <- Nind %/% Npart
  partInd <- NULL
  for(k in 0:(Npart-1)) {
    if(k==(Npart-1)) {
      Len <- partSize + Nind %% Npart 
    } else {
      Len <- partSize
    }
    partInd <- c( partInd, list(sort(ind[ (k*partSize+1):(k*partSize+Len) ])) )
  }
  return(partInd)
}
