# GPPois.R: Gaussian Process-based inference for Poisson-noised data.
# Copyright (c) 2011-2012 Charles R. Hogg III
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Author: Charles R. Hogg III (2011) <charles.r.hogg@gmail.com>

# FILE DESCRIPTION:
# Testing out new ideas for GPPois.R.  When I get satisfied with something from
# this file, I'll migrate it over to GPPois.R.

source("GPPois.R")

#-------------------------------------------------------------------------------
# Misc functions

InvertPairedSymbols <- function(x) {
  # Replace any "paired" symbol (e.g. brackets) by its left-right inverse.
  symbols.L <- '<({['
  symbols.R <- '>)}]'
  symbols.to   <- paste(sep='', symbols.L, symbols.R)
  symbols.from <- paste(sep='', symbols.R, symbols.L)
  # Now for the hard part! :)
}

ExpandedRange <- function(x, frac, use.log=FALSE) {
  # Give the range of 'x', plus a little 'wiggle room'.
  if (use.log) {
    x <- log(x)
  }
  r <- range(x)
  padding <- diff(r) * frac
  expanded.range <- c(r[1] - padding, r[2] + padding)
  if (use.log) {
    expanded.range <- exp(expanded.range)
  }
  return (expanded.range)
}

#-------------------------------------------------------------------------------
# Hypocrisy functions
#
# Calculate the degree to which an ell(X) curve contradicts itself, using any
# of various approaches.

HypocrisyOld <- function(X, ell, maxWidth=1.0, tau.sq=1.0, ...) {
  # Calculate the hypocrisy of ell(X), using the method from the old code.
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   maxWidth:  The number of "ells" to interpolate in each interval.
  #
  # Returns:
  #   The hypocrisy (a logarithmic quantity) of a spline-interpolated version
  #   of ell(X).
  N <- length(X)
  dX <- diff(X)
  ell.avg <- (ell[-1] + ell[-N]) / 2.0
  num.widths <- dX / ell.avg
  num.pts.interp <- ceiling(num.widths / maxWidth)
  X.out <- X[1]
  for (i in 1:(N - 1)) {
    X.out <- c(X.out, X[i] + (1:num.pts.interp[i]) * dX[i] / num.pts.interp[i])
  }
  ell.out <- exp(spline(method="natural", x=X, xout=X.out, y=log(ell))$y)
  N.out <- length(ell.out)
  # Calculate (D)imensionless (D)istances: DD[i, j] is the distance from X[i]
  # to X[j], in units of ell[j].
  M.dX <- outer(X.out, X.out, '-')
  M.ell <- matrix(rep(ell.out, each=N.out), nrow=N.out, ncol=N.out)
  DD <- abs(M.dX) / M.ell
  DD2 <- DD ^ 2
  expDD2 <- exp(DD2)
  # Calculate hypocrisy
  prefactor <- 1.0 / (expDD2 + DD - 1.0)
  diag(prefactor) <- 0
  M.ell2 <- M.ell ^ 2
  X.width <- Widths(X.out)
  Diff <- outer(ell.out, ell.out, '-')
  h <- prefactor * Diff / M.ell2
  # Perform the double integral (this is where X.width comes into play)
  h.tot <- sum(outer(X.width, X.width, '*') * Diff * h) / (2.0 * tau.sq)
  return (-h.tot)
}

GradHypocrisyOld <- function(X, ell, tau.sq, ...) {
  # Calculates the empirical gradient of the hypocrisy w.r.t. the ell-values.
  env <- new.env()
  assign("X", X, envir=env)
  assign("ell", ell, envir=env)
  assign("tau.sq", tau.sq, envir=env)
  return (attr(
      numericDeriv(
        quote(HypocrisyOld(X=X, ell=ell, tau.sq=tau.sq)),
        "ell", env
        ),
      "gradient"
      ))
}

InterpolateEll <- function(X, ell, X.out) {
  # Interpolate the continuous function ell(X) to the points X.out.
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   X.out:  The X-values where we desire to know ell(X)
  #
  # Returns:
  #   The values assumed by ell(X) at the points X.out
  ell.out <- exp(spline(method="natural", x=X, xout=X.out, y=log(ell))$y)
  return (ell.out)
}

DimensionlessLength <- function(X, ell, n.pts=length(X)) {
  # The length of ell(X), in units of itself.
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   n.pts:  The number of points to use to approximate the integral.
  #
  # Returns:
  #   The dimensionless length of ell(X)
  X.out <- seq(from=min(X), to=max(X), length.out=n.pts)
  ell.inv <- 1.0 / InterpolateEll(X=X, ell=ell, X.out=X.out)
  avg.ell.inv <- 0.5 * (ell.inv[-1] + ell.inv[-length(ell.inv)])
  dX <- diff(X.out)
  return (sum(dX * avg.ell.inv))  # Tai's method ;)
}

HypocrisyDefault <- function(X, ell, n.pts=length(X), tau.sq=1.0, normalize=FALSE) {
  # Calculate the hypocrisy (default definition) of ell(X).
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   n.pts:  The number of points to use to approximate the integral.
  #   tau.sq:  The 'tolerance for hypocrisy' parameter.
  #   normalize:  Whether to divide the final result by the total length.
  #
  # Returns:
  #   The hypocrisy (a logarithmic quantity) of a spline-interpolated version
  #   of ell(X).
  X.out <- seq(from=min(X), to=max(X), length.out=n.pts)
  ell.out <- InterpolateEll(X=X, ell=ell, X.out=X.out)
  # Calculate (D)imensionless (D)istances: DD[i, j] is the distance from X[i]
  # to X[j], in units of ell[j].
  M.dX <- outer(X.out, X.out, '-')
  M.ell <- matrix(rep(ell.out, each=n.pts), nrow=n.pts, ncol=n.pts)
  DD <- abs(M.dX) / M.ell
  expDD2 <- exp(DD ^ 2)
  # Calculate hypocrisy
  prefactor <- 1.0 / (expDD2 + DD - 1.0)
  diag(prefactor) <- 0  # Avoid a singularity here
  M.ell2 <- M.ell ^ 2
  X.width <- Widths(X.out)
  Diff <- outer(ell.out, ell.out, '-')
  h <- prefactor * Diff / M.ell2
  # Perform the double integral (this is where X.width comes into play)
  h.tot <- sum(outer(X.width, X.width, '*') * Diff * h) / (2.0 * tau.sq)
  if (normalize) {
    h.tot <- h.tot / DimensionlessLength(X=X, ell=ell, n.pts=n.pts)
  }
  return (h.tot)
}

HypocrisyDefaultLin <- function(X, ell, n.pts=length(X), tau.sq=1.0,
  normalize=FALSE) {
  # Linearized hypocrisy (based on default definition) of ell(X).
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   n.pts:  The number of points to use to approximate the integral.
  #   tau.sq:  The 'tolerance for hypocrisy' parameter.
  #   normalize:  Whether to divide the final result by the total length.
  #
  # Returns:
  #   The total linearized hypocrisy for ell(X).
  X.out <- seq(from=min(X), to=max(X), length.out=n.pts)
  ell.out <- InterpolateEll(X=X, ell=ell, X.out=X.out)
  ell.inv <- 1.0 / ell.out
  avg.ell.inv <- 0.5 * (ell.inv[-1] + ell.inv[-n.pts])
  slope.sq.dX <- (diff(ell.out) ^ 2) / diff(X.out)
  h.tot <- (0.5 * exp(2) / tau.sq) * sum(avg.ell.inv * slope.sq.dX)
  if (normalize) {
    h.tot <- h.tot / DimensionlessLength(X=X, ell=ell, n.pts=n.pts)
  }
  return (h.tot)
}

HypocrisyFunction <- function(family, linearize) {
  # Find the appropriate hypocrisy function from the given family.
  #
  # Args:
  #   family:  Which 'family' of hypocrisy values to use.
  #   linearize:  Whether to choose the additive version.
  #
  # Returns:
  #   The requested hypocrisy function.
  lin.string <- ifelse(linearize, 'Lin', '')
  hyp.func.name <- sprintf('Hypocrisy%s%s', family, lin.string)
  return (get(hyp.func.name))
}

Hypocrisy <- function(X, ell, n.pts=length(X), tau.sq=1.0, family='Default',
  linearize=FALSE, normalize=FALSE) {
  # Generalized interface for calculating the hypocrisy of a function.
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   n.pts:  The number of points to use to approximate the integral.
  #   tau.sq:  The 'tolerance for hypocrisy' parameter.
  #   family:  Which 'family' of hypocrisy values to use.
  #   linearize:  Whether to choose the additive version.
  #   normalize:  Whether to divide the final result by the total length.
  #
  # Returns:
  #   The total hypocrisy of ell(X) on the interval covered by X.
  hyp.func <- HypocrisyFunction(family=family, linearize=linearize)
  H <- hyp.func(X=X, ell=ell, n.pts=n.pts, tau.sq=tau.sq, normalize=normalize)
  return (H)
}

GradHypocrisy <- function(X, ell, n.pts=length(X), tau.sq=1.0,
  family='Default', linearize=FALSE, normalize=FALSE) {
  # Generalized interface for calculating the gradient of the hypocrisy of a
  # function.
  #
  # Args:
  #   X:  Numeric vector of X-values where ell is known.
  #   ell:  Numeric vector of ell-values.
  #   n.pts:  The number of points to use to approximate the integral.
  #   tau.sq:  The 'tolerance for hypocrisy' parameter.
  #   family:  Which 'family' of hypocrisy values to use.
  #   linearize:  Whether to choose the additive version.
  #   normalize:  Whether to divide the final result by the total length.
  #
  # Returns:
  #   The total hypocrisy of ell(X) on the interval covered by X.
  # Calculates the empirical gradient of the hypocrisy w.r.t. the ell-values.
  env <- new.env()
  assign("X", X, envir=env)
  assign("ell", ell, envir=env)
  assign("n.pts", n.pts, envir=env)
  assign("tau.sq", tau.sq, envir=env)
  assign("family", family, envir=env)
  assign("linearize", linearize, envir=env)
  assign("normalize", normalize, envir=env)
  return (attr(
      numericDeriv(
        quote(Hypocrisy(X=X, ell=ell, n.pts=n.pts, tau.sq=tau.sq,
            family=family, linearize=linearize, normalize=normalize)),
        "ell", env
        ),
      "gradient"
      ))
}

#-------------------------------------------------------------------------------
# Focus Region functions
#
# These functions facilitate breaking a Dataset into "Focus Regions", so that
# each region has a smaller number of datapoints (and is therefore much
# faster).

SampleDecayingDensity <- function(N, i, a=1.1, direction=1, ...) {
    # Generate indices which decay exponentially (with lengthscale
    # 'decay.length') away from 'i', in the direction 'direction'.
    #
    # Args:
    #   N:  The total number of indices (assumed to be 1:N).
    #   i:  The first index in the sequence.
    #   decay.length:  The characteristic lengthscale over which the sampling
    #      density decays.
    #   direction:  Either +1 or -1, telling whether we should increase or
    #      decrease.
    #
    # Returns:
    #   A sorted set of indices sampled with the given density.
    included <- array(0, dim=N)
    # TODO(chogg): Should just be 1 * direction, but this is to match up w/old
    # code.  Once I've verified reproducibility, I can remove the "2 * " and
    # the if-block below...
    i.first <- i + 2 * direction  
    if (i + direction <= N && i + direction >= 1) {
      included[i + direction] <- 1
    }
    jump <- 0
    d.jump <- 1
    index <- i.first
    while (index <= N && index >= 1) {
      included[index] <- 1
      jump <- jump + d.jump
      d.jump <- d.jump * a
      index <- round(i.first + direction * jump)
    }
    return (which(included == 1))
}


FocusRegionDecomposition <- function(d, n.pts.wide, decay.length) {
  # Decomposes a Dataset into smaller Datasets using "focus regions": these
  # include all points within regions of a given width ('n.pts.wide'), PLUS an
  # exponentially decaying density of points (based on lengthscale
  # 'decay.length') outside.
  #
  # Args:
  #   d:  The Dataset to decompose.
  #   n.pts.wide:  The width (in number of points) of the focus region.
  #   decay.length:  The characteristic lengthscale for the decay of the point
  #      density outside the region proper.
  #
  # Returns:
  #   A list() of list()s: each entry in the master list has an element $X
  #   giving a "location" of the Dataset, and an element $d giving the Dataset
  #   object.
  #
  # NOTES:
  #   I expect this concept will work better when the points in d$X are
  #      more-or-less evenly spaced, or at least when the spacing between
  #      neighbors varies slowly.
  #   The logic is copied more-or-less directly from the previous code which I
  #      (C. Hogg) used in mid 2011 to obtain results for the paper we're
  #      submitting to JAC.  At the moment I'm merely trying to reproduce those
  #      results with better software, though later I hope to revisit and
  #      improve the concept.
  N <- length(d$X)
  num.frs <- round(N / n.pts.wide)
  fr.spacing <- n.pts.wide + ((N %% n.pts.wide) / num.frs)
  first.indices <- round(seq(from=1, to=(N - n.pts.wide + 1), by=fr.spacing))
  focus.regions <- list()
  offsets <- (1:n.pts.wide) - 1
  for (f in first.indices) {
    i.central <- f + offsets
    X.centre <- mean(d$X[i.central])
    a <- exp(2.0 / decay.length)
    i.L <- SampleDecayingDensity(N=N, a=a, direction=-1, i=min(i.central))
    i.R <- SampleDecayingDensity(N=N, a=a, direction= 1, i=max(i.central))
    i.all <- c(i.L, i.central, i.R)
    data.subset <- clone(d)
    data.subset$DeleteRows(-i.all)
    focus.regions <- c(focus.regions, list(list(X=X.centre, d=data.subset)))
  }
  return (focus.regions)
}

FocusRegionXVals <- function(focus.regions) {
  # Extract the X-values from a list of focus regions.
  #
  # Args:
  #   focus.regions:  A list() of focus regions (such as might be returned by
  #      FocusRegionDecomposition()).
  #
  # Returns:
  #   A numeric vector of X-values for the focus regions.
  n.regions <- length(focus.regions)
  X <- c()
  for (i in 1:n.regions) {
    X <- c(X, focus.regions[[i]]$X)
  }
  return (X)
}

FocusRegionParams <- function(fr.models, label='Params') {
  # Creates a named numeric vector of adjustable parameters, given the list of
  # focus region models.
  #
  # Args:
  #   fr.models:  A list() of Model objects, one for each focus region.
  #   label:  One of 'Params', 'Upper', or 'Lower', specifying whether to
  #      retrieve the parameter values or one of the boundaries.
  #
  # Returns:
  #   A named numeric vector of adjustable parameters.
  values <- c()
  value.getting.func <- get(paste(sep='', 'get', label))
  for (model in fr.models) {
    model.values <- value.getting.func(this=model)
    values[names(model.values)] <- model.values[names(model.values)]
  }
  return (values)
}

HomogenizeFocusRegionParams <- function(fr.models) {
  # Set all sub-Models' parameters to consistent values (specifically for
  # parameters which are shared across sub-Models).
  params <- FocusRegionParams(fr.models)
  for (model in fr.models) {
    model$params <- params
  }
  return (invisible(fr.models))
}

VariableParameterName <- function(index, name, param.names) {
  # Find the one-and-only-one (or else!) parameter name, which ends in
  # "index.name" (e.g., "vary.1.ell" would match index=1, name='ell').
  #
  # Args:
  #   index:  numeric; the index for this parameter (i.e., which focus region
  #      does it belong to).
  #   name:  character; the basename for this parameter.
  #   param.names:  A list of the full parameter names for the model being
  #      considered.
  #
  # Returns:
  #   The name from param.names which matches the pattern.
  varying.ell.name <- grep(pattern=sprintf("%d.%s$", index, name),
    x=param.names)
  if (length(varying.ell.name) > 1) {
    stop(sprintf(index, name, fmt="Too many matches found for '%d.%s'"))
  } else if (length(varying.ell.name) < 1) {
    stop(sprintf(index, name, fmt="No matches found for '%d.%s'"))
  }
  return (varying.ell.name)
}

VariableParameterValues <- function(name, params) {
  # Find the parameters in 'params' which match 'name', and sort numerically
  #
  # Args:
  #   name:  character; the basename for this parameter.
  #   params:  A named numeric vector of parameters.
  #
  # Returns:
  #   The parameters from 'params' whose names match 'name'.
  x <- names(params)
  pattern <- paste(sep='', '^.*\\D(\\d+)\\.', name, '.*$')
  idx <- grep(perl=TRUE, pattern=pattern, x=x)
  key <- as.numeric(
    gsub(perl=TRUE, pattern=pattern, replacement='\\1', x[idx]))
  ordered.idx <- idx[order(key)]
  return (params[ordered.idx])
}

LogMLFocusRegions <- function(par=FocusRegionParams(fr.models), fr.models,
  focus.regions, tau.sq, hyp.norm=FALSE, hyp.lin=FALSE) {
  # Calculate the log of the (M)arginal (L)ikelihood for a dataset which has
  # been decomposed into focus regions.
  #
  # Args:
  #   par:  A named numeric vector holding the ell-values for each focus
  #      region.
  #   fr.models:  A list() of Model objects, one for each focus region.
  #   focus.regions:  A list() of list()s, where each element of the master
  #      list has a numeric $X giving its rough location, and a Dataset $d
  #      giving the data in that focus region.
  #   tau.sq:  The square of the "tolerance for hypocrisy" (0 means straight
  #      line, Inf means 'anything goes')
  #   update.params:  If TRUE (default), the parameters of the models will be
  #      updated to the values in par.
  #   hyp.norm:  logical; TRUE if we should normalize our prior by the total
  #      dimensionless length of ell(X).
  #   hyp.lin:  logical; TRUE if we should use 'linearized' hypocrisy (which
  #      neglects interactions).
  #
  # Returns:
  #   The log of the (M)arginal (L)ikelihood for this decomposed Dataset.
  log.ML <- 0
  num.regions <- length(fr.models)
  # Calculate the "likelihood" component (summing over focus-region ribbons);
  # also, build up the vectors for the hypocrisy calculation.
  for (i in 1:num.regions) {
    model <- fr.models[[i]]
    log.ML <- log.ML + LogML(par=par, model=model,
      d=focus.regions[[i]]$d, update.params=TRUE)
  }
  # Calculate the "priors" based on hypocrisy
  fr.params <- FocusRegionParams(fr.models)
  ell <- VariableParameterValues(name='ell', params=fr.params)
  X <- FocusRegionXVals(focus.regions=focus.regions)
  hypocrisy.prior <- Hypocrisy(X=X, ell=ell, n.pts=num.regions * 10,
    tau.sq=tau.sq, linearize=hyp.lin, normalize=hyp.norm)
  return (log.ML - hypocrisy.prior)
}

NumericGradLogMLFocusRegions <- function(par=FocusRegionParams(fr.models), fr.models,
  focus.regions, tau.sq, hyp.norm, hyp.lin) {
  env <- new.env()
  assign("par", par, envir=env)
  assign("fr.models", fr.models, envir=env)
  assign("focus.regions", focus.regions, envir=env)
  assign("tau.sq", tau.sq, envir=env)
  assign("hyp.norm", hyp.norm, envir=env)
  assign("hyp.lin", hyp.lin, envir=env)
  return (attr(
      numericDeriv(
        quote(LogMLFocusRegions(par=par, fr.models=fr.models,
            focus.regions=focus.regions, tau.sq=tau.sq,
            hyp.norm=hyp.norm, hyp.lin=hyp.lin)),
        "par", env
        ),
      "gradient"
      ))
}

GradLogMLFocusRegions <- function(par=FocusRegionParams(fr.models), fr.models,
  focus.regions, tau.sq, update.params=TRUE, hyp.norm=FALSE, hyp.lin=FALSE) {
  # Calculate the gradient of the log of the (M)arginal (L)ikelihood, with
  # respect to each focus region's ell-value, for a dataset which has been
  # decomposed into focus regions.
  #
  # Args:
  #   par:  A named numeric vector holding the ell-values for each focus
  #      region.
  #   fr.models:  A list() of Model objects, one for each focus region.
  #   focus.regions:  A list() of list()s, where each element of the master
  #      list has a numeric $X giving its rough location, and a Dataset $d
  #      giving the data in that focus region.
  #   tau.sq:  The square of the "tolerance for hypocrisy" (0 means straight
  #      line, Inf means 'anything goes')
  #   update.params:  If TRUE (default), the parameters of the models will be
  #      updated to the values in par.
  #   hyp.norm:  logical; TRUE if we should normalize our prior by the total
  #      dimensionless length of ell(X).
  #   hyp.lin:  logical; TRUE if we should use 'linearized' hypocrisy (which
  #      neglects interactions).
  #
  # Returns:
  #   The gradient of the log of the (M)arginal (L)ikelihood for this
  #   decomposed Dataset.
  X <- FocusRegionXVals(focus.regions=focus.regions)
  num.regions <- length(fr.models)
  fr.params <- FocusRegionParams(fr.models)
  ell <- VariableParameterValues(name='ell', params=fr.params)
  grad <- GradHypocrisy(X=X, ell=ell, n.pts=num.regions * 10,
    tau.sq=tau.sq, linearize=hyp.lin, normalize=hyp.norm)
  # Cycle through the models, and get GradLogML for each focus region.
  for (i in 1:num.regions) {
    model <- fr.models[[i]]
    grad.i <- GradLogML(par=par, model=model, d=focus.regions[[i]]$d,
      update.params=update.params)
    # Already-existing entries get added to, but any new parameters will have
    # their values tacked on.
    existing <- which(names(grad.i) %in% names(grad))
    names.existent <- names(grad.i)[existing]
    names.nonexistent <- names(grad.i)[-existing]
    grad[names.nonexistent] <-                        grad.i[names.nonexistent]
    grad[names.existent]    <- grad[names.existent] + grad.i[names.existent]
  }
  return (grad)
}

FocusRegionStartingPoint <- function(d, n.pts.wide, decay.length,
  model=Model(id='VaryingEll'), varying.ell.bounds=NA,
  varying.sigma.f.bounds=NA) {
  # Setup a list of sub-models which provide good starting points for a Focus
  # Region decomposition based on varying-ell SE covariance.
  #
  # Args:
  #   d:  The Dataset to decompose.
  #   n.pts.wide:  The width (in number of points) of the focus region.
  #   decay.length:  The characteristic lengthscale for the decay of the point
  #      density outside the region proper.
  #   tau.sq:  The "tolerance for hypocrisy": high values do not constrain the
  #      curve, while low values make it flatter.
  #   model:  The base model that will be cloned for every focus region.
  #   varying.ell.bounds:  Range of acceptable values of ell for finer features
  #   varying.sigma.f.bounds:  Range of acceptable values of sigma.f for finer
  #      features
  # Returns:
  #   A list() of list()s: 
  #     $models: a list() of Model objects, whose hyperparameters are a good
  #        fit to the data (neglecting hypocrisy).
  #     $focus.regions: a list() structured like the return values from
  #        FocusRegionDecomposition().
  fr.datasets <- FocusRegionDecomposition(d, n.pts.wide, decay.length)
  n.regions <- length(fr.datasets)
  fr.models <- list()
  cat(sprintf(n.regions, d$id,
      fmt="Finding starting values for %d focus regions in Dataset '%s'...\n"))
  pb <- txtProgressBar(min = 0, max = n.regions, style = 3)
  setTxtProgressBar(pb, 0)
  for (i in 1:n.regions) {
    model.i <- clone(model)
    model.i$id <- paste(sep='', 'fr.', i)
    model.i$AddCovariance(CovarianceSE(id=paste(sep='', "vary.", i),
        ell.bounds=varying.ell.bounds, sigma.f.bounds=varying.sigma.f.bounds))
    model.i$Train(d=fr.datasets[[i]]$d)
    fr.models[[i]] <- model.i
    setTxtProgressBar(pb, i)
  }
  close(pb)
  ell <- VariableParameterValues(name='ell',
    params=FocusRegionParams(fr.models))
  sigma.f <- VariableParameterValues(name='sigma.f',
    params=FocusRegionParams(fr.models))
  sigma.f.rms <- sqrt(mean(sigma.f^2))
  # Now, restrict our ell-search to the range of the optimal values found
  # (after all, going outside this range could really only make our solution
  # worse!).
  for (i in 1:n.regions) {
    ell.bounds <- ExpandedRange(ell, 1e-2, use.log=TRUE)
    names(ell.bounds) <- rep(paste(sep='', 'vary.', i, '.ell'), 2)
    names(sigma.f.rms) <- paste(sep='', 'vary.', i, '.sigma.f')
    fr.models[[i]]$params <- sigma.f.rms
    fr.models[[i]]$lower <- c(ell.bounds[1], sigma.f.rms)
    fr.models[[i]]$upper <- c(ell.bounds[2], sigma.f.rms)
  }
  return (list(models=fr.models, focus.regions=fr.datasets))
}

TrainSEVaryingEllFocusRegions <- function(d, n.pts.wide, decay.length, tau.sq,
  hyp.norm=FALSE, hyp.lin=FALSE, model=Model(id='VaryingEll'),
  varying.ell.bounds=NA, varying.sigma.f.bounds=NA) {
  # Find non-stationary hyperparameters for the given dataset, by decomposing
  # into focus regions of width 'n.pts.wide' and decay length 'decay.length'.
  # 
  # Args:
  #   d:  The Dataset to decompose.
  #   n.pts.wide:  The width (in number of points) of the focus region.
  #   decay.length:  The characteristic lengthscale for the decay of the point
  #      density outside the region proper.
  #   tau.sq:  The "tolerance for hypocrisy": high values do not constrain the
  #      curve, while low values make it flatter.
  #   hyp.norm:  logical; TRUE if we should normalize our prior by the total
  #      dimensionless length of ell(X).
  #   hyp.lin:  logical; TRUE if we should use 'linearized' hypocrisy (which
  #      neglects interactions).
  #   model:  The base model that will be cloned for every focus region.
  #   varying.ell.bounds:  Range of acceptable values of ell for finer features
  #   varying.sigma.f.bounds:  Range of acceptable values of sigma.f for finer
  #      features
  #
  # Returns:
  #   A list() with elements X (the X-values of the focus regions) and opt (the
  #   output of optim(), which contains both log.ML=opt$value, and the optimal
  #   parameters opt$par).
  start.params <- FocusRegionStartingPoint(d=d, n.pts.wide=n.pts.wide,
    decay.length=decay.length, model=model,
    varying.ell.bounds=varying.ell.bounds,
    varying.sigma.f.bounds=varying.sigma.f.bounds)
  fr.models <- start.params$models
  fr.datasets <- start.params$focus.regions
  # Jointly optimize all the ell-values.
  opt <- optim(par=FocusRegionParams(fr.models=fr.models),
    fn=LogMLFocusRegions, gr=NumericGradLogMLFocusRegions, method="L-BFGS-B",
    lower=FocusRegionParams(fr.models=fr.models, label='Lower'),
    upper=FocusRegionParams(fr.models=fr.models, label='Upper'),
    control=list(fnscale=-1, trace=0, maxit=400),
    # Extra parameters needed for fn and gr:
    fr.models=fr.models, focus.regions=fr.datasets, tau.sq=tau.sq,
    hyp.norm=hyp.norm, hyp.lin=hyp.lin)
  X <- FocusRegionXVals(start.params$focus.regions)
  for (model in fr.models) {
    model$Forget()
  }
  rm(start.params)
  gc()  # Free up memory, garbage collection
  return (list(X=X, opt=opt))
}

TestLogMLFocusRegions <- function(d, n.pts.wide, decay.length, tau.sq,
  ell=NA, sigma.f=NA, ell.bounds=NA, sigma.f.bounds=NA) {
  # Check that GradLogMLFocusRegions really does give the gradient of
  # LogMLFocusRegions.
  #
  # Args:
  #   d:  The Dataset to decompose.
  #   n.pts.wide:  The width (in number of points) of the focus region.
  #   decay.length:  The characteristic lengthscale for the decay of the point
  #      density outside the region proper.
  #   ell: A characteristic ('horizontal') lengthscale over which function
  #      values are correlated.
  #   sigma.f: The "vertical" lengthscale.
  #   ell.bounds: Range of acceptable values of ell
  #   sigma.f.bounds: Range of acceptable values of sigma.f
  #
  # Returns:
  #   Used for its side-effect.
  # 
  # NOTES:
  #   Code is basically copied from TrainSEVaryingEllFocusRegions, up to the
  #   call to ''optim''; after that point, it just checks the gradient.

  # Calculate the focus regions and count how many there are
  fr.datasets <- FocusRegionDecomposition(d, n.pts.wide, decay.length)
  n.regions <- length(fr.datasets)
  # Construct a separate model for each focus region
  fr.models <- list()
  model <- Model()
  model$AddCovariance(CovarianceSE(id="SE", ell=ell, sigma.f=sigma.f,
      ell.bounds=ell.bounds, sigma.f.bounds=sigma.f.bounds))
  # Find good starting values for the parameters
  total.sigma.f.sq <- 0
  X <- ell <- ell.min <- ell.max <- c()
  ANSWERS <- c(0.269666, 0.27126, 0.344723, 0.355278, 0.230487, 0.284245,
    0.296604, 0.303749, 0.342234, 0.339296, 0.474818, 0.601531, 0.649259, 0.82,
    0.783772, 1.00576, 0.926831, 0.847478, 0.85239, 1.13945, 0.985747, 1.28076,
    1.05218, 1.71102, 2.41193, 2.42802, 3, 3, 2.6793, 3, 3, 3, 3)
  for (i in 1:n.regions) {
    model.i <- clone(model)
    model.i$id <- paste(sep='', 'fr.', i)
    #model.i$Train(d=fr.datasets[[i]]$d)
    model.i$params <- c(SE.ell=ANSWERS[i], SE.sigma.f=174)  # Delete this line!  Replace w/above!
    fr.models[[i]] <- model.i
    total.sigma.f.sq <- total.sigma.f.sq + model.i$params["SE.sigma.f"] ^ 2
    ell     <- c(ell    , model.i$params["SE.ell"])
    ell.min <- c(ell.min, model.i$lower ["SE.ell"])
    ell.max <- c(ell.max, model.i$upper ["SE.ell"])
    X <- c(X, fr.datasets[[i]]$X)
    model <- model.i  # Next model might as well start from what just worked!
  }
  names(ell) <- names(ell.min) <- names(ell.max) <- paste(sep='',
    'ell.', 1:n.regions)
  # The old approach clamps all models' sigma.f to the average.  Stick with
  # that... for now!
  sigma.f.best <- c(SE.sigma.f=sqrt(total.sigma.f.sq / n.regions))
  for (i in 1:n.regions) {
    fr.models[[i]]$params <- sigma.f.best
    fr.models[[i]]$lower  <- sigma.f.best
    fr.models[[i]]$upper  <- sigma.f.best
  }
  log.ML <- LogMLFocusRegions(fr.models=fr.models, focus.regions=fr.datasets,
    tau.sq=tau.sq)
  grad.log.ML <- GradLogMLFocusRegions(fr.models=fr.models,
    focus.regions=fr.datasets, tau.sq=tau.sq)
  params <- FocusRegionParams(fr.models)
  d.params <- -abs(rnorm(n=length(params), sd=1e-3))
  for (i in 1:10) {
    log.ML.new <- LogMLFocusRegions(par=params + d.params, fr.models=fr.models,
      focus.regions=fr.datasets, tau.sq=tau.sq)
    actual <- log.ML.new - log.ML
    expected <- sum(grad.log.ML * d.params)
    cat(sprintf("Ratio: %8.6f (actual=%.2e)/(expected=%.2e)\n",
        actual/expected, actual, expected))
    d.params <- d.params * 0.1
  }
}

#-------------------------------------------------------------------------------
# Testing different tau.sq values

MapTauSq <- function(tau.range, n.tau.vals, base.filename="Tau-sq_map_default") {
  file.name <- paste(base.filename, ".ROBJ", sep='')
  sigma.approach <- "independent"
  logic <- c('TRUE', 'FALSE')
  log.tau.range <- log(tau.range)
  tau.sq.vals <- exp(seq(from=log.tau.range[1], to=log.tau.range[2],
      length.out=n.tau.vals))
  n.pts.wide <- 200
  decay.length <- 20
  fr.datasets <- FocusRegionDecomposition(d=kates.data, n.pts.wide=n.pts.wide,
    decay.length=decay.length)
  df.full <- df.summary <- data.frame()
  for (tau.sq in tau.sq.vals) {
    for (linearize in logic) {
      for (normalize in logic) {
        best <- TrainSEVaryingEllFocusRegions(d=kates.data,
          n.pts.wide=n.pts.wide, decay.length=decay.length, tau.sq=tau.sq,
          varying.ell.bounds=ell.bounds, varying.sigma.f.bounds=sigma.f.bounds,
          hyp.norm=normalize, hyp.lin=linearize)
        msg <- "DELETE this line and the one below."
        save(best, msg, file="msg.ROBJ")
        # Calculate the full ell(Q) function and denoised curve
        model <- Model()
        Cov <- CovarianceSEVaryingEll(X.ell=FocusRegionXVals(fr.datasets),
            ell    =VariableParameterValues('ell'    , best$opt$par),
            sigma.f=VariableParameterValues('sigma.f', best$opt$par))
        model$AddCovariance(Cov)
        f <- model$PosteriorMean(d=kates.data)
        kates.data$quantity <- 'true'
        MSR <- kates.data$MSR(test.data=f)
        kates.data$quantity <- 'noisy'
        ell <- Cov$ell(X=kates.data$X)
        rm(model)
        # Add summary stats to data.frames
        df.summary <- rbind(df.summary, data.frame(
            linearize=linearize,
            normalize=normalize,
            sigma.approach=sigma.approach,
            tau.sq=tau.sq,
            log.ML=best$opt$value,
            MSR=MSR))
        df.full <- rbind(df.full, data.frame(
            X=kates.data$X,
            ell=ell,
            f=f,
            tau.sq=tau.sq,
            linearize=linearize,
            normalize=normalize,
            sigma.approach=sigma.approach))
        save(df.summary, df.full, file=file.name)
      }
    }
  }
}
