From 312347fba94648a056847e583e19fe9cda999970 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Mon, 7 Jan 2019 12:08:25 +1100
Subject: [PATCH 01/38] flesh out marginalisation interface

---
 DESCRIPTION            |   3 +-
 NAMESPACE              |   3 +
 R/marginalise.R        | 151 +++++++++++++++++++++++++++++++++++++++++
 man/marginalisation.Rd |  83 ++++++++++++++++++++++
 4 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 R/marginalise.R
 create mode 100644 man/marginalisation.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 6e05812f..64ebfb03 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -61,6 +61,7 @@ Collate:
    'internals.R'
    'calculate.R'
    'callbacks.R'
+    'marginalise.R'
 Imports:
    R6,
    tensorflow,
@@ -86,4 +87,4 @@ Suggests:
    MASS,
    abind
 VignetteBuilder: knitr
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
index 07c1b749..24e7e5c3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -85,6 +85,7 @@ S3method(plot,greta_model)
S3method(print,greta_array)
S3method(print,greta_model)
S3method(print,initials)
+S3method(print,marginaliser)
S3method(print,optimiser)
S3method(print,sampler)
S3method(print,unknowns)
@@ -141,6 +142,7 @@ export(cov2cor)
export(diag)
export(dirichlet)
export(dirichlet_multinomial)
+export(discrete_marginalisation)
export(distribution)
export(eigen)
export(exponential)
@@ -169,6 +171,7 @@ export(lkj_correlation)
export(log1pe)
export(logistic)
export(lognormal)
+export(marginalise)
export(mcmc)
export(mixture)
export(model)
diff --git a/R/marginalise.R b/R/marginalise.R
new file mode 100644
index 00000000..42f3af61
--- /dev/null
+++ b/R/marginalise.R
@@ -0,0 +1,151 @@
+# marginalising random variables

#' @name marginalisation
#'
#' @title direct marginalisation of random variables
#' @description Inference on many statistical models requires marginalisation of
#' (i.e. integration of functions over) random variables. In addition to
#' general purpose marginalisation methods like MCMC that act on an entire
#' model, it is often useful to directly marginalise a single random variable
#' within a larger model; e.g. for random variables where marginalisation by
#' MCMC is inefficient or impossible, or when an approximate marginalisation
#' is substantially more efficient. \code{marginalise()} performs direct
#' marginalisation within a greta model, using one of the available marginalisation methods.
#'
#' @return \code{discrete_marginalisation} - a \code{marginaliser} object that
#' can be passed to \code{marginalise}.
#'
#' @examples
#' \dontrun{
#' # marginalise a discrete random variable and carry out HMC on the
#' # continuous variables:
#' y <- rnorm(100)
#'
#' lambda <- lognormal(0, 1)
#' theta <- normal(0, 1)
#' sd <- lognormal(0, 1)
#'
#' # if discrete variables could be sampled by MCMC in greta,
#' # we would be able to simply do:
#' # n <- poisson(lambda)
#' # mu <- theta ^ n
#' # distribution(y) <- normal(mu, sd)
#'
#' # the function to marginalise over
#' # (only the variable to marginalise is passed in)
#' fun <- function (n) {
#'   mu <- theta ^ n
#'   distribution(y) <- normal(mu, sd)
#' }
#'
#' # integrate the function values w.r.t. the poisson distribution
#' marginalise(fun,
#'             poisson(lambda),
#'             method = discrete_marginalisation(values = 0:10))
#'
#' m <- model(lambda)
#' draws <- mcmc(m, hmc())
#' }
NULL
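As a plain-R companion to the example above, this is the quantity the call is asking for: the marginal likelihood of y, summing the conditional likelihood over a truncated poisson support, weighted by the pmf. This is an editor's sketch with made-up fixed parameter values, not code from the patch:

y <- rnorm(100)
lambda <- 1.3; theta <- 0.8; sd <- 1.1  # hypothetical fixed values
values <- 0:10

# pmf weights, renormalised over the truncated support
log_w <- dpois(values, lambda, log = TRUE)
log_w <- log_w - log(sum(exp(log_w)))

# conditional log likelihood of y at each candidate value of n
log_lik <- vapply(values,
                  function(n) sum(dnorm(y, theta ^ n, sd, log = TRUE)),
                  numeric(1))

# log marginal likelihood, combined stably via log-sum-exp
lse <- function(x) max(x) + log(sum(exp(x - max(x))))
lse(log_w + log_lik)

#' @rdname marginalisation
#' @export
#'
#' @param fun an R function to integrate with respect to the random variable.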
+#' Must take a single argument - a greta array giving the value of the random +#' variable +#' @param variable a variable greta array with a distribution, representing the +#' random variable to marginalise +#' @param method a \code{marginaliser} object giving the method for carrying out +#' the marginalisation +marginalise <- function(fun, variable, method) { + + distrib <- get_node(variable)$distribution + + # check the inputs + if (!is.function(fun)) { + stop ("'fun' must be an R function") + } + + if (!inherits(distrib, "distribution_node")) { + stop ("'variable' must be a variable greta array with a distribution") + } + + # handle case when the method is passed like: `laplace_approximation`, rather + # than `laplace_approximation()` + if (is.function(method)) { + method <- method() + } + + if (!inherits(method, "marginalisation_method")) { + stop ("'method' must be a valid marginalisation method. ", + "See ?marginalise for options") + } + + # check the distribution is compatible with the method + method$distribution_check(distrib) + + stop ("not yet implemented") + +} + +#' @rdname marginalisation +#' @export +#' +#' @param fun an R vector giving values at which to evaluate the function for a +#' discrete marginalisation +#' +#' @details \code{discrete_marginalisation} can only be used with discrete +#' probability distributions, e.g. those defined with \code{poisson()} and +#' \code{binomial()}. For discrete distributions with finite support (such as +#' \code{bernoulli()}) the marginalisation will be exact, so long as +#' \code{values} includes all possible values of the variable. For discrete +#' distributions with non-finite support (such as \code{poisson()}, which has +#' no upper bound), the marginalisation can only ever be approximate. However +#' if \code{values} cover a range of values with sufficiently high support in +#' the distribution, that approximation error will be minimal. +discrete_marginalisation <- function(values) { + + if (inherits(values, "greta_array")) { + stop ("'values' must be an R numeric vector, not a greta array") + } + + # define the marginalisation function + marginalisation_function <- function() { + stop ("not yet implemented") + } + + as_marginaliser(name = "discrete", + method = marginalisation_function, + parameters = list(values = values), + distribution_check = discrete_check) + +} + +# check that the distribution is discrete +discrete_check <- function(node) { + if (!distrib$discrete) { + stop ("this marginalisation method can only be used with discrete distributions") + } +} + +# helper to contruct marginalisers +as_marginaliser <- function (name, method, parameters, distribution_check) { + + obj <- list(name = name, + method = method, + parameters = parameters, + distribution_check = distribution_check) + + class_name <- paste0(name, "_marginaliser") + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @noRd +#' @export +print.marginaliser <- function(x, ...) 
{
  msg <- paste(x$name, "marginaliser object")
  cat(msg)
}
diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd
new file mode 100644
index 00000000..5909725b
--- /dev/null
+++ b/man/marginalisation.Rd
@@ -0,0 +1,83 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/marginalise.R
\name{marginalisation}
\alias{marginalisation}
\alias{marginalise}
\alias{discrete_marginalisation}
\title{direct marginalisation of random variables}
\usage{
marginalise(fun, variable, method)

discrete_marginalisation(values)
}
\arguments{
\item{fun}{an R function to integrate with respect to the random variable.
Must take a single argument - a greta array giving the value of the random
variable}

\item{variable}{a variable greta array with a distribution, representing the
random variable to marginalise}

\item{method}{a \code{marginaliser} object giving the method for carrying out
the marginalisation}

\item{fun}{an R vector giving values at which to evaluate the function for a
discrete marginalisation}
}
\value{
\code{discrete_marginalisation} - a \code{marginaliser} object that
 can be passed to \code{marginalise}.
}
\description{
Inference on many statistical models requires marginalisation of
 (i.e. integration of functions over) random variables. In addition to
 general purpose marginalisation methods like MCMC that act on an entire
 model, it is often useful to directly marginalise a single random variable
 within a larger model; e.g. for random variables where marginalisation by
 MCMC is inefficient or impossible, or when an approximate marginalisation
 is substantially more efficient. \code{marginalise()} performs direct
 marginalisation within a greta model, using one of the available marginalisation methods.
}
\details{
\code{discrete_marginalisation} can only be used with discrete
 probability distributions, e.g. those defined with \code{poisson()} and
 \code{binomial()}. For discrete distributions with finite support (such as
 \code{bernoulli()}) the marginalisation will be exact, so long as
 \code{values} includes all possible values of the variable. For discrete
 distributions with non-finite support (such as \code{poisson()}, which has
 no upper bound), the marginalisation can only ever be approximate. However
 if \code{values} cover a range of values with sufficiently high support in
 the distribution, that approximation error will be minimal.
}
\examples{
\dontrun{
# marginalise a discrete random variable and carry out HMC on the
# continuous variables:
y <- rnorm(100)

lambda <- lognormal(0, 1)
theta <- normal(0, 1)
sd <- lognormal(0, 1)

# if discrete variables could be sampled by MCMC in greta,
# we would be able to simply do:
# n <- poisson(lambda)
# mu <- theta ^ n
# distribution(y) <- normal(mu, sd)

# the function to marginalise over
# (only the variable to marginalise is passed in)
fun <- function (n) {
  mu <- theta ^ n
  distribution(y) <- normal(mu, sd)
}
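# editor's note, not generated output: the truncation 0:10 is only safe if
# the poisson mass above 10 is negligible for plausible values of lambda;
# e.g. ppois(10, lambda = 5) is ~0.986, so widen 'values' for larger rates
ppois(10, lambda = 5)

# integrate the function values w.r.t. 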
the poisson distribution +marginalise(fun, + poisson(lambda), + method = discrete_marginalisation(values = 0:10)) + +m <- model(lambda) +draws <- mcmc(m, hmc()) +} +} From 1e5e9077c65a3a1df9d13501b2a83a801d38a459 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 14:40:31 +1100 Subject: [PATCH 02/38] add tests for marginalisation error messages --- R/marginalise.R | 24 +++++------ tests/testthat/test_marginalisation.R | 57 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 11 deletions(-) create mode 100644 tests/testthat/test_marginalisation.R diff --git a/R/marginalise.R b/R/marginalise.R index 42f3af61..38637461 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -71,13 +71,7 @@ marginalise <- function(fun, variable, method) { stop ("'variable' must be a variable greta array with a distribution") } - # handle case when the method is passed like: `laplace_approximation`, rather - # than `laplace_approximation()` - if (is.function(method)) { - method <- method() - } - - if (!inherits(method, "marginalisation_method")) { + if (!inherits(method, "marginaliser")) { stop ("'method' must be a valid marginalisation method. ", "See ?marginalise for options") } @@ -106,8 +100,14 @@ marginalise <- function(fun, variable, method) { #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { - if (inherits(values, "greta_array")) { - stop ("'values' must be an R numeric vector, not a greta array") + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop (msg) } # define the marginalisation function @@ -123,9 +123,11 @@ discrete_marginalisation <- function(values) { } # check that the distribution is discrete -discrete_check <- function(node) { +discrete_check <- function(distrib) { if (!distrib$discrete) { - stop ("this marginalisation method can only be used with discrete distributions") + stop ("this marginalisation method can only be used ", + "with discrete distributions", + call. 
= FALSE) } } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R new file mode 100644 index 00000000..1558dbb3 --- /dev/null +++ b/tests/testthat/test_marginalisation.R @@ -0,0 +1,57 @@ +context("marginalisation") + +test_that("marginalise errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # not a function + expect_error( + marginalise("apple", normal(0, 1), discrete_marginalisation()), + "must be an R function" + ) + + # not a greta array + expect_error( + marginalise(I, 1:5, discrete_marginalisation()), + "must be a variable greta array with a distribution" + ) + + # greta array but no distribution + expect_error( + marginalise(I, variable(), discrete_marginalisation()), + "must be a variable greta array with a distribution" + ) + + # not a marginaliser + expect_error( + marginalise(I, normal(0, 1), mean), + "'method' must be a valid marginalisation method" + ) + +}) + +test_that("discrete_marginalisation errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # greta array, not a numeric + expect_error( + discrete_marginalisation(values = variable()), + "must be an R numeric vector, not a greta array" + ) + + # not a numeric + expect_error( + discrete_marginalisation(values = c("apple", "banana")), + "must be an R numeric vector$" + ) + + # mismatch with distribution + expect_error( + marginalise(I, normal(0, 1), discrete_marginalisation(values = 1:5)), + "can only be used with discrete distributions" + ) + +}) From 3b56a6cddf54cc0a6001d7380c5fc67768c5a86e Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 15:07:17 +1100 Subject: [PATCH 03/38] fix typo in marginalisation docs --- R/marginalise.R | 2 +- man/marginalisation.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 38637461..9c8bb37e 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -86,7 +86,7 @@ marginalise <- function(fun, variable, method) { #' @rdname marginalisation #' @export #' -#' @param fun an R vector giving values at which to evaluate the function for a +#' @param values an R vector giving values at which to evaluate the function for a #' discrete marginalisation #' #' @details \code{discrete_marginalisation} can only be used with discrete diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index 5909725b..fb05930f 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -21,7 +21,7 @@ random variable to marginalise} \item{method}{a \code{marginaliser} object giving the method for carrying out the marginalisation} -\item{fun}{an R vector giving values at which to evaluate the function for a +\item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} } \value{ From 0c766e6e0dc4a3947dcbba62f51edc891fcffcfe Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 16:52:42 +1100 Subject: [PATCH 04/38] flesh out internals of marginalisation --- R/marginalise.R | 128 ++++++++++++++++++++++++++++++++++++++--- man/marginalisation.Rd | 19 +++--- 2 files changed, 131 insertions(+), 16 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 9c8bb37e..d018348c 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -31,9 +31,9 @@ #' # mu <- theta ^ n #' # distribution(y) <- normal(mu, sd) #' -#' # the function to marginalise over -#' # (only the variable to marginalise is passed in) -#' fun <- function (n) { +#' # the function to marginalise over - the variable to marginalise 
+#' # must be first, others must be passed to marginalise() +#' fun <- function (n, theta, mu, sd) { #' mu <- theta ^ n #' distribution(y) <- normal(mu, sd) #' } @@ -41,7 +41,10 @@ #' # integrate the function values w.r.t. the poisson distribution #' marginalise(fun, #' poisson(lambda), -#' method = discrete_marginalisation(values = 0:10)) +#' method = discrete_marginalisation(values = 0:10), +#' theta = theta, +#' mu = mu, +#' sd = sd) #' #' m <- model(lambda) #' draws <- mcmc(m, hmc()) @@ -52,14 +55,16 @@ NULL #' @export #' #' @param fun an R function to integrate with respect to the random variable. -#' Must take a single argument - a greta array giving the value of the random -#' variable +#' The first argument must be - a greta array giving the value of the random +#' variable, any subsequent arguments must passed in via the \dots argument. #' @param variable a variable greta array with a distribution, representing the #' random variable to marginalise #' @param method a \code{marginaliser} object giving the method for carrying out #' the marginalisation -marginalise <- function(fun, variable, method) { +#' @param \dots named greta arrays to be passed to \code{fun} +marginalise <- function(fun, variable, method, ...) { + # get the distribution for the random variable distrib <- get_node(variable)$distribution # check the inputs @@ -67,6 +72,14 @@ marginalise <- function(fun, variable, method) { stop ("'fun' must be an R function") } + # check the additional arguments to fun are given in dots + dots <- list(...) + dot_names <- names(dots) + expected_names <- names(formals(fun)[-1]) + if (!all(expected_names %in% dot_names)) { + stop ("all arguments to 'fun' must be passed, named, to marginalise") + } + if (!inherits(distrib, "distribution_node")) { stop ("'variable' must be a variable greta array with a distribution") } @@ -79,10 +92,98 @@ marginalise <- function(fun, variable, method) { # check the distribution is compatible with the method method$distribution_check(distrib) + # excise the variable from the distribution + distrib$remove_target() + stop ("not yet implemented") + # turn the greta function into a TF conditional density function; doing + # something very similar to as_tf_function(), but giving a function that + # returns a tensorflow scalar for the density + conditional_joint_density <- as_conditional_density(fun) + + # get a tf function from the method which will turn that conditional density + # function into a marginal density (conditional on the inputs to the + # distribution) to be added to the overall model density + method$tf_marginaliser + + # pass the conditional density function and the marginaliser TF function to a + # marginalisation distribution (needs an R6 class) + + # create the distribution + vble <- distrib( + "marginalisation", + marginaliser = method, + conditional_density_fun = conditional_joint_density, + distribution = distrib, + dots = dots + ) + + # excise the variable from the marginalisation distribution + distrib <- get_node(variable)$distribution + distrib$remove_target() + + # return nothing + invisible(NULL) + } +marginalisation_distribution <- R6Class( + "marginalisation_distribution", + inherit = distribution_node, + public = list( + + tf_marginaliser = NULL, + conditional_density_fun = NULL, + + initialize = function(marginaliser, + conditional_density_fun, + distribution, + dots) { + + # initialize class, and add methods + super$initialize("marginalisation") + self$tf_marginaliser <- marginaliser$tf_marginaliser + 
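      # store the traced conditional density function (built in marginalise()
      # via as_conditional_density()) so that tf_distrib() can evaluate it
      # against the marginalised distribution when the model graph is defined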
self$conditional_density_fun <- conditional_density_fun + + # add the dots (extra inputs to conditional_density_fun) as parameters + dot_nodes <- lapply(dots, get_node) + for (i in seq_len(dot_nodes)) { + self$add_parameter(dot_nodes[[i]], + paste("input", i)) + } + + # add the distribution as a parameter + self$add_parameter(distribution, "distribution") + + }, + + tf_distrib = function(parameters, dag) { + + # do the marginalisation + + # but it has no target!! + # should we handle distributions with no targets as a special case? + # or should this not be a distribution after all? + + log_prob <- function(x) { + + self$tf_marginaliser(self$conditional_density_fun, + parameters$distribution, + parameters[names(parameters) != "distribution"]) + + } + + list(log_prob = log_prob, cdf = NULL, log_cdf = NULL) + + }, + + tf_cdf_function = NULL, + tf_log_cdf_function = NULL + + ) +) + #' @rdname marginalisation #' @export #' @@ -111,12 +212,21 @@ discrete_marginalisation <- function(values) { } # define the marginalisation function - marginalisation_function <- function() { + tf_marginaliser <- function(conditional_density_fun, + distribution, + other_args) { + # base this on mixture; + # 1. get weights from the distribution (use its log_pdf tf method on the + # values) + # 2. compute the conditional joint density for each value (passing in + # other_args) + # 3. compute a weighted sum with tf_reduce_log_sum_exp() + stop ("not yet implemented") } as_marginaliser(name = "discrete", - method = marginalisation_function, + tf_marginaliser = tf_marginaliser, parameters = list(values = values), distribution_check = discrete_check) diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index fb05930f..f5968662 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -6,14 +6,14 @@ \alias{discrete_marginalisation} \title{direct marginalisation of random variables} \usage{ -marginalise(fun, variable, method) +marginalise(fun, variable, method, ...) discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. -Must take a single argument - a greta array giving the value of the random -variable} +The first argument must be - a greta array giving the value of the random +variable, any subsequent arguments must passed in via the \dots argument.} \item{variable}{a variable greta array with a distribution, representing the random variable to marginalise} @@ -21,6 +21,8 @@ random variable to marginalise} \item{method}{a \code{marginaliser} object giving the method for carrying out the marginalisation} +\item{\dots}{named greta arrays to be passed to \code{fun}} + \item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} } @@ -65,9 +67,9 @@ sd <- lognormal(0, 1) # mu <- theta ^ n # distribution(y) <- normal(mu, sd) -# the function to marginalise over -# (only the variable to marginalise is passed in) -fun <- function (n) { +# the function to marginalise over - the variable to marginalise +# must be first, others must be passed to marginalise() +fun <- function (n, theta, mu, sd) { mu <- theta ^ n distribution(y) <- normal(mu, sd) } @@ -75,7 +77,10 @@ fun <- function (n) { # integrate the function values w.r.t. 
the poisson distribution
marginalise(fun,
            poisson(lambda),
            method = discrete_marginalisation(values = 0:10),
            theta = theta,
            mu = mu,
            sd = sd)

m <- model(lambda)
draws <- mcmc(m, hmc())
}
}

From 7e802cbce65a3a1df9d13501b2a83a801d38a459 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Tue, 8 Jan 2019 12:00:25 +1100
Subject: [PATCH 05/38] fill in last of guess at internals

---
 R/marginalise.R | 164 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 130 insertions(+), 34 deletions(-)

diff --git a/R/marginalise.R b/R/marginalise.R
index d018348c..977ba05f 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -55,13 +55,18 @@ NULL
 #' @export
 #'
 #' @param fun an R function to integrate with respect to the random variable.
-#' The first argument must be - a greta array giving the value of the random
-#' variable, any subsequent arguments must passed in via the \dots argument.
+#' The first argument must be a greta array for the random variable, any
+#' subsequent arguments must be passed in via the \dots argument.
 #' @param variable a variable greta array with a distribution, representing the
 #' random variable to marginalise
 #' @param method a \code{marginaliser} object giving the method for carrying out
 #' the marginalisation
 #' @param \dots named greta arrays to be passed to \code{fun}
+#'
+#' @details The code in \code{fun} must define at least one distribution over
+#' data (i.e. a model likelihood), and cannot create any new variables. Any
+#' variables must be created outside this function, and passed in via the \dots
+#' argument.
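To make the contract in the new @details concrete, here is a hedged editor's sketch (not part of the patch) of a function that satisfies it and one that does not; `y` is observed data, and `theta` and `sd` are variables created outside and passed in via \dots:

y <- rnorm(10)
theta <- normal(0, 1)
sd <- lognormal(0, 1)

# fine: uses only its arguments and the data, and defines a likelihood
ok <- function (n, theta, sd) {
  distribution(y) <- normal(theta ^ n, sd)
}

# not allowed: creates a new variable inside the function, so the
# "must not create any new variables" check will trip at model definition
bad <- function (n) {
  extra <- normal(0, 1)
  distribution(y) <- normal(extra ^ n, 1)
}

marginalise <- function(fun, variable, method, ...) 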
{ dots = dots ) - # excise the variable from the marginalisation distribution - distrib <- get_node(variable)$distribution - distrib$remove_target() - - # return nothing + # return nothing (users can't have distribution nodes on their own) invisible(NULL) } @@ -153,24 +153,22 @@ marginalisation_distribution <- R6Class( paste("input", i)) } - # add the distribution as a parameter - self$add_parameter(distribution, "distribution") + # make the distribution the target + self$remove_target() + self$add_target(distribution) }, tf_distrib = function(parameters, dag) { - # do the marginalisation - - # but it has no target!! - # should we handle distributions with no targets as a special case? - # or should this not be a distribution after all? - + # the marginal density implied by the function log_prob <- function(x) { + # x will be the target; a tf function for the distribution being + # marginalised self$tf_marginaliser(self$conditional_density_fun, - parameters$distribution, - parameters[names(parameters) != "distribution"]) + tf_distribution_log_pdf = x, + parameters) } @@ -212,17 +210,37 @@ discrete_marginalisation <- function(values) { } # define the marginalisation function - tf_marginaliser <- function(conditional_density_fun, - distribution, + tf_marginaliser <- function(tf_conditional_density_fun, + tf_distribution_log_pdf, other_args) { - # base this on mixture; - # 1. get weights from the distribution (use its log_pdf tf method on the - # values) + + values_list <- as.list(values) + + # 1. get weights from the distribution log pdf + + # assuming values is a list, get tensors for the weights + weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(weights_list, tf_sum) + + # convert to a vector and make them sum to 1 + weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- weights_vec / tf_sum(weights_vec) + log_weights_vec <- tf$log(weights_vec) + # 2. compute the conditional joint density for each value (passing in - # other_args) - # 3. compute a weighted sum with tf_reduce_log_sum_exp() + # other_args) + log_density_list <- lapply( + values_list, + tf_conditional_density_fun, + other_args + ) + + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # 3. compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights_vec + tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - stop ("not yet implemented") } as_marginaliser(name = "discrete", @@ -242,10 +260,10 @@ discrete_check <- function(distrib) { } # helper to contruct marginalisers -as_marginaliser <- function (name, method, parameters, distribution_check) { +as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { obj <- list(name = name, - method = method, + tf_marginaliser = tf_marginaliser, parameters = parameters, distribution_check = distribution_check) @@ -261,3 +279,81 @@ print.marginaliser <- function(x, ...) { msg <- paste(x$name, "marginaliser object") cat(msg) } + +# turn the greta function into a TF conditional density function; doing +# something very similar to as_tf_function(), but giving a function that +# returns a tensorflow scalar for the density +as_conditional_density <- function(r_fun, args) { + + # run the operation on isolated greta arrays, so nothing gets attached to the + # model. Real greta arrays in args + ga_dummies <- lapply(args, dummy_greta_array) + out <- do.call(r_fun, ga_dummies) + + # we don't want to rely on the function returning a greta array that depends + # on everything internally. 
instead, we want to grab any densities defined in + # the function. + + # this function will take in tensors corresponding to the things in args + function(...) { + + tensor_inputs <- list(...) + + # if any of these are shapeless, make them into greta scalars (3D) + tensor_inputs <- lapply(tensor_inputs, + function(x) { + if (identical(dim(x), list())) + x <- tf$reshape(x, shape(1, 1, 1)) + x + }) + + # transfer batch dimensions if needed + tensor_inputs <- match_batches(tensor_inputs) + + # create a sub-dag for everything connected to the inputs + sub_dag <- dag_class$new(ga_dummies) + + # check there are distributions + if (!any(sub_dag$node_types == "distribution")) { + stop ("'fun' must constain at least one distribution over data", + call. = FALSE) + } + + # check there are no variables + if (any(sub_dag$node_types == "variable")) { + stop ("'fun' must not create any new variables; ", + "variables can be passed in as arguments", + call. = FALSE) + } + + # use the default graph, so that it can be overwritten when this is called + # (using on_graph()) + sub_dag$tf_graph <- tf$get_default_graph() + sub_tfe <- sub_dag$tf_environment + + # set the input tensors as the values for the dummy greta arrays in the new + # tf_environment + node_dummies <- lapply(ga_dummies, get_node) + tf_names <- lapply(node_dummies, sub_dag$tf_name) + for (i in seq_along(tf_names)) + assign(tf_names[[i]], tensor_inputs[[i]], envir = sub_tfe) + + # have any new data defined as constants, to avoid placeholding issues in + # the wider model + greta_stash$data_as_constants <- TRUE + on.exit(greta_stash$data_as_constants <- NULL) + + # define all nodes (only data, operation and distribution) in the + # environment, and on the graph + sub_dag$on_graph(lapply(sub_dag$node_list, + function(x) x$define_tf(self))) + + # define and return the joint density tensor (no variables, so no need for + # log jacobian adjustment) + sub_dag$define_joint_density() + + sub_tfe$joint_density + + } + +} From ad98d72ea74d629c2e4cea7a99413b6b82660e8a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 14:46:04 +1100 Subject: [PATCH 06/38] be strict about float type in as_tf_function --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 9bfc0f6f..dc50d44e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1151,7 +1151,8 @@ as_tf_function <- function(r_fun, ...) { if (!is.list(ga_out)) ga_out <- list(ga_out) targets <- c(ga_out, ga_dummies) - sub_dag <- dag_class$new(targets) + sub_dag <- dag_class$new(targets, + tf_float = options()$greta_tf_float) # use the default graph, so that it can be overwritten when this is called? # alternatively fetch from above, or put it in greta_stash? 
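Patch 06 matters because TensorFlow does not silently promote between float precisions; if the sub-dag built inside `as_tf_function()` defaulted to a different float type than the enclosing model, combining their tensors would fail. A minimal sketch of that failure mode, using the tensorflow R package (an editor's illustration, not code from the patch):

library(tensorflow)

x32 <- tf$constant(1, dtype = tf$float32)
x64 <- tf$constant(1, dtype = tf$float64)

# mixing precisions is an error rather than an implicit promotion:
# tf$add(x32, x64)  # fails with an incompatible-dtype error

# an explicit cast is required instead
tf$add(tf$cast(x32, tf$float64), x64)

This is why the sub-dag now inherits the model-wide float type from options()$greta_tf_float.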
From 9219a42a97acc4e76c18cafc6794085b1e00bece Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 14:46:52 +1100 Subject: [PATCH 07/38] bugfixes to get marginalise apparently working --- R/dag_class.R | 30 ++++++++++++++++------------ R/marginalise.R | 45 +++++++++++++++++++++++------------------- man/marginalisation.Rd | 9 +++++++-- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index eb6fe48e..b05bae02 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -242,7 +242,7 @@ dag_class <- R6Class( }, # define tensor for overall log density and gradients - define_joint_density = function() { + define_joint_density = function(adjusted = TRUE) { tfe <- self$tf_environment @@ -293,22 +293,26 @@ dag_class <- R6Class( joint_density, envir = self$tf_environment) - # define adjusted joint density + if (adjusted) { - # get names of adjustment tensors for all variable nodes - adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") + # optionally define adjusted joint density - # get TF density tensors for all distribution - adj <- lapply(adj_names, get, envir = self$tf_environment) + # get names of adjustment tensors for all variable nodes + adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") - # remove their names and sum them together - names(adj) <- NULL - self$on_graph(total_adj <- tf$add_n(adj)) + # get TF density tensors for all distribution + adj <- lapply(adj_names, get, envir = self$tf_environment) - # assign overall density to environment - assign("joint_density_adj", - joint_density + total_adj, - envir = self$tf_environment) + # remove their names and sum them together + names(adj) <- NULL + self$on_graph(total_adj <- tf$add_n(adj)) + + # assign overall density to environment + assign("joint_density_adj", + joint_density + total_adj, + envir = self$tf_environment) + + } }, diff --git a/R/marginalise.R b/R/marginalise.R index 977ba05f..513000c3 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -70,7 +70,7 @@ NULL marginalise <- function(fun, variable, method, ...) { # get the distribution for the random variable - distrib <- get_node(variable)$distribution + distr <- get_node(variable)$distribution # check the inputs if (!is.function(fun)) { @@ -85,7 +85,7 @@ marginalise <- function(fun, variable, method, ...) { stop ("all arguments to 'fun' must be passed, named, to marginalise") } - if (!inherits(distrib, "distribution_node")) { + if (!inherits(distr, "distribution_node")) { stop ("'variable' must be a variable greta array with a distribution") } @@ -95,10 +95,10 @@ marginalise <- function(fun, variable, method, ...) { } # check the distribution is compatible with the method - method$distribution_check(distrib) + method$distribution_check(distr) # excise the variable from the distribution - distrib$remove_target() + distr$remove_target() # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that @@ -116,10 +116,10 @@ marginalise <- function(fun, variable, method, ...) 
{ # create the distribution vble <- distrib( - "marginalisation", + distribution = "marginalisation", marginaliser = method, conditional_density_fun = conditional_joint_density, - distribution = distrib, + target_distribution = distr, dots = dots ) @@ -138,7 +138,7 @@ marginalisation_distribution <- R6Class( initialize = function(marginaliser, conditional_density_fun, - distribution, + target_distribution, dots) { # initialize class, and add methods @@ -148,14 +148,14 @@ marginalisation_distribution <- R6Class( # add the dots (extra inputs to conditional_density_fun) as parameters dot_nodes <- lapply(dots, get_node) - for (i in seq_len(dot_nodes)) { + for (i in seq_along(dot_nodes)) { self$add_parameter(dot_nodes[[i]], paste("input", i)) } # make the distribution the target self$remove_target() - self$add_target(distribution) + self$add_target(target_distribution) }, @@ -213,11 +213,14 @@ discrete_marginalisation <- function(values) { tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, other_args) { - + # convert these into a list of constant tensors with the correct dimensions + # and float types values_list <- as.list(values) + values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, add_first_dim) + values_list <- lapply(values_list, fl) # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights weights_list <- lapply(values_list, tf_distribution_log_pdf) weights_list <- lapply(weights_list, tf_sum) @@ -229,12 +232,14 @@ discrete_marginalisation <- function(values) { # 2. compute the conditional joint density for each value (passing in # other_args) - log_density_list <- lapply( - values_list, - tf_conditional_density_fun, - other_args - ) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), other_args) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) log_density_vec <- tf$concat(log_density_list, axis = 1L) # 3. compute a weighted sum @@ -311,7 +316,7 @@ as_conditional_density <- function(r_fun, args) { tensor_inputs <- match_batches(tensor_inputs) # create a sub-dag for everything connected to the inputs - sub_dag <- dag_class$new(ga_dummies) + sub_dag <- dag_class$new(ga_dummies, tf_float = options()$greta_tf_float) # check there are distributions if (!any(sub_dag$node_types == "distribution")) { @@ -321,7 +326,7 @@ as_conditional_density <- function(r_fun, args) { # check there are no variables if (any(sub_dag$node_types == "variable")) { - stop ("'fun' must not create any new variables; ", + stop ("'fun' must not create any new variables, ", "variables can be passed in as arguments", call. 
= FALSE) } @@ -346,11 +351,11 @@ as_conditional_density <- function(r_fun, args) { # define all nodes (only data, operation and distribution) in the # environment, and on the graph sub_dag$on_graph(lapply(sub_dag$node_list, - function(x) x$define_tf(self))) + function(x) x$define_tf(sub_dag))) # define and return the joint density tensor (no variables, so no need for # log jacobian adjustment) - sub_dag$define_joint_density() + sub_dag$define_joint_density(adjusted = FALSE) sub_tfe$joint_density diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index f5968662..958a3ec6 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -12,8 +12,8 @@ discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. -The first argument must be - a greta array giving the value of the random -variable, any subsequent arguments must passed in via the \dots argument.} +The first argument must be a greta array for the random variable, any +subsequent arguments must passed in via the \dots argument.} \item{variable}{a variable greta array with a distribution, representing the random variable to marginalise} @@ -41,6 +41,11 @@ Inference on many statistical models requires marginalisation of marginalisation within a greta model, using one of the available marginalisation methods. } \details{ +The code in \code{fun} must define at least one distribution over + data (ie. a model likelihood), and cannot create any new variables. Any + variables must be created outide this function, and passed in via the \dots + argument. + \code{discrete_marginalisation} can only be used with discrete probability distributions, e.g. those defined with \code{poisson()} and \code{binomial()}. For discrete distributions with finite support (such as From 24fabd1a959308e784e178567cbf8a066d132bf6 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 16:53:36 +1100 Subject: [PATCH 08/38] add marginalisation tests and realted fixes; fix marginalise() example --- R/marginalise.R | 11 ++-- man/marginalisation.Rd | 5 +- tests/testthat/test_marginalisation.R | 95 ++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 513000c3..6a4ff340 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -33,17 +33,16 @@ #' #' # the function to marginalise over - the variable to marginalise #' # must be first, others must be passed to marginalise() -#' fun <- function (n, theta, mu, sd) { +#' likelihood <- function (n, theta, sd) { #' mu <- theta ^ n #' distribution(y) <- normal(mu, sd) #' } #' #' # integrate the function values w.r.t. 
the poisson distribution -#' marginalise(fun, -#' poisson(lambda), +#' marginalise(fun = likelihood, +#' variable = poisson(lambda), #' method = discrete_marginalisation(values = 0:10), #' theta = theta, -#' mu = mu, #' sd = sd) #' #' m <- model(lambda) @@ -213,6 +212,7 @@ discrete_marginalisation <- function(values) { tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, other_args) { + # convert these into a list of constant tensors with the correct dimensions # and float types values_list <- as.list(values) @@ -225,8 +225,9 @@ discrete_marginalisation <- function(values) { weights_list <- lapply(values_list, tf_distribution_log_pdf) weights_list <- lapply(weights_list, tf_sum) - # convert to a vector and make them sum to 1 + # convert to a vector of discrete probabilities and make them sum to 1 weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- tf$exp(weights_vec) weights_vec <- weights_vec / tf_sum(weights_vec) log_weights_vec <- tf$log(weights_vec) diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index 958a3ec6..e3365ab7 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -74,17 +74,16 @@ sd <- lognormal(0, 1) # the function to marginalise over - the variable to marginalise # must be first, others must be passed to marginalise() -fun <- function (n, theta, mu, sd) { +likelihood <- function (n, theta, sd) { mu <- theta ^ n distribution(y) <- normal(mu, sd) } # integrate the function values w.r.t. the poisson distribution -marginalise(fun, +marginalise(likelihood, poisson(lambda), method = discrete_marginalisation(values = 0:10), theta = theta, - mu = mu, sd = sd) m <- model(lambda) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 1558dbb3..27ce7c23 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -7,13 +7,13 @@ test_that("marginalise errors nicely", { # not a function expect_error( - marginalise("apple", normal(0, 1), discrete_marginalisation()), + marginalise("apple", poisson(0.1), discrete_marginalisation(0:2)), "must be an R function" ) # not a greta array expect_error( - marginalise(I, 1:5, discrete_marginalisation()), + marginalise(I, 1:5, discrete_marginalisation(0:2)), "must be a variable greta array with a distribution" ) @@ -25,12 +25,33 @@ test_that("marginalise errors nicely", { # not a marginaliser expect_error( - marginalise(I, normal(0, 1), mean), + marginalise(I, poisson(0.1), mean), "'method' must be a valid marginalisation method" ) + # the following can only be assessed on final model definition + + # function adds variables + fun <- function (x) {x + normal(0, 1)} + lambda <- variable() + marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) + expect_error( + mcmc(model(lambda)), + "must not create any new variables" + ) + + # function has no distribution + fun <- function (x) {x * 2} + lambda <- variable() + marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) + expect_error( + mcmc(model(lambda)), + "must constain at least one distribution over data" + ) + }) + test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) @@ -55,3 +76,71 @@ test_that("discrete_marginalisation errors nicely", { ) }) + + +test_that("inference runs with discrete marginalisation", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # spike+slab poisson glm (in reality this is poorly identified) + n <- 100 + x <- runif(n, -5, 5) + y <- rpois(n, exp(1)) + p <- 
variable(lower = 0, upper = 1) + alpha <- variable() + beta <- variable() + likelihood <- function (z, alpha, beta) { + eta <- alpha + beta * x * z + lambda <- exp(eta) + distribution(y) <- poisson(lambda) + } + marginalise(likelihood, + bernoulli(p), + method = discrete_marginalisation(0:1), + alpha = alpha, + beta = beta) + m <- model(alpha, beta, p) + + expect_ok(o <- opt(m)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20)) + +}) + + +test_that("discrete marginalisation gives correct densities", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # no constraints to no adjustment to account for + lambda <- variable() + y <- rnorm(100) + likelihood <- function (z) { + distribution(y) <- normal(z, 1) + } + values <- 0:5 + marginalise(likelihood, + poisson(lambda), + method = discrete_marginalisation(values)) + m <- model(lambda) + o <- opt(m) + + lambda_val <- runif(1, 0, 5) + + # manually compute expected marginal likelihood given this value of lambda + wt <- dpois(values, lambda_val) + wt <- wt / sum(wt) + dens <- vapply(y, dnorm, values, 1, + FUN.VALUE = rep(1, length(values))) + dens <- apply(dens, 1, prod) + expected <- log(sum(wt * dens)) + + # compute with greta + m$dag$send_parameters(lambda_val) + observed <- m$dag$log_density() + + # compare results (within tolerance) + compare_op(expected, observed) + +}) From 7b9d845004d2b25e0af1213143e96b15689d0256 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 14:16:14 +1100 Subject: [PATCH 09/38] split marginalisers into separate helpfile --- DESCRIPTION | 1 + R/marginalise.R | 111 +------------------------------------- R/marginalisers.R | 117 +++++++++++++++++++++++++++++++++++++++++ man/marginalisation.Rd | 27 ++-------- man/marginalisers.Rd | 33 ++++++++++++ 5 files changed, 157 insertions(+), 132 deletions(-) create mode 100644 R/marginalisers.R create mode 100644 man/marginalisers.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 64ebfb03..ee6dbb9e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,6 +62,7 @@ Collate: 'calculate.R' 'callbacks.R' 'marginalise.R' + 'marginalisers.R' Imports: R6, tensorflow, diff --git a/R/marginalise.R b/R/marginalise.R index 6a4ff340..d706f1fc 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -10,10 +10,8 @@ #' within a larger model; e.g. for random variables where marginalisation by #' MCMC is inefficient, impossible, or when an approximate marginalisation is #' substantially more efficient. \code{marginalise()} performs direct -#' marginalisation within a greta model, using one of the available marginalisation methods. -#' -#' @return \code{discrete_marginalisation} - a \code{marginaliser} object that -#' can be passed to \code{marginalise}. +#' marginalisation within a greta model, using one of the available +#' marginalisation methods, detailed in \link{marginalisers}. #' #' @examples #' \dontrun{ @@ -181,111 +179,6 @@ marginalisation_distribution <- R6Class( ) ) -#' @rdname marginalisation -#' @export -#' -#' @param values an R vector giving values at which to evaluate the function for a -#' discrete marginalisation -#' -#' @details \code{discrete_marginalisation} can only be used with discrete -#' probability distributions, e.g. those defined with \code{poisson()} and -#' \code{binomial()}. For discrete distributions with finite support (such as -#' \code{bernoulli()}) the marginalisation will be exact, so long as -#' \code{values} includes all possible values of the variable. 
For discrete -#' distributions with non-finite support (such as \code{poisson()}, which has -#' no upper bound), the marginalisation can only ever be approximate. However -#' if \code{values} cover a range of values with sufficiently high support in -#' the distribution, that approximation error will be minimal. -discrete_marginalisation <- function(values) { - - if (!is.vector(values) | !is.numeric(values)) { - msg <- "'values' must be an R numeric vector" - - if (inherits(values, "greta_array")) { - msg <- paste0(msg, ", not a greta array") - } - - stop (msg) - } - - # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, - other_args) { - - # convert these into a list of constant tensors with the correct dimensions - # and float types - values_list <- as.list(values) - values_list <- lapply(values_list, as_2D_array) - values_list <- lapply(values_list, add_first_dim) - values_list <- lapply(values_list, fl) - - # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tf_distribution_log_pdf) - weights_list <- lapply(weights_list, tf_sum) - - # convert to a vector of discrete probabilities and make them sum to 1 - weights_vec <- tf$concat(weights_list, axis = 1L) - weights_vec <- tf$exp(weights_vec) - weights_vec <- weights_vec / tf_sum(weights_vec) - log_weights_vec <- tf$log(weights_vec) - - # 2. compute the conditional joint density for each value (passing in - # other_args) - log_density_list <- list() - for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args) - log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) - } - - log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) - log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) - log_density_vec <- tf$concat(log_density_list, axis = 1L) - - # 3. compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights_vec - tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - - } - - as_marginaliser(name = "discrete", - tf_marginaliser = tf_marginaliser, - parameters = list(values = values), - distribution_check = discrete_check) - -} - -# check that the distribution is discrete -discrete_check <- function(distrib) { - if (!distrib$discrete) { - stop ("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) - } -} - -# helper to contruct marginalisers -as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { - - obj <- list(name = name, - tf_marginaliser = tf_marginaliser, - parameters = parameters, - distribution_check = distribution_check) - - class_name <- paste0(name, "_marginaliser") - class(obj) <- c(class_name, "marginaliser") - obj - -} - -#' @noRd -#' @export -print.marginaliser <- function(x, ...) 
{ - msg <- paste(x$name, "marginaliser object") - cat(msg) -} - # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density diff --git a/R/marginalisers.R b/R/marginalisers.R new file mode 100644 index 00000000..4612cafc --- /dev/null +++ b/R/marginalisers.R @@ -0,0 +1,117 @@ +# marginaliser constructors + +#' @name marginalisers +#' +#' @title marginalisation methods +#' @description Functions to set up marginalisers (which explicitly integrate +#' out random variables) and modify their behaviour, for use in +#' \code{\link{marginalise}()}. +#' +#' @return a \code{marginaliser} object that can be passed to +#' \code{marginalise}. +NULL + +#' @rdname marginalisers +#' @export +#' +#' @param values an R vector giving values at which to evaluate the function for a +#' discrete marginalisation +#' +#' @details \code{discrete_marginalisation} can only be used with discrete +#' probability distributions, e.g. those defined with \code{poisson()} and +#' \code{binomial()}. For discrete distributions with finite support (such as +#' \code{bernoulli()}) the marginalisation will be exact, so long as +#' \code{values} includes all possible values of the variable. For discrete +#' distributions with non-finite support (such as \code{poisson()}, which has +#' no upper bound), the marginalisation can only ever be approximate. However +#' if \code{values} cover a range of values with sufficiently high support in +#' the distribution, that approximation error will be minimal. +discrete_marginalisation <- function(values) { + + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop (msg) + } + + # define the marginalisation function + tf_marginaliser <- function(tf_conditional_density_fun, + tf_distribution_log_pdf, + other_args) { + + # convert these into a list of constant tensors with the correct dimensions + # and float types + values_list <- as.list(values) + values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, add_first_dim) + values_list <- lapply(values_list, fl) + + # 1. get weights from the distribution log pdf + # assuming values is a list, get tensors for the weights + weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1 + weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- tf$exp(weights_vec) + weights_vec <- weights_vec / tf_sum(weights_vec) + log_weights_vec <- tf$log(weights_vec) + + # 2. compute the conditional joint density for each value (passing in + # other_args) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), other_args) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # 3. 
compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights_vec + tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + + } + + as_marginaliser(name = "discrete", + tf_marginaliser = tf_marginaliser, + parameters = list(values = values), + distribution_check = discrete_check) + +} + +# check that the distribution is discrete +discrete_check <- function(distrib) { + if (!distrib$discrete) { + stop ("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } +} + +# helper to contruct marginalisers +as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { + + obj <- list(name = name, + tf_marginaliser = tf_marginaliser, + parameters = parameters, + distribution_check = distribution_check) + + class_name <- paste0(name, "_marginaliser") + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @noRd +#' @export +print.marginaliser <- function(x, ...) { + msg <- paste(x$name, "marginaliser object") + cat(msg) +} diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index e3365ab7..10eb2a0e 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -3,12 +3,9 @@ \name{marginalisation} \alias{marginalisation} \alias{marginalise} -\alias{discrete_marginalisation} \title{direct marginalisation of random variables} \usage{ marginalise(fun, variable, method, ...) - -discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. @@ -22,13 +19,6 @@ random variable to marginalise} the marginalisation} \item{\dots}{named greta arrays to be passed to \code{fun}} - -\item{values}{an R vector giving values at which to evaluate the function for a -discrete marginalisation} -} -\value{ -\code{discrete_marginalisation} - a \code{marginaliser} object that - can be passed to \code{marginalise}. } \description{ Inference on many statistical models requires marginalisation of @@ -38,23 +28,14 @@ Inference on many statistical models requires marginalisation of within a larger model; e.g. for random variables where marginalisation by MCMC is inefficient, impossible, or when an approximate marginalisation is substantially more efficient. \code{marginalise()} performs direct - marginalisation within a greta model, using one of the available marginalisation methods. + marginalisation within a greta model, using one of the available + marginalisation methods, detailed in \link{marginalisers}. } \details{ The code in \code{fun} must define at least one distribution over data (ie. a model likelihood), and cannot create any new variables. Any variables must be created outide this function, and passed in via the \dots argument. - -\code{discrete_marginalisation} can only be used with discrete - probability distributions, e.g. those defined with \code{poisson()} and - \code{binomial()}. For discrete distributions with finite support (such as - \code{bernoulli()}) the marginalisation will be exact, so long as - \code{values} includes all possible values of the variable. For discrete - distributions with non-finite support (such as \code{poisson()}, which has - no upper bound), the marginalisation can only ever be approximate. However - if \code{values} cover a range of values with sufficiently high support in - the distribution, that approximation error will be minimal. } \examples{ \dontrun{ @@ -80,8 +61,8 @@ likelihood <- function (n, theta, sd) { } # integrate the function values w.r.t. 
the poisson distribution -marginalise(likelihood, - poisson(lambda), +marginalise(fun = likelihood, + variable = poisson(lambda), method = discrete_marginalisation(values = 0:10), theta = theta, sd = sd) diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd new file mode 100644 index 00000000..aea7562e --- /dev/null +++ b/man/marginalisers.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalisers.R +\name{marginalisers} +\alias{marginalisers} +\alias{discrete_marginalisation} +\title{marginalisation methods} +\usage{ +discrete_marginalisation(values) +} +\arguments{ +\item{values}{an R vector giving values at which to evaluate the function for a +discrete marginalisation} +} +\value{ +a \code{marginaliser} object that can be passed to + \code{marginalise}. +} +\description{ +Functions to set up marginalisers (which explicitly integrate + out random variables) and modify their behaviour, for use in + \code{\link{marginalise}()}. +} +\details{ +\code{discrete_marginalisation} can only be used with discrete + probability distributions, e.g. those defined with \code{poisson()} and + \code{binomial()}. For discrete distributions with finite support (such as + \code{bernoulli()}) the marginalisation will be exact, so long as + \code{values} includes all possible values of the variable. For discrete + distributions with non-finite support (such as \code{poisson()}, which has + no upper bound), the marginalisation can only ever be approximate. However + if \code{values} cover a range of values with sufficiently high support in + the distribution, that approximation error will be minimal. +} From 86894816407045b74031c00e9403b7ffd8012361 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 15:39:31 +1100 Subject: [PATCH 10/38] flesh out laplace_approximation marginaliser --- NAMESPACE | 1 + R/marginalisers.R | 225 +++++++++++++++++++++++++++++++++++++++++++ man/marginalisers.Rd | 35 +++++++ 3 files changed, 261 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 24e7e5c3..9f8bb8f2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -167,6 +167,7 @@ export(iprobit) export(joint) export(l_bfgs_b) export(laplace) +export(laplace_approximation) export(lkj_correlation) export(log1pe) export(logistic) diff --git a/R/marginalisers.R b/R/marginalisers.R index 4612cafc..3274a4f3 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -7,6 +7,14 @@ #' out random variables) and modify their behaviour, for use in #' \code{\link{marginalise}()}. #' +#' \code{discrete_marginalisation} marginalises discrete random variables via +#' a Riemann sum over a range of plausible values. +#' +#' \code{laplace_approximation} marginalises random variables (with +#' multivariate normal priors) approximately, using by Newton-Raphson +#' optimisation and assuming the posterior follows a multivariate normal +#' distribution. +#' #' @return a \code{marginaliser} object that can be passed to #' \code{marginalise}. 
NULL
 
@@ -86,6 +94,214 @@ discrete_marginalisation <- function(values) {
 
 }
 
+#' @rdname marginalisers
+#' @export
+#'
+#' @param stepsize the (positive) size of steps in the Newton-Raphson
+#'   optimisation algorithm used to find the approximate conditional posterior
+#'   mode of the random variable
+#'
+#' @param stepsize the (positive) numerical convergence tolerance (in units of
+#'   the log conditional posterior density) of the Newton-Raphson optimisation
+#'   algorithm
+#'
+#' @param max_iterations the (positive) integer-like maximum number of iterations of
+#'   the Newton-Raphson optimisation algorithm
+#'
+#' @param warm_start whether to start the Newton-Raphson optimisation algorithm
+#'   at the optimal value from the last iteration of model inference (if
+#'   \code{TRUE}), or to randomly re-initialise it.
+#'
+#' @details \code{laplace_approximation} can only be used to marginalise
+#'   variables following a multivariate normal distribution. In addition, the
+#'   function to be marginalised must \emph{factorise}; ie. it must return a
+#'   vector-valued density with as many elements as the vector variable being
+#'   marginalised, and each element of the density must depend only on the
+#'   corresponding element of the variable vector. This is the responsibility of
+#'   the user, and is not checked.
+laplace_approximation <- function(stepsize = 0.1,
+                                  tolerance = 1e-6,
+                                  max_iterations = 50,
+                                  warm_start = TRUE) {
+
+  # to do:
+  #  - get tensors for parameters of the distribution
+
+  # in future:
+  #  - enable a non-factorising version (have the user say whether it is
+  #    factorising)
+  #  - handle an iid normal distribution too.
+
+  # check arguments, Newton-Raphson iterations etc.
+  if (!(is.numeric(stepsize) &&
+        is.vector(stepsize) &&
+        length(stepsize) == 1 &&
+        stepsize > 0)) {
+    stop ("'stepsize' must be a positive, scalar numeric value")
+  }
+
+  if (!(is.numeric(tolerance) &&
+        is.vector(tolerance) &&
+        length(tolerance) == 1 &&
+        tolerance > 0)) {
+    stop ("'tolerance' must be a positive, scalar numeric value")
+  }
+
+  max_iterations <- as.integer(max_iterations)
+  if (!(is.vector(max_iterations) &&
+        length(max_iterations) == 1 &&
+        max_iterations > 0)) {
+    stop ("'max_iterations' must be a positive, scalar integer value")
+  }
+
+  if (!(is.logical(warm_start) && warm_start %in% c(TRUE, FALSE))) {
+    stop ("'warm_start' must be either TRUE or FALSE")
+  }
+
+  # define the marginalisation function
+  tf_marginaliser <- function(tf_conditional_density_fun,
+                              tf_distribution_log_pdf,
+                              other_args) {
+
+    # get the first and second derivatives of the conditional density function,
+    # w.r.t. the variable being marginalised
+    derivs <- function (z) {
+      y <- tf_conditional_density_fun(z)
+      d1 <- tf$gradients(y, z)[[1]]
+      d2 <- tf$gradients(d1, z)[[1]]
+      list(d1, d2)
+    }
+
+    # negative log-posterior for the current value of z under MVN assumption
+    psi <- function (a, z, mu, d0) {
+      fl(0.5) * t(a) %*% (z - mu) - tf_sum(tf_conditional_density_fun(z))
+    }
+
+    # # compute psi for different values of the stepsize (s)
+    # # to be added later, if I can work out how to nest/recode this linesearch optimisation
+    # psiline <- function (s, adiff, a, K, mu, d0) {
+    #   a <- a + s * as.vector(adiff)
+    #   z <- K %*% a + mu
+    #   psi(a, z, mu, d0)
+    # }
+
+    # dimension of the MVN distribution - how to pass in this and the parameters?
+    n <- ?
+
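For intuition, psi() above is the negative log of the unnormalised conditional posterior, with z tied to a through the prior covariance. A minimal base-R analogue, assuming an iid unit-variance normal likelihood and made-up values:

    # psi(a) = 0.5 * a' (z - mu) - log p(y | z), with z = Sigma %*% a + mu
    n <- 4
    Sigma <- diag(n); mu <- rep(0, n); y <- rnorm(n)
    loglik <- function(z) sum(dnorm(y, z, 1, log = TRUE))
    psi_r <- function(a) {
      z <- as.vector(Sigma %*% a) + mu
      0.5 * sum(a * (z - mu)) - loglik(z)
    }
    psi_r(rep(0, n))  # objective at the prior mean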
+    # if the user wants hot starts, stash the last estimate of z, so we can
+    # start there in the next iteration of the outer inference algorithm.
+    z <- NULL
+    if (warm_start) {
+      z <- greta_stash$laplace_z_est
+    }
+
+    # otherwise randomly initialise
+    if (is.null(z)) {
+      z_value <- add_first_dim(as_2D_array(rnorm(n)))
+      z <- tf$constant(z_value, dtype = tf_float())
+    }
+
+    # get parameters of MVN distribution
+    mu <- ?
+    Sigma <- ?
+
+    # Newton-Raphson parameters
+    tolerance <- add_first_dim(as_2D_array(tolerance))
+    tol <- tf$constant(tolerance, tf_float())
+    obj_old <- tf$constant(-Inf, tf_float(), shape(1, 1, 1))
+    obj <- -tf_sum(tf_conditional_density_fun(z))
+    iter <- tf$constant(0L)
+    maxiter <- tf$constant(as.integer(max_iterations))
+
+    # other objects
+    a_value <- add_first_dim(as_2D_array(rep(0, n)))
+    a <- tf$constant(a_value, dtype = tf_float())
+    U_value <- add_first_dim(diag(n))
+    U <- tf$constant(U_value, tf_float())
+    eye <- tf$constant(add_first_dim(diag(n)),
+                       dtype = tf_float())
+
+
+    # need tensorflow while loop to do Newton-Raphson iterations
+
+    body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
+
+      obj.old <- obj
+
+      deriv <- derivs(z)
+      d1 <- deriv[[2]]
+      d2 <- deriv[[3]]
+
+      # curvature of the likelihood
+      W <- -d2
+      rW <- sqrt(W)
+
+      # decentred values of z
+      cf <- z - mu
+
+      # approximate posterior covariance & cholesky factor
+      mat1 <- rW %*% t(rW) * Sigma + eye
+      U <- tf$cholesky(mat1)
+      L <- tf_transpose(U)
+
+      # compute Newton-Raphson update direction
+      b <- W * cf + d1
+      mat2 <- rW * (Sigma %*% b)
+      mat3 <- tf$matrix_triangular_solve(U, mat2)
+      adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a
+
+      # # find the optimal stepsize in that dimension (to be added later)
+      # res <- optimise(psiline, c(0, 2), adiff, a, Sigma, mu)
+      # stepsize <- res$minimum
+
+      # do the update and compute new z and objective
+      a_new <- a + stepsize * adiff
+      z_new <- Sigma %*% a_new + mu
+      obj <- psi(a_new, z_new, mu)
+
+      list(z_new, a_new, U, obj_old, obj, tol, iter + 1, maxiter)
+
+    }
+
+    cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
+      tf$less(iter, maxiter) & tf$greater(obj_old - obj, tol)
+    }
+
+    values <- list(z,
+                   a,
+                   U,
+                   obj_old,
+                   obj,
+                   tol,
+                   iter,
+                   maxiter)
+
+    # run the Newton-Raphson optimisation to find the posterior mode of z
+    out <- tf$while_loop(cond, body, values)
+
+    z <- out$z
+    a <- out$a
+    U <- out$U
+
+    # store the estimate in the stash as an R vector (this will break with nested laplace
+    # approximations!)
+    greta_stash$laplace_z_est <- z
+
+    # the approximate marginal conditional posterior
+    lp <- tf_sum(tf_conditional_density_fun(z))
+    mnll <- (a %*% (z - mu)) / fl(2) - lp + tf_sum(tf$log(tf$matrix_diag_part(U)))
+
+    -mnll
+
+  }
+
+  as_marginaliser(name = "laplace",
+                  tf_marginaliser = tf_marginaliser,
+                  parameters = list(),
+                  distribution_check = multivariate_normal_check)
+
+}
+
 # check that the distribution is discrete
 discrete_check <- function(distrib) {
   if (!distrib$discrete) {
@@ -95,6 +311,15 @@ discrete_check <- function(distrib) {
   }
 }
 
+# check that the distribution is multivariate normal
+multivariate_normal_check <- function (distrib) {
+  if (distrib$distribution_name != "multivariate_normal") {
+    stop ("the Laplace approximation can only be used ",
+          "with a multivariate normal distribution",
+          call.
= FALSE)
+  }
+}
+
 # helper to construct marginalisers
 as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) {
 
diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd
index aea7562e..b2c19b24 100644
--- a/man/marginalisers.Rd
+++ b/man/marginalisers.Rd
@@ -3,13 +3,32 @@
 \name{marginalisers}
 \alias{marginalisers}
 \alias{discrete_marginalisation}
+\alias{laplace_approximation}
 \title{marginalisation methods}
 \usage{
 discrete_marginalisation(values)
+
+laplace_approximation(stepsize = 0.1, tolerance = 1e-06,
+  max_iterations = 50, warm_start = TRUE)
 }
 \arguments{
 \item{values}{an R vector giving values at which to evaluate the function for a
 discrete marginalisation}
+
+\item{stepsize}{the (positive) size of steps in the Newton-Raphson
+optimisation algorithm used to find the approximate conditional posterior
+mode of the random variable}
+
+\item{max_iterations}{the (positive) integer-like maximum number of iterations of
+the Newton-Raphson optimisation algorithm}
+
+\item{warm_start}{whether to start the Newton-Raphson optimisation algorithm
+at the optimal value from the last iteration of model inference (if
+\code{TRUE}), or to randomly re-initialise it.}
+
+\item{stepsize}{the (positive) numerical convergence tolerance (in units of
+the log conditional posterior density) of the Newton-Raphson optimisation
+algorithm}
 }
 \value{
 a \code{marginaliser} object that can be passed to
@@ -19,6 +38,14 @@ a \code{marginaliser} object that can be passed to
 Functions to set up marginalisers (which explicitly integrate
   out random variables) and modify their behaviour, for use in
   \code{\link{marginalise}()}.
+
+  \code{discrete_marginalisation} marginalises discrete random variables via
+  a Riemann sum over a range of plausible values.
+
+  \code{laplace_approximation} marginalises random variables (with
+  multivariate normal priors) approximately, using Newton-Raphson
+  optimisation and assuming the posterior follows a multivariate normal
+  distribution.
 }
 \details{
 \code{discrete_marginalisation} can only be used with discrete
@@ -30,4 +57,12 @@ Functions to set up marginalisers (which explicitly integrate
 no upper bound), the marginalisation can only ever be approximate. However
 if \code{values} cover a range of values with sufficiently high support in
 the distribution, that approximation error will be minimal.
+
+\code{laplace_approximation} can only be used to marginalise
+  variables following a multivariate normal distribution. In addition, the
+  function to be marginalised must \emph{factorise}; ie. it must return a
+  vector-valued density with as many elements as the vector variable being
+  marginalised, and each element of the density must depend only on the
+  corresponding element of the variable vector. This is the responsibility of
+  the user, and is not checked.
} From 125a989bef4742d6efbae9a735281676afe9da1f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 18:10:18 +1100 Subject: [PATCH 11/38] get laplace approximation (almost) working --- R/dag_class.R | 5 ++ R/marginalise.R | 24 +++++++-- R/marginalisers.R | 123 ++++++++++++++++++++++--------------------- man/marginalisers.Rd | 8 +-- 4 files changed, 90 insertions(+), 70 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index b05bae02..ccf8f20a 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -281,6 +281,11 @@ dag_class <- R6Class( MoreArgs = list(envir = tfe), SIMPLIFY = FALSE) + # assign the un-reduced densities, for use in marginalisation + assign("component_densities", + densities, + envir = self$tf_environment) + # reduce_sum them self$on_graph(summed_densities <- lapply(densities, tf_sum, drop = TRUE)) diff --git a/R/marginalise.R b/R/marginalise.R index d706f1fc..f742dcb4 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -162,10 +162,13 @@ marginalisation_distribution <- R6Class( log_prob <- function(x) { # x will be the target; a tf function for the distribution being - # marginalised + # marginalised. Pass in the distribution node and the dag too, in case + # the marginalisers need them. self$tf_marginaliser(self$conditional_density_fun, tf_distribution_log_pdf = x, - parameters) + other_args = parameters, + dag = dag, + distribution_node = self$target) } @@ -194,7 +197,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(...) { + function(..., reduce = TRUE) { tensor_inputs <- list(...) @@ -251,7 +254,20 @@ as_conditional_density <- function(r_fun, args) { # log jacobian adjustment) sub_dag$define_joint_density(adjusted = FALSE) - sub_tfe$joint_density + # whether to reduce_sum the densities, or sum them elementwise + if (reduce) { + result <- sub_tfe$joint_density + } else { + densities <- sub_tfe$component_densities + if (length(densities) > 1) { + names(densities) <- NULL + result <- tf$add_n(densities) + } else { + result <- densities[[1]] + } + } + + result } diff --git a/R/marginalisers.R b/R/marginalisers.R index 3274a4f3..1908066b 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -49,7 +49,9 @@ discrete_marginalisation <- function(values) { # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, - other_args) { + other_args, + dag, + distribution_node) { # convert these into a list of constant tensors with the correct dimensions # and float types @@ -108,10 +110,6 @@ discrete_marginalisation <- function(values) { #' @param max_iterations the (positive) integer-like maximum number of iterations of #' the Newton-Raphson optimisation algorithm #' -#' @param warm_start whether to start the Newton-Raphson optimisation algorithm -#' at the optimal value from the last iteration of model inference (if -#' \code{TRUE}), or to randomly re-initialise it. -#' #' @details \code{laplace_approximation} can only be used to marginalise #' variables following a multivariate normal distribution. In addition, the #' function to be marginalised must \emph{factorise}; ie. it must return a @@ -119,15 +117,13 @@ discrete_marginalisation <- function(values) { #' marginalised, and each of element of the density must depend only on the #' corresponding element of the variable vector. This is the responsibility of #' the user, and is not checked. 
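To make the factorisation requirement concrete, a sketch in greta-style code (assuming a length-n data vector y defined outside the functions, and theta a row-vector greta array as in the tests below):

    # factorises: element i of the density depends only on theta[i]
    lik_ok <- function(theta) {
      distribution(y) <- normal(t(theta), 1)
    }

    # does not factorise: element i depends on theta[1:i], so the elementwise
    # curvature assumed by the approximation would be wrong
    lik_bad <- function(theta) {
      distribution(y) <- normal(cumsum(t(theta)), 1)
    }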
-laplace_approximation <- function(stepsize = 0.1, +laplace_approximation <- function(stepsize = 0.05, tolerance = 1e-6, - max_iterations = 50, - warm_start = TRUE) { - - # to do: - # - get tensors for parameters of the distribution + max_iterations = 50) { # in future: + # - enable warm starts for subsequent steps of the outer inference algorithm + # do linesearch in the Newton-Raphson iterations # - enable a non-factorising version (have the user say whether it is # factorising) # - handle an iid normal distribution too. @@ -154,27 +150,31 @@ laplace_approximation <- function(stepsize = 0.1, stop ("'max_iterations' must be a positive, scalar integer value") } - if (!(is.logical(warm_start) && warm_start %in% c(TRUE, FALSE))) { - stop ("'warm_start' must be either TRUE or FALSE") - } - # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, - other_args) { + other_args, + dag, + distribution_node) { + + # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities + d0 <- function (z, reduce = TRUE) { + args <- c(list(z), other_args, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } - # get the first and second derivatives of the conditional density function, + # get the vectors of first and second derivatives of the conditional density function, # w.r.t. the variable being marginalised derivs <- function (z) { - y <- tf_conditional_density_fun(z) + y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] d2 <- tf$gradients(d1, z)[[1]] list(d1, d2) } # negative log-posterior for the current value of z under MVN assumption - psi <- function (a, z, mu, d0) { - fl(0.5) * t(a) %*% (z - mu) - tf_sum(tf_conditional_density_fun(z)) + psi <- function (a, z, mu) { + fl(0.5) * tf$matmul(tf_transpose(a), (z - mu)) - d0(z) } # # compute psi for different values of the stepsize (s) @@ -185,33 +185,26 @@ laplace_approximation <- function(stepsize = 0.1, # psi(a, z, mu, d0) # } - # dimension of the MVN distribution - how to pass in this and the parameters? - n <- ? + # tensors for parameters of MVN distribution (make mu column vector now) + mu_node <- distribution_node$parameters$mean + mu <- dag$tf_environment[[dag$tf_name(mu_node)]] + mu <- tf_transpose(mu) + Sigma_node <- distribution_node$parameters$Sigma + Sigma <- dag$tf_environment[[dag$tf_name(Sigma_node)]] - # if the user wants hot starts, stash the last estimate of z, so we can - # start there in the next iteration of the outer inference algorithm. - z <- NULL - if (warm_start) { - z <- greta_stash$laplace_z_est - } - - # otherwise randomly initialise - if (is.null(z)) { - z_value <- add_first_dim(as_2D_array(rnorm(n))) - z <- tf$constant(z_value, dtype = tf_float()) - } + # dimension of the MVN distribution + n <- dim(mu)[[2]] - # get parameters of MVN distribution - mu <- ? - Sigma <- ? 
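For reference, the mode search these mu and Sigma tensors feed follows Algorithm 3.1 of Rasmussen & Williams (2006); one un-batched Newton-Raphson step in base R, assuming a unit-variance Gaussian likelihood for the derivatives (illustrative values only):

    n <- 4
    Sigma <- diag(n) * 2; mu <- rep(0, n); y <- rnorm(n)
    z <- rnorm(n); a <- rep(0, n)
    d1 <- y - z                    # first derivative of log p(y | z)
    W  <- rep(1, n)                # negative second derivative (curvature)
    rW <- sqrt(W)
    B  <- outer(rW, rW) * Sigma + diag(n)
    U  <- chol(B)                  # upper Cholesky factor, as in the TF code
    b  <- W * (z - mu) + d1
    adiff <- b - rW * backsolve(U, forwardsolve(t(U), rW * (Sigma %*% b))) - a
    a <- a + adiff                 # full step; the TF version scales this
    z <- as.vector(Sigma %*% a) + mu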
+ # randomly initialise z, and expand to batch (warm starts will need some TF + # trickery) + z_value <- add_first_dim(as_2D_array(rnorm(n))) + z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters - tolerance <- add_first_dim(as_2D_array(tolerance)) - tol <- tf$constant(tolerance, tf_float()) - obj_old <- tf$constant(-Inf, tf_float(), shape(1, 1, 1)) - obj <- -tf_sum(tf_conditional_density_fun(z)) + tol <- tf$constant(tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(-Inf, tf_float(), shape(1)) iter <- tf$constant(0L) - maxiter <- tf$constant(as.integer(max_iterations)) + maxiter <- tf$constant(max_iterations) # other objects a_value <- add_first_dim(as_2D_array(rep(0, n))) @@ -220,17 +213,26 @@ laplace_approximation <- function(stepsize = 0.1, U <- tf$constant(U_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) + stepsize <- fl(stepsize) + # match batches on everything going into the loop that will have a batch + # dimension + z <- expand_to_batch(z, mu) + a <- expand_to_batch(a, mu) + U <- expand_to_batch(U, mu) + obj_old <- expand_to_batch(obj_old, mu) + tol <- expand_to_batch(tol, mu) - # need tensorflow while loop to do Newton-Raphson iterations + obj <- -d0(z) + # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { obj.old <- obj deriv <- derivs(z) - d1 <- deriv[[2]] - d2 <- deriv[[3]] + d1 <- deriv[[1]] + d2 <- deriv[[2]] # curvature of the likelihood W <- -d2 @@ -240,13 +242,13 @@ laplace_approximation <- function(stepsize = 0.1, cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- rW %*% t(rW) * Sigma + eye + mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye U <- tf$cholesky(mat1) L <- tf_transpose(U) # compute Newton-Raphson update direction b <- W * cf + d1 - mat2 <- rW * (Sigma %*% b) + mat2 <- rW * tf$matmul(Sigma, b) mat3 <- tf$matrix_triangular_solve(U, mat2) adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a @@ -256,15 +258,18 @@ laplace_approximation <- function(stepsize = 0.1, # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- Sigma %*% a_new + mu + z_new <- tf$matmul(Sigma, a_new) + mu obj <- psi(a_new, z_new, mu) + obj <- tf$squeeze(obj, 1:2) - list(z_new, a_new, U, obj_old, obj, tol, iter + 1, maxiter) + list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter) } cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { - tf$less(iter, maxiter) & tf$greater(obj_old - obj, tol) + none_converged <- tf$reduce_any(tf$greater(obj_old - obj, tol)) + in_time <- tf$less(iter, maxiter) + in_time & none_converged } values <- list(z, @@ -279,19 +284,17 @@ laplace_approximation <- function(stepsize = 0.1, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out$z - a <- out$a - U <- out$U - - # store the estimate in the stash as an R vector (this will break with nested laplace - # approximations!) 
- greta_stash$laplace_z_est <- z + z <- out[[1]] + a <- out[[2]] + U <- out[[3]] # the approximate marginal conditional posterior - lp <- tf_sum(tf_conditional_density_fun(z)) - mnll <- (a %*% (z - mu)) / fl(2) - lp + tf_sum(tf$log(tf$matrix_diag_part(U))) + lp <- d0(z) + p1 <- tf$matmul(tf_transpose(a), z - mu) / fl(2) + p2 <- tf_sum(tf$log(tf$matrix_diag_part(U))) + nmcp <- tf$squeeze(p1, 1:2) - lp + tf$squeeze(p2, 1) - -mnll + -nmcp } diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index b2c19b24..cb65a0ae 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,8 +8,8 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(stepsize = 0.1, tolerance = 1e-06, - max_iterations = 50, warm_start = TRUE) +laplace_approximation(stepsize = 0.05, tolerance = 1e-06, + max_iterations = 50) } \arguments{ \item{values}{an R vector giving values at which to evaluate the function for a @@ -22,10 +22,6 @@ mode of the random variable} \item{max_iterations}{the (positive) integer-like maximum number of iterations of the Newton-Raphson optimisation algorithm} -\item{warm_start}{whether to start the Newton-Raphson optimisation algorithm -at the optimal value from the last iteration of model inference (if -\code{TRUE}), or to randomly re-initialise it.} - \item{stepsize}{the (positive) numerical convergence tolerance (in units of the log conditional posterior density) of the Newton-Raphson optimisation algorithm} From 990e4970d7e54401c5c06ab573bcbceb0de68b56 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Thu, 10 Jan 2019 14:16:58 +1100 Subject: [PATCH 12/38] finish making laplace approximation run --- R/dag_class.R | 1 + R/marginalisers.R | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index ccf8f20a..5e01afa4 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -281,6 +281,7 @@ dag_class <- R6Class( MoreArgs = list(envir = tfe), SIMPLIFY = FALSE) + names(densities) <- NULL # assign the un-reduced densities, for use in marginalisation assign("component_densities", densities, diff --git a/R/marginalisers.R b/R/marginalisers.R index 1908066b..c0f1c964 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -216,15 +216,20 @@ laplace_approximation <- function(stepsize = 0.05, stepsize <- fl(stepsize) # match batches on everything going into the loop that will have a batch - # dimension - z <- expand_to_batch(z, mu) - a <- expand_to_batch(a, mu) - U <- expand_to_batch(U, mu) - obj_old <- expand_to_batch(obj_old, mu) - tol <- expand_to_batch(tol, mu) + # dimension later + objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- greta:::match_batches(objects) + mu <- objects[[1]] + Sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + U <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] obj <- -d0(z) + # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { @@ -284,9 +289,13 @@ laplace_approximation <- function(stepsize = 0.05, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out[[1]] - a <- out[[2]] - U <- out[[3]] + # get the two components, preventing the gradient from being computed through the optimisation + a <- tf$stop_gradient(out[[2]]) + U <- tf$stop_gradient(out[[3]]) + + # recompute z (gradient of z w.r.t. 
the free state should include this op, + # but not optimisation) + z <- tf$matmul(Sigma, a) + mu # the approximate marginal conditional posterior lp <- d0(z) From 1a8901cc78ff0576b3154e49c69c5f2a93fe6e73 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 14:43:29 +1100 Subject: [PATCH 13/38] add golden section search and get laplace approximation working --- R/marginalisers.R | 67 ++++++++++++++++-------------- R/utils.R | 84 +++++++++++++++++++++++++++++++++++++- tests/testthat/test_misc.R | 22 ++++++++++ 3 files changed, 141 insertions(+), 32 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c0f1c964..c63bb7dc 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -159,7 +159,9 @@ laplace_approximation <- function(stepsize = 0.05, # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities d0 <- function (z, reduce = TRUE) { - args <- c(list(z), other_args, list(reduce = reduce)) + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), other_args, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } @@ -174,17 +176,10 @@ laplace_approximation <- function(stepsize = 0.05, # negative log-posterior for the current value of z under MVN assumption psi <- function (a, z, mu) { - fl(0.5) * tf$matmul(tf_transpose(a), (z - mu)) - d0(z) + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } - # # compute psi for different values of the stepsize (s) - # # to be added later, if I can work out how to nest/recode this linesearch optimisation - # psiline <- function (s, adiff, a, K, mu, d0) { - # a <- a + s * as.vector(adiff) - # z <- K %*% a + mu - # psi(a, z, mu, d0) - # } - # tensors for parameters of MVN distribution (make mu column vector now) mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] @@ -197,12 +192,15 @@ laplace_approximation <- function(stepsize = 0.05, # randomly initialise z, and expand to batch (warm starts will need some TF # trickery) + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) z_value <- add_first_dim(as_2D_array(rnorm(n))) z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters tol <- tf$constant(tolerance, tf_float(), shape(1)) - obj_old <- tf$constant(-Inf, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) iter <- tf$constant(0L) maxiter <- tf$constant(max_iterations) @@ -213,11 +211,12 @@ laplace_approximation <- function(stepsize = 0.05, U <- tf$constant(U_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) - stepsize <- fl(stepsize) + + stepsize <- fl(add_first_dim(as_2D_array(stepsize))) # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- list(mu, Sigma, z, a, U, obj_old, tol, stepsize) objects <- greta:::match_batches(objects) mu <- objects[[1]] Sigma <- objects[[2]] @@ -226,14 +225,18 @@ laplace_approximation <- function(stepsize = 0.05, U <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] + stepsize <- objects[[8]] obj <- -d0(z) + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, 
obj, tol, iter, maxiter) {
 
       obj_old <- obj
 
       deriv <- derivs(z)
@@ -257,24 +260,32 @@ laplace_approximation <- function(stepsize = 0.05,
       mat3 <- tf$matrix_triangular_solve(U, mat2)
       adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a
 
-      # # find the optimal stepsize in that dimension (to be added later)
-      # res <- optimise(psiline, c(0, 2), adiff, a, Sigma, mu)
-      # stepsize <- res$minimum
+      # use golden section search to find the optimum distance to step in this
+      # direction, for each batch simultaneously
+      psiline <- function (s) {
+        a_new <- a + s * adiff
+        z_new <- tf$matmul(Sigma, a) + mu
+        psi(a_new, z_new, mu)
+      }
+
+      ls_results <- gss(psiline, batch_dim)
+      stepsize <- ls_results$minimum
+      stepsize <- tf$expand_dims(stepsize, 1L)
+      stepsize <- tf$expand_dims(stepsize, 2L)
 
       # do the update and compute new z and objective
       a_new <- a + stepsize * adiff
       z_new <- tf$matmul(Sigma, a_new) + mu
       obj <- psi(a_new, z_new, mu)
-      obj <- tf$squeeze(obj, 1:2)
 
       list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter)
 
     }
 
     cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
-      none_converged <- tf$reduce_any(tf$greater(obj_old - obj, tol))
+      not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj))
       in_time <- tf$less(iter, maxiter)
-      in_time & none_converged
+      in_time & not_all_converged
     }
 
     values <- list(z,
@@ -289,19 +300,13 @@ laplace_approximation <- function(stepsize = 0.05,
     # run the Newton-Raphson optimisation to find the posterior mode of z
     out <- tf$while_loop(cond, body, values)
 
-    # get the two components, preventing the gradient from being computed through the optimisation
-    a <- tf$stop_gradient(out[[2]])
-    U <- tf$stop_gradient(out[[3]])
-
-    # recompute z (gradient of z w.r.t. the free state should include this op,
-    # but not optimisation)
-    z <- tf$matmul(Sigma, a) + mu
+    z <- out[[1]]
+    a <- out[[2]]
+    U <- out[[3]]
 
     # the approximate marginal conditional posterior
-    lp <- d0(z)
-    p1 <- tf$matmul(tf_transpose(a), z - mu) / fl(2)
-    p2 <- tf_sum(tf$log(tf$matrix_diag_part(U)))
-    nmcp <- tf$squeeze(p1, 1:2) - lp + tf$squeeze(p2, 1)
+    logdet <- tf_sum(tf$log(tf$matrix_diag_part(U)))
+    nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1)
 
     -nmcp
 
diff --git a/R/utils.R b/R/utils.R
index dc50d44e..e047c30c 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -398,6 +398,87 @@ rhex <- function()
   paste(as.raw(sample.int(256L, 4, TRUE) - 1L), collapse = "")
 
+# vectorised golden section search algorithm for linesearch (1D constrained
+# minimisation). The function must accept a 1D tensor of values and return a
+# tensor of the same shape containing objectives; this can therefore
+# run linesearch for multiple chains simultaneously. The batch dimension must be
+# known in advance however.
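For intuition, before the TensorFlow version below, the same search in un-batched base R (a sketch assuming a unimodal objective on [lower, upper]; it mirrors the update rule used here, re-evaluating both interior points each iteration):

    gss_r <- function(f, lower = 0, upper = 1, tol = 1e-8) {
      phi <- (1 + sqrt(5)) / 2
      while (upper - lower > tol) {
        d <- (phi - 1) * (upper - lower)
        a <- lower + d   # right-hand interior point (phi - 1 > 0.5)
        b <- upper - d   # left-hand interior point
        if (f(b) < f(a)) upper <- a else lower <- b
      }
      (lower + upper) / 2
    }
    gss_r(function(x) (x - 0.3) ^ 2)  # ~0.3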
+gss <- function(func, + batch_dim = 1, + lower = 0, + upper = 1, + max_iterations = 50, + tolerance = 1e-32) { + + lower <- tf$constant(lower, tf_float(), shape(1)) + upper <- tf$constant(upper, tf_float(), shape(1)) + maxiter <- tf$constant(as.integer(max_iterations), tf$int32) + tol <- fl(tolerance) + iter <- tf$constant(1L, tf$int32) + + phi <- (1 + sqrt(5)) / 2 + phim1 <- fl(phi - 1) + twomphi <- fl(2.0 - phi) + + # expand out to batch dimensions + if (!inherits(batch_dim, "tensorflow.tensor")) { + batch_dim <- as.integer(batch_dim) + batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) + } + + left <- tf$tile(lower, batch_dim) + right <- tf$tile(upper, batch_dim) + width <- right - left + optimum <- tf$zeros(batch_dim, tf_float()) + + # status of the multiple consecutive searches; three columns, for left, right + # and optimum + status <- tf$stack(list(left, right, optimum), axis = 1L) + + # start loop + body <- function (left, right, optimum, width, iter) { + + d <- phim1 * width + + # find evaluation points + a <- left + d + b <- right - d + + # prep reordered matrices for whether steps are above or below + # if func(a) < func(b) + above <- tf$stack(list(left, b, a), axis = 1L) + # else + not_above <- tf$stack(list(a, right, b), axis = 1L) + + status <- tf$where(tf$less(func(b), func(a)), above, not_above) + + left <- status[, 0] + right <- status[, 1] + optimum <- status[, 2] + + width <- right - left + + list(left, right, optimum, width, iter + 1L) + } + + cond <- function (left, right, optimum, width, iter) { + err <- twomphi * tf$abs(width / optimum) + not_converged <- tf$less(tol, err) + not_all_converged <- tf$reduce_any(not_converged) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(left, right, optimum, width, iter) + + out <- tf$while_loop(cond, body, values) + + result <- out[3:5] + names(result) <- c("minimum", "width", "iterations") + result + +} + misc_module <- module(module, check_tf_version, member, @@ -422,7 +503,8 @@ misc_module <- module(module, match_batches, split_chains, hessian_dims, - rhex) + rhex, + gss) # check dimensions of arguments to ops, and return the maximum dimension check_dims <- function(..., target_dim = NULL) { diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 49f38073..ccbeebdd 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -285,3 +285,25 @@ test_that("module works", { expect_identical(names(mod$functions), c("exp", "log", "sum")) }) + +test_that("golden section search works", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + n_batch <- 4 + + # simple quadratic function with different optima for each batch + func <- function (x) { + (x - locs) ^ 2 + } + + # random optima + locs <- runif(n_batch, 0.2, 0.8) + + # run the minimisation + ss <- gss(func, n_batch) + result <- tf$Session()$run(ss) + compare_op(result$minimum, locs, tolerance = 0.2) + +}) From f08650745395d193dfc79911ea0f094763cfad9b Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 17:04:47 +1100 Subject: [PATCH 14/38] remove stepsize from laplace_approximation options (we optimise it now) --- R/marginalisers.R | 23 +++-------------------- man/marginalisers.Rd | 13 ++++--------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c63bb7dc..b8804185 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -99,11 +99,7 @@ discrete_marginalisation <- function(values) { #' @rdname marginalisers #' @export 
#' -#' @param stepsize the (positive) size of steps in the Newton-Raphson -#' optimisation algorithm used to find the approximate conditional posterior -#' mode of the random variable -#' -#' @param stepsize the (positive) numerical convergence tolerance (in units of +#' @param tolerance the (positive) numerical convergence tolerance (in units of #' the log conditional posterior density) of the Newton-Raphson optimisation #' algorithm #' @@ -117,25 +113,15 @@ discrete_marginalisation <- function(values) { #' marginalised, and each of element of the density must depend only on the #' corresponding element of the variable vector. This is the responsibility of #' the user, and is not checked. -laplace_approximation <- function(stepsize = 0.05, - tolerance = 1e-6, +laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50) { # in future: # - enable warm starts for subsequent steps of the outer inference algorithm - # do linesearch in the Newton-Raphson iterations # - enable a non-factorising version (have the user say whether it is # factorising) # - handle an iid normal distribution too. - # check arguments, Newton-Raphson iterations etc. - if (!(is.numeric(stepsize) && - is.vector(stepsize) && - length(stepsize) == 1 && - stepsize > 0)) { - stop ("'stepsize' must be a positive, scalar numeric value") - } - if (!(is.numeric(tolerance) && is.vector(tolerance) && length(tolerance) == 1 && @@ -212,11 +198,9 @@ laplace_approximation <- function(stepsize = 0.05, eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) - stepsize <- fl(add_first_dim(as_2D_array(stepsize))) - # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol, stepsize) + objects <- list(mu, Sigma, z, a, U, obj_old, tol) objects <- greta:::match_batches(objects) mu <- objects[[1]] Sigma <- objects[[2]] @@ -225,7 +209,6 @@ laplace_approximation <- function(stepsize = 0.05, U <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] - stepsize <- objects[[8]] obj <- -d0(z) diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index cb65a0ae..b5a8fdfc 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,23 +8,18 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(stepsize = 0.05, tolerance = 1e-06, - max_iterations = 50) +laplace_approximation(tolerance = 1e-06, max_iterations = 50) } \arguments{ \item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} -\item{stepsize}{the (positive) size of steps in the Newton-Raphson -optimisation algorithm used to find the approximate conditional posterior -mode of the random variable} +\item{tolerance}{the (positive) numerical convergence tolerance (in units of +the log conditional posterior density) of the Newton-Raphson optimisation +algorithm} \item{max_iterations}{the (positive) integer-like maximum number of iterations of the Newton-Raphson optimisation algorithm} - -\item{stepsize}{the (positive) numerical convergence tolerance (in units of -the log conditional posterior density) of the Newton-Raphson optimisation -algorithm} } \value{ a \code{marginaliser} object that can be passed to From 0744608609930fd7c79b43a3608605b46524cda8 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 17:05:18 +1100 Subject: [PATCH 15/38] bugfixes and tests to make sure laplace approximation works --- R/marginalisers.R | 24 ++++++++++- tests/testthat/test_marginalisation.R | 62 +++++++++++++++++++++++++++ 2 
files changed, 84 insertions(+), 2 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index b8804185..bd9a6f49 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -246,6 +246,8 @@ laplace_approximation <- function(tolerance = 1e-6, # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously psiline <- function (s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff z_new <- tf$matmul(Sigma, a) + mu psi(a_new, z_new, mu) @@ -283,9 +285,27 @@ laplace_approximation <- function(tolerance = 1e-6, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out[[1]] a <- out[[2]] - U <- out[[3]] + + # apparently we need to redefine z and U here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! + z <- tf$matmul(Sigma, a) + mu + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + W <- -d2 + rW <- sqrt(W) + + # decentred values of z + cf <- z - mu + + # approximate posterior covariance & cholesky factor + mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye + U <- tf$cholesky(mat1) # the approximate marginal conditional posterior logdet <- tf_sum(tf$log(tf$matrix_diag_part(U))) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 27ce7c23..594124fc 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -144,3 +144,65 @@ test_that("discrete marginalisation gives correct densities", { compare_op(expected, observed) }) + + +test_that("laplace_approximation errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # bad tolerance + expect_error( + laplace_approximation(tolerance = -1), + "must be a positive, scalar numeric value" + ) + + # bad max iterations + expect_error( + laplace_approximation(max_iterations = 0), + "must be a positive, scalar integer value" + ) + + # mismatch with distribution + expect_error( + marginalise(I, normal(0, 1), laplace_approximation()), + "can only be used with a multivariate normal distribution" + ) + +}) + + +test_that("inference runs with laplace approximation", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # the eight schools model + N <- letters[1:8] + treatment_effects <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + int <- variable() + sd <- variable(lower = 0) + lik <- function (theta) { + distribution(treatment_effects) <- normal(t(theta), treatment_stddevs) + } + + # mock up as a multivariate normal distribution + Sigma <- diag(8) * sd ^ 2 + mu <- ones(1, 8) * int + marginalise(lik, + multivariate_normal(mu, Sigma), + laplace_approximation()) + + m <- model(int, sd) + + expect_ok(o <- opt(m)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, + verbose = FALSE)) + + # the optimisation result should be similar to a very long run of + +}) + + From d65ad2599bfce3b22b5af5e3cc3e2b9b8a3aff66 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 14 Feb 2020 17:04:21 +1100 Subject: [PATCH 16/38] fix lints --- R/marginalise.R | 20 ++--- R/marginalisers.R | 106 +++++++++++++------------- R/utils.R | 4 +- tests/testthat/test_marginalisation.R | 21 ++--- tests/testthat/test_misc.R | 2 +- 5 files changed, 79 insertions(+), 74 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 
f742dcb4..641af894 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -71,7 +71,7 @@ marginalise <- function(fun, variable, method, ...) {
 
   # check the inputs
   if (!is.function(fun)) {
-    stop ("'fun' must be an R function")
+    stop("'fun' must be an R function")
   }
 
   # check the additional arguments to fun are given in dots
@@ -79,16 +79,16 @@ marginalise <- function(fun, variable, method, ...) {
   dot_names <- names(dots)
   expected_names <- names(formals(fun)[-1])
   if (!all(expected_names %in% dot_names)) {
-    stop ("all arguments to 'fun' must be passed, named, to marginalise")
+    stop("all arguments to 'fun' must be passed, named, to marginalise")
   }
 
   if (!inherits(distr, "distribution_node")) {
-    stop ("'variable' must be a variable greta array with a distribution")
+    stop("'variable' must be a variable greta array with a distribution")
   }
 
   if (!inherits(method, "marginaliser")) {
-    stop ("'method' must be a valid marginalisation method. ",
-          "See ?marginalise for options")
+    stop("'method' must be a valid marginalisation method. ",
+         "See ?marginalise for options")
   }
 
   # check the distribution is compatible with the method
@@ -217,15 +217,15 @@ as_conditional_density <- function(r_fun, args) {
 
   # check there are distributions
   if (!any(sub_dag$node_types == "distribution")) {
-    stop ("'fun' must constain at least one distribution over data",
-          call. = FALSE)
+    stop("'fun' must contain at least one distribution over data",
+         call. = FALSE)
   }
 
   # check there are no variables
   if (any(sub_dag$node_types == "variable")) {
-    stop ("'fun' must not create any new variables, ",
-          "variables can be passed in as arguments",
-          call. = FALSE)
+    stop("'fun' must not create any new variables, ",
+         "variables can be passed in as arguments",
+         call. = FALSE)
   }
 
   # use the default graph, so that it can be overwritten when this is called
diff --git a/R/marginalisers.R b/R/marginalisers.R
index bd9a6f49..54d9fa15 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -22,8 +22,8 @@ NULL
 #' @rdname marginalisers
 #' @export
 #'
-#' @param values an R vector giving values at which to evaluate the function for a
-#'   discrete marginalisation
+#' @param values an R vector giving values at which to evaluate the function for
+#'   a discrete marginalisation
 #'
 #' @details \code{discrete_marginalisation} can only be used with discrete
 #'   probability distributions, e.g. those defined with \code{poisson()} and
@@ -43,7 +43,7 @@ discrete_marginalisation <- function(values) {
     msg <- paste0(msg, ", not a greta array")
   }
 
-  stop (msg)
+  stop(msg)
 }
 
 # define the marginalisation function
@@ -103,8 +103,8 @@ discrete_marginalisation <- function(values) {
 #'   the log conditional posterior density) of the Newton-Raphson optimisation
 #'   algorithm
 #'
-#' @param max_iterations the (positive) integer-like maximum number of iterations of
-#'   the Newton-Raphson optimisation algorithm
+#' @param max_iterations the (positive) integer-like maximum number of
+#'   iterations of the Newton-Raphson optimisation algorithm
 #'
 #' @details \code{laplace_approximation} can only be used to marginalise
 #'   variables following a multivariate normal distribution.
In addition, the @@ -126,14 +126,14 @@ laplace_approximation <- function(tolerance = 1e-6, is.vector(tolerance) && length(tolerance) == 1 && tolerance > 0)) { - stop ("'tolerance' must be a positive, scalar numeric value") + stop("'tolerance' must be a positive, scalar numeric value") } max_iterations <- as.integer(max_iterations) if (!(is.vector(max_iterations) && length(max_iterations) == 1 && max_iterations > 0)) { - stop ("'max_iterations' must be a positive, scalar integer value") + stop("'max_iterations' must be a positive, scalar integer value") } # define the marginalisation function @@ -143,17 +143,18 @@ laplace_approximation <- function(tolerance = 1e-6, dag, distribution_node) { - # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities - d0 <- function (z, reduce = TRUE) { + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) args <- c(list(t_z), other_args, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } - # get the vectors of first and second derivatives of the conditional density function, - # w.r.t. the variable being marginalised - derivs <- function (z) { + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] d2 <- tf$gradients(d1, z)[[1]] @@ -161,7 +162,7 @@ laplace_approximation <- function(tolerance = 1e-6, } # negative log-posterior for the current value of z under MVN assumption - psi <- function (a, z, mu) { + psi <- function(a, z, mu) { p1 <- tf$matmul(tf_transpose(a), z - mu) fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } @@ -170,8 +171,8 @@ laplace_approximation <- function(tolerance = 1e-6, mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - Sigma_node <- distribution_node$parameters$Sigma - Sigma <- dag$tf_environment[[dag$tf_name(Sigma_node)]] + sigma_node <- distribution_node$parameters$Sigma + sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution n <- dim(mu)[[2]] @@ -193,20 +194,20 @@ laplace_approximation <- function(tolerance = 1e-6, # other objects a_value <- add_first_dim(as_2D_array(rep(0, n))) a <- tf$constant(a_value, dtype = tf_float()) - U_value <- add_first_dim(diag(n)) - U <- tf$constant(U_value, tf_float()) + u_value <- add_first_dim(diag(n)) + u <- tf$constant(u_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- list(mu, sigma, z, a, u, obj_old, tol) objects <- greta:::match_batches(objects) mu <- objects[[1]] - Sigma <- objects[[2]] + sigma <- objects[[2]] z <- objects[[3]] a <- objects[[4]] - U <- objects[[5]] + u <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] @@ -217,7 +218,7 @@ laplace_approximation <- function(tolerance = 1e-6, batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations - body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { obj_old <- obj @@ -226,30 +227,30 @@ laplace_approximation <- function(tolerance = 1e-6, d2 <- deriv[[2]] # 
curvature of the likelihood - W <- -d2 - rW <- sqrt(W) + w <- -d2 + rw <- sqrt(w) # decentred values of z cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye - U <- tf$cholesky(mat1) - L <- tf_transpose(U) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + L <- tf_transpose(u) # compute Newton-Raphson update direction - b <- W * cf + d1 - mat2 <- rW * tf$matmul(Sigma, b) - mat3 <- tf$matrix_triangular_solve(U, mat2) - adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously - psiline <- function (s) { + psiline <- function(s) { s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(Sigma, a) + mu + z_new <- tf$matmul(sigma, a) + mu psi(a_new, z_new, mu) } @@ -260,14 +261,14 @@ laplace_approximation <- function(tolerance = 1e-6, # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- tf$matmul(Sigma, a_new) + mu + z_new <- tf$matmul(sigma, a_new) + mu obj <- psi(a_new, z_new, mu) - list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter) + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) } - cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) in_time <- tf$less(iter, maxiter) in_time & not_all_converged @@ -275,7 +276,7 @@ laplace_approximation <- function(tolerance = 1e-6, values <- list(z, a, - U, + u, obj_old, obj, tol, @@ -287,28 +288,28 @@ laplace_approximation <- function(tolerance = 1e-6, a <- out[[2]] - # apparently we need to redefine z and U here, or the backprop errors + # apparently we need to redefine z and u here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! - z <- tf$matmul(Sigma, a) + mu + z <- tf$matmul(sigma, a) + mu deriv <- derivs(z) d1 <- deriv[[1]] d2 <- deriv[[2]] # curvature of the likelihood - W <- -d2 - rW <- sqrt(W) + w <- -d2 + rw <- sqrt(w) # decentred values of z cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye - U <- tf$cholesky(mat1) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) # the approximate marginal conditional posterior - logdet <- tf_sum(tf$log(tf$matrix_diag_part(U))) + logdet <- tf_sum(tf$log(tf$matrix_diag_part(u))) nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp @@ -325,23 +326,26 @@ laplace_approximation <- function(tolerance = 1e-6, # check that the distribution is discrete discrete_check <- function(distrib) { if (!distrib$discrete) { - stop ("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. 
= FALSE) } } # check that the distribution is multiariate normal -multivariate_normal_check <- function (distrib) { +multivariate_normal_check <- function(distrib) { if (distrib$distribution_name != "multivariate_normal") { - stop ("the Laplace approximation can only be used ", - "with a multivariate normal distribution", - call. = FALSE) + stop("the Laplace approximation can only be used ", + "with a multivariate normal distribution", + call. = FALSE) } } # helper to contruct marginalisers -as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { +as_marginaliser <- function(name, + tf_marginaliser, + parameters, + distribution_check) { obj <- list(name = name, tf_marginaliser = tf_marginaliser, diff --git a/R/utils.R b/R/utils.R index d8194d63..908cc31b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -421,7 +421,7 @@ gss <- function(func, status <- tf$stack(list(left, right, optimum), axis = 1L) # start loop - body <- function (left, right, optimum, width, iter) { + body <- function(left, right, optimum, width, iter) { d <- phim1 * width @@ -446,7 +446,7 @@ gss <- function(func, list(left, right, optimum, width, iter + 1L) } - cond <- function (left, right, optimum, width, iter) { + cond <- function(left, right, optimum, width, iter) { err <- twomphi * tf$abs(width / optimum) not_converged <- tf$less(tol, err) not_all_converged <- tf$reduce_any(not_converged) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 594124fc..f1cdad3a 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -32,7 +32,9 @@ test_that("marginalise errors nicely", { # the following can only be assessed on final model definition # function adds variables - fun <- function (x) {x + normal(0, 1)} + fun <- function(x) { + x + normal(0, 1) + } lambda <- variable() marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) expect_error( @@ -41,7 +43,9 @@ test_that("marginalise errors nicely", { ) # function has no distribution - fun <- function (x) {x * 2} + fun <- function(x) { + x * 2 + } lambda <- variable() marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) expect_error( @@ -90,7 +94,7 @@ test_that("inference runs with discrete marginalisation", { p <- variable(lower = 0, upper = 1) alpha <- variable() beta <- variable() - likelihood <- function (z, alpha, beta) { + likelihood <- function(z, alpha, beta) { eta <- alpha + beta * x * z lambda <- exp(eta) distribution(y) <- poisson(lambda) @@ -116,7 +120,7 @@ test_that("discrete marginalisation gives correct densities", { # no constraints to no adjustment to account for lambda <- variable() y <- rnorm(100) - likelihood <- function (z) { + likelihood <- function(z) { distribution(y) <- normal(z, 1) } values <- 0:5 @@ -178,21 +182,20 @@ test_that("inference runs with laplace approximation", { source("helpers.R") # the eight schools model - N <- letters[1:8] treatment_effects <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) int <- variable() sd <- variable(lower = 0) - lik <- function (theta) { + lik <- function(theta) { distribution(treatment_effects) <- normal(t(theta), treatment_stddevs) } # mock up as a multivariate normal distribution - Sigma <- diag(8) * sd ^ 2 + sigma <- diag(8) * sd ^ 2 mu <- ones(1, 8) * int marginalise(lik, - multivariate_normal(mu, Sigma), + multivariate_normal(mu, sigma), laplace_approximation()) m <- model(int, sd) @@ -204,5 +207,3 @@ 
test_that("inference runs with laplace approximation", { # the optimisation result should be similar to a very long run of }) - - diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 259ba8f2..d4a7590b 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -294,7 +294,7 @@ test_that("golden section search works", { n_batch <- 4 # simple quadratic function with different optima for each batch - func <- function (x) { + func <- function(x) { (x - locs) ^ 2 } From f8c88dcdd176d3eae06bcfb80fa109ed16feb68a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 14 Feb 2020 17:13:20 +1100 Subject: [PATCH 17/38] fix lints --- R/marginalisers.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index 54d9fa15..5cb80479 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -56,7 +56,7 @@ discrete_marginalisation <- function(values) { # convert these into a list of constant tensors with the correct dimensions # and float types values_list <- as.list(values) - values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, as_2d_array) values_list <- lapply(values_list, add_first_dim) values_list <- lapply(values_list, fl) @@ -182,7 +182,7 @@ laplace_approximation <- function(tolerance = 1e-6, # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z_value <- add_first_dim(as_2D_array(rnorm(n))) + z_value <- add_first_dim(as_2d_array(rnorm(n))) z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters @@ -192,7 +192,7 @@ laplace_approximation <- function(tolerance = 1e-6, maxiter <- tf$constant(max_iterations) # other objects - a_value <- add_first_dim(as_2D_array(rep(0, n))) + a_value <- add_first_dim(as_2d_array(rep(0, n))) a <- tf$constant(a_value, dtype = tf_float()) u_value <- add_first_dim(diag(n)) u <- tf$constant(u_value, tf_float()) @@ -236,13 +236,13 @@ laplace_approximation <- function(tolerance = 1e-6, # approximate posterior covariance & cholesky factor mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) - L <- tf_transpose(u) + l <- tf_transpose(u) # compute Newton-Raphson update direction b <- w * cf + d1 mat2 <- rw * tf$matmul(sigma, b) mat3 <- tf$matrix_triangular_solve(u, mat2) - adiff <- b - rw * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously From faa9cadfa4ea5f2a483144bdd0213f26c5e3e32d Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 18 Feb 2020 17:23:17 +1100 Subject: [PATCH 18/38] bring marginalisation branch up to date with greta --- R/dag_class.R | 3 +++ R/marginalise.R | 24 ++++++++++++++++-------- R/marginalisers.R | 14 +++++++------- tests/testthat/test_marginalisation.R | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index bf3f4c6e..3bce0502 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -530,6 +530,9 @@ dag_class <- R6Class( for (name in data_names) tfe[[name]] <- tfe_old[[name]] + # copy the batch size over + tfe$batch_size <- tfe_old$batch_size + # put the free state in the environment, and build out the tf graph tfe$free_state <- free_state self$define_tf_body() diff --git a/R/marginalise.R b/R/marginalise.R index 641af894..3d80f1ca 100644 --- 
a/R/marginalise.R +++ b/R/marginalise.R @@ -158,26 +158,31 @@ marginalisation_distribution <- R6Class( tf_distrib = function(parameters, dag) { + self$target + + # build the tfp distribution for the marginalisation distribution + parameter_nodes <- self$target$parameters + distrib_constructor <- dag$get_tf_object(self$target) + tf_parameter_list <- lapply(parameter_nodes, dag$get_tf_object) + tfp_distribution <- distrib_constructor(tf_parameter_list, dag = dag) + # the marginal density implied by the function log_prob <- function(x) { - # x will be the target; a tf function for the distribution being + # x will be the target; a tf constructor function for the distribution being # marginalised. Pass in the distribution node and the dag too, in case # the marginalisers need them. self$tf_marginaliser(self$conditional_density_fun, - tf_distribution_log_pdf = x, + tfp_distribution = tfp_distribution, other_args = parameters, dag = dag, distribution_node = self$target) } - list(log_prob = log_prob, cdf = NULL, log_cdf = NULL) - - }, + list(log_prob = log_prob) - tf_cdf_function = NULL, - tf_log_cdf_function = NULL + } ) ) @@ -197,7 +202,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(..., reduce = TRUE) { + function(..., reduce = TRUE, dag = NULL) { tensor_inputs <- list(...) @@ -233,6 +238,9 @@ as_conditional_density <- function(r_fun, args) { sub_dag$tf_graph <- tf$get_default_graph() sub_tfe <- sub_dag$tf_environment + # pass on the batch size, used when defining data + sub_tfe$batch_size <- dag$tf_environment$batch_size + # set the input tensors as the values for the dummy greta arrays in the new # tf_environment node_dummies <- lapply(ga_dummies, get_node) diff --git a/R/marginalisers.R b/R/marginalisers.R index 5cb80479..b2ea8538 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -48,7 +48,7 @@ discrete_marginalisation <- function(values) { # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, + tfp_distribution, other_args, dag, distribution_node) { @@ -62,7 +62,7 @@ discrete_marginalisation <- function(values) { # 1. get weights from the distribution log pdf # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(values_list, tfp_distribution$log_prob) weights_list <- lapply(weights_list, tf_sum) # convert to a vector of discrete probabilities and make them sum to 1 @@ -72,10 +72,10 @@ discrete_marginalisation <- function(values) { log_weights_vec <- tf$log(weights_vec) # 2. 
compute the conditional joint density for each value (passing in - # other_args) + # other_args and the outer dag) log_density_list <- list() for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args) + args <- c(list(values_list[[i]]), other_args, dag = dag) log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) } @@ -138,7 +138,7 @@ laplace_approximation <- function(tolerance = 1e-6, # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, + tfp_distribution, other_args, dag, distribution_node) { @@ -148,7 +148,7 @@ laplace_approximation <- function(tolerance = 1e-6, d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) - args <- c(list(t_z), other_args, list(reduce = reduce)) + args <- c(list(t_z), other_args, list(reduce = reduce, dag = dag)) do.call(tf_conditional_density_fun, args) } @@ -171,7 +171,7 @@ laplace_approximation <- function(tolerance = 1e-6, mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - sigma_node <- distribution_node$parameters$Sigma + sigma_node <- distribution_node$parameters$sigma sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index f1cdad3a..c8e5deb6 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -107,7 +107,7 @@ test_that("inference runs with discrete marginalisation", { m <- model(alpha, beta, p) expect_ok(o <- opt(m)) - expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE)) }) From ff341719f1b9ed9733892719c7a9152c15f2bf33 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 08:25:40 +1100 Subject: [PATCH 19/38] more stable computation of discrete marginalisation weights --- R/marginalisers.R | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index b2ea8538..afdaf174 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -62,14 +62,20 @@ discrete_marginalisation <- function(values) { # 1. get weights from the distribution log pdf # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tfp_distribution$log_prob) - weights_list <- lapply(weights_list, tf_sum) + log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) - # convert to a vector of discrete probabilities and make them sum to 1 - weights_vec <- tf$concat(weights_list, axis = 1L) - weights_vec <- tf$exp(weights_vec) - weights_vec <- weights_vec / tf_sum(weights_vec) - log_weights_vec <- tf$log(weights_vec) + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + log_weights <- log_weights - log_weights_sum # 2. compute the conditional joint density for each value (passing in # other_args and the outer dag) @@ -84,7 +90,7 @@ discrete_marginalisation <- function(values) { log_density_vec <- tf$concat(log_density_list, axis = 1L) # 3. 
compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights_vec + log_density_weighted_vec <- log_density_vec + log_weights tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) } From 2c6ff3996b422a635059f3f3acd3a1e8d6e9e60f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:34:10 +1100 Subject: [PATCH 20/38] add posterior conjugate normal test; run short posterior checks every time --- tests/testthat/helpers.R | 20 +++++----- tests/testthat/test_posteriors.R | 65 ++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index 0e5a47ed..e593b532 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) { new_samples <- function(draws, target_samples = 5000) { neff <- min(coda::effectiveSize(draws)) efficiency <- neff / coda::niter(draws) - 1.2 * (target_samples - neff) / efficiency + 1.5 * (target_samples - neff) / efficiency } not_timed_out <- function(start_time, time_limit = 300) { @@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) { } get_enough_draws <- function(model, - sampler = sampler, n_effective = 5000, + sampler = hmc(), time_limit = 300, - verbose = TRUE, + chains = 50, + verbose = FALSE, one_by_one = FALSE) { start_time <- Sys.time() draws <- mcmc(model, sampler = sampler, + chains = chains, verbose = verbose, one_by_one = one_by_one) @@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE) # get MCMC samples for statistics of the samples (value, variance and @@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { # get absolute errors between posterior means and true values, and scale them # by time-series Monte Carlo standard errors (the expected amount of # uncertainty in the MCMC estimate), to give the number of standard errors - # away from truth. There's a 1/100 chance of any one of these scaled errors - # being greater than qnorm(0.99) if the sampler is correct + # away from truth. 
There's a 1/200 chance of any one of these scaled errors + # being greater than qnorm(1 - 0.005) if the sampler is correct errors <- scaled_error(stat_draws, stat_truth) - expect_lte(max(errors), qnorm(0.99)) + expect_lte(max(errors), qnorm(1 - 0.005)) } @@ -958,8 +960,8 @@ check_samples <- function(x, m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE, one_by_one = one_by_one) @@ -979,6 +981,6 @@ check_samples <- function(x, # do a formal hypothesis test suppressWarnings(stat <- ks.test(mcmc_samples, iid_samples)) - testthat::expect_gte(stat$p.value, 0.01) + testthat::expect_gte(stat$p.value, 0.005) } diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R index 68bae814..71e7bbdc 100644 --- a/tests/testthat/test_posteriors.R +++ b/tests/testthat/test_posteriors.R @@ -4,7 +4,6 @@ test_that("posterior is correct (binomial)", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() # analytic solution to the posterior of the parameter of a binomial # distribution, with uniform prior @@ -15,7 +14,7 @@ distribution(pos) <- binomial(n, theta) m <- model(theta) - draws <- get_enough_draws(m, hmc(), 2000, verbose = FALSE) + draws <- get_enough_draws(m, n_effective = 5000) samples <- as.matrix(draws) @@ -33,7 +32,61 @@ n_draws <- round(coda::effectiveSize(draws)) comparison <- rbeta(n_draws, shape1, shape2) suppressWarnings(test <- ks.test(samples, comparison)) - expect_gte(test$p.value, 0.01) + expect_gte(test$p.value, 0.05) + +}) + +test_that("posterior is correct (normal)", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta_i ~ N(mu, sd ^ 2) + + # eight schools data + y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # Bayes theorem gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # which is equivalent to: + # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) + # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) + # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i + # conjugate prior, see Wikipedia conjugate prior table + + obs_prec <- 1 / (obs_sd ^ 2) + prec <- 1 / (sd ^ 2) + theta_var <- 1 / (obs_prec + prec) + theta_mu <- (y * obs_prec + mu * prec) * theta_var + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + + # mcmc solution: + theta <- normal(mu, sd, dim = 8) + distribution(y) <- normal(theta, obs_sd) + m <- model(theta) + + # draw at least 5000 effective samples of each parameter from the posterior + draws <- get_enough_draws(m, n_effective = 5000) + + mcmc_stats <- summary(draws)$statistics + mcmc <- mcmc_stats[, 1:2] + error <- abs(mcmc - analytic) + mcmc_se <- mcmc_stats[, 4] + + # ratio of error to that expected due to Monte Carlo noise + error_multiples <- error / cbind(mcmc_se, mcmc_se) + + within_error_margin <- error_multiples < qnorm(1 - 0.005) + expect_true(all(within_error_margin)) }) @@ -41,7 +94,6 @@ test_that("samplers are unbiased for bivariate normals", { skip_if_not(check_tf_version())
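# editor's aside on the binomial test above: with a uniform (i.e. Beta(1, 1))
# prior on theta and pos successes from n trials, conjugacy gives the
# posterior theta | pos ~ Beta(1 + pos, 1 + n - pos), which is presumably how
# the elided shape1 and shape2 are defined. A plain-R sketch with
# hypothetical counts:
#   pos <- 7; n <- 10
#   shape1 <- 1 + pos
#   shape2 <- 1 + n - pos
#   qbeta(c(0.025, 0.975), shape1, shape2)  # central 95% posterior interval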
source("helpers.R") - skip_if_not_release() check_mvn_samples(hmc()) check_mvn_samples(rwmh()) @@ -53,7 +105,6 @@ test_that("samplers are unbiased for chi-squared", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() df <- 5 x <- chi_squared(df) @@ -67,7 +118,6 @@ test_that("samplers are unbiased for standard uniform", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() x <- uniform(0, 1) iid <- runif @@ -80,13 +130,12 @@ test_that("samplers are unbiased for LKJ", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() x <- lkj_correlation(3, 2)[1, 2] iid <- function(n) rlkjcorr(n, 3, 2)[, 1, 2] - check_samples(x, iid, hmc(), one_by_one = TRUE) + check_samples(x, iid) }) From ddf9e9ecdffe0a2c2f9110a8a2b147b9476b4a61 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:34:39 +1100 Subject: [PATCH 21/38] add failing test for laplace approximation --- tests/testthat/test_marginalisation.R | 61 +++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index c8e5deb6..379038e9 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -55,7 +55,6 @@ test_that("marginalise errors nicely", { }) - test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) @@ -175,7 +174,6 @@ test_that("laplace_approximation errors nicely", { }) - test_that("inference runs with laplace approximation", { skip_if_not(check_tf_version()) @@ -204,6 +202,63 @@ test_that("inference runs with laplace approximation", { expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE)) - # the optimisation result should be similar to a very long run of +}) + +test_that("laplace approximation converges on correct posterior", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta_i ~ N(mu, sd ^ 2) + # the posterior for theta is normal, so laplace should be exact + + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # analytic solution: + + # Marginalising int we can write: + # Bayes theorum gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # which is equivalent to: + # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) + # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) + # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i + # conjugate prior, see Wikipedia conjugate prior table + + obs_prec <- 1 / (obs_sd ^ 2) + prec <- 1 / (sd ^ 2) + theta_var <- 1 / (obs_prec + prec) + theta_mu <- (y * obs_prec + mu * prec) * theta_var + theta_sd <- sqrt(theta_var) + + # Laplace solution: + lik <- function(theta) { + distribution(y) <- normal(t(theta), obs_sd) + } + + # mock up as a multivariate normal distribution + mean <- ones(1, 8) * mu + sigma <- diag(8) * sd ^ 2 + out <- marginalise(lik, + multivariate_normal(mean, sigma), + laplace_approximation()) + theta_mu_est <- calculate(out$mean) + theta_var_est <- calculate(diag(out$Sigma)) + + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + laplace <- cbind(mean = 
theta_mu_est, sd = sqrt(theta_var_est)) + + # compare these to within a tolerance + compare_op(analytic, laplace) }) From 18049ea4120d56d07de5bb3a2f8227088d0f2c47 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:47:24 +1100 Subject: [PATCH 22/38] prepare for non-diagonal hessians --- R/marginalisers.R | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index afdaf174..5bb9a1ee 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -112,15 +112,23 @@ discrete_marginalisation <- function(values) { #' @param max_iterations the (positive) integer-like maximum number of #' iterations of the Newton-Raphson optimisation algorithm #' +#' @param diagonal_hessian whether the Hessian matrix should be assumed to be +#' diagonal, to speed up computations. See Details. +#' +#' @details \code{laplace_approximation} can only be used to marginalise -#' variables following a multivariate normal distribution. In addition, the -#' function to be marginalised must \emph{factorise}; ie. it must return a -#' vector-valued density with as many elements as the vector variable being -#' marginalised, and each of element of the density must depend only on the -#' corresponding element of the variable vector. This is the responsibility of -#' the user, and is not checked. +#' variables following a multivariate normal distribution. +#' +#' The argument \code{diagonal_hessian} can be used to state that the +#' conditional density factorises along the elements of the variable being +#' marginalised, and therefore the Hessian matrix of this function can be +#' assumed to be diagonal. A conditional density function factorises if each +#' observation in the conditional density depends only on the corresponding +#' element of the variable being marginalised. If this is not the case and you +#' set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. +#' laplace_approximation <- function(tolerance = 1e-6, - max_iterations = 50) { + max_iterations = 50, + diagonal_hessian = FALSE) { # in future: # - enable warm starts for subsequent steps of the outer inference algorithm @@ -161,10 +169,19 @@ laplace_approximation <- function(tolerance = 1e-6, # get the vectors of first and second derivatives of the conditional density # function, w.r.t.
the variable being marginalised derivs <- function(z) { + y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$gradients(d1, z)[[1]] + + if (diagonal_hessian) { + d2 <- tf$map_fn(tf$gradients, list(d1, z))[[1]] + } else { + stop ("inference with non-diagonal hessians not yet implemented") + d2 <- tf$hessians(y, z)[[1]] + } + list(d1, d2) + } # negative log-posterior for the current value of z under MVN assumption From e6ae1cbd3fa1f69549bcc71c14e98928c2cdeb1e Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 14:56:08 +1100 Subject: [PATCH 23/38] define distributions as their tfp distribution objects --- R/dag_class.R | 29 ++--------------------------- R/joint.R | 10 +++++----- R/mixture.R | 13 ++++++------- R/node_types.R | 8 ++++++-- 4 files changed, 19 insertions(+), 41 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index 3bce0502..936400a1 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -439,18 +439,8 @@ dag_class <- R6Class( # target tensor evaluate_density = function(distribution_node, target_node) { - tfe <- self$tf_environment - - parameter_nodes <- distribution_node$parameters - - # get the tensorflow objects for these - distrib_constructor <- self$get_tf_object(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) tf_target <- self$get_tf_object(target_node) - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) self$tf_evaluate_density(tfp_distribution, tf_target, @@ -788,25 +778,10 @@ dag_class <- R6Class( }, - # get the tfp distribution object for a distribution node - get_tfp_distribution = function(distrib_node) { - - # build the tfp distribution object for the distribution, and use it - # to get the tensor for the sample - distrib_constructor <- self$get_tf_object(distrib_node) - parameter_nodes <- distrib_node$parameters - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) - - }, - # try to draw a random sample from a distribution node draw_sample = function(distribution_node) { - tfp_distribution <- self$get_tfp_distribution(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) sample <- tfp_distribution$sample diff --git a/R/joint.R b/R/joint.R index c61605af..045258a7 100644 --- a/R/joint.R +++ b/R/joint.R @@ -124,15 +124,15 @@ joint_distribution <- R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions + tfp_distributions <- parameters + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL log_prob <- function(x) { diff --git a/R/mixture.R b/R/mixture.R index 5c96de8f..23f12587 100644 --- a/R/mixture.R +++ b/R/mixture.R @@ -210,17 +210,16 @@ mixture_distribution <- 
R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions and mixture weights + weights <- parameters$weights + tfp_distributions <- parameters[names(parameters) != "weights"] + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters[names(self$parameters) != "weights"] truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL - - weights <- parameters$weights # use log weights if available if (self$weights_is_log) { diff --git a/R/node_types.R b/R/node_types.R index 281b18e8..feff453d 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -543,9 +543,13 @@ distribution_node <- R6Class( tf = function(dag) { - # assign the distribution object constructor function to the environment + # build a tfp distribution object + tf_parameter_list <- lapply(self$parameters, dag$get_tf_object) + tfp_distribution <- self$tf_distrib(tf_parameter_list, dag = dag) + + # assign it to the environment assign(dag$tf_name(self), - self$tf_distrib, + tfp_distribution, envir = dag$tf_environment) }, From e0f7e238a8ed4a505ecb8784a02f9ca3ebf755bb Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 14:56:34 +1100 Subject: [PATCH 24/38] tidy up tiling to batch size --- R/node_types.R | 3 +-- R/utils.R | 11 +++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/node_types.R b/R/node_types.R index feff453d..519ce3b2 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -56,8 +56,7 @@ data_node <- R6Class( } # expand up to batch size - tiling <- c(tfe$batch_size, rep(1L, ndim)) - batched_tensor <- tf$tile(unbatched_tensor, tiling) + batched_tensor <- tile_to_batch(unbatched_tensor, tfe$batch_size) # put unbatched tensor in environment so it can be set assign(unbatched_name, unbatched_tensor, envir = tfe) diff --git a/R/utils.R b/R/utils.R index 8f7e1518..5c4847da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -299,13 +299,19 @@ drop_column_dim <- function(x) { x } +# given a tensor (with batch dimension of size 1) and a batch size tensor, tile +# the tensor to match the batch size +tile_to_batch <- function(x, batch_size) { + ndim <- length(dim(x)) + tf$tile(x, c(batch_size, rep(1L, ndim - 1))) +} + # where x is a tensor with no batch dimension, and y is a tensor with a batch # dimension, tile x to have first dimension matching y (dimension determined at # run time) expand_to_batch <- function(x, y) { batch_size <- tf$shape(y)[[0]] - ndim <- length(dim(x)) - tf$tile(x, c(batch_size, rep(1L, ndim - 1))) + tile_to_batch(x, batch_size) } # does this tensor have a batch dimension (of unknown size) as its first @@ -497,6 +503,7 @@ misc_module <- module(module, drop_first_dim, tile_first_dim, drop_column_dim, + tile_to_batch, expand_to_batch, has_batch, match_batches, From 3138eff9aec8a8d140f976f88fd08e3dc2ec6dcb Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:48:17 +1100 Subject: [PATCH 25/38] stop double-definition of dags --- R/inference.R | 1 - R/inference_class.R | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/inference.R b/R/inference.R index 
2402c2e4..ddaca382 100644 --- a/R/inference.R +++ b/R/inference.R @@ -291,7 +291,6 @@ run_samplers <- function(samplers, thin <- as.integer(thin) dag <- samplers[[1]]$model$dag - chains <- samplers[[1]]$n_chains n_cores <- check_n_cores(n_cores, length(samplers), plan_is) float_type <- dag$tf_float diff --git a/R/inference_class.R b/R/inference_class.R index a019b398..d49ff011 100644 --- a/R/inference_class.R +++ b/R/inference_class.R @@ -46,8 +46,6 @@ inference <- R6Class( free_parameters <- model$dag$example_parameters(free = TRUE) free_parameters <- unlist_tf(free_parameters) self$n_free <- length(free_parameters) - self$set_initial_values(initial_values) - self$n_traced <- length(model$dag$trace_values(self$free_state)) self$seed <- seed }, @@ -277,8 +275,6 @@ sampler <- R6Class( parameters = parameters, seed = seed) - self$n_chains <- nrow(self$free_state) - # duplicate diag_sd if needed n_diag <- length(self$parameters$diag_sd) n_parameters <- self$n_free @@ -289,6 +285,8 @@ sampler <- R6Class( # define the draws tensor on the tf graph self$define_tf_draws() + self$set_initial_values(initial_values) + self$n_chains <- nrow(self$free_state) }, @@ -1025,6 +1023,8 @@ optimiser <- R6Class( self$create_optimiser_objective() self$create_tf_minimiser() + self$set_initial_values(initial_values) + self$n_traced <- length(model$dag$trace_values(self$free_state)) }, From 8346c8ebd295d173eb10ebf00ddb59f6d098a358 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:49:12 +1100 Subject: [PATCH 26/38] enable batch size to be found without access to dag --- R/dag_class.R | 13 +++++++++---- R/utils.R | 10 ++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index 936400a1..e1604ac6 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,11 +54,16 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type info to options, so it can be accessed by - # nodes on definition, without cluncky explicit passing + # temporarily pass float type and batch size info to options, so it can be accessed by + # nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float + old_batch_size <- options()$greta_batch_size + - on.exit(options(greta_tf_float = old_float_type)) - options(greta_tf_float = self$tf_float) + on.exit(options(greta_tf_float = old_float_type, + greta_batch_size = old_batch_size)) + + options(greta_tf_float = self$tf_float, + greta_batch_size = self$tf_environment$batch_size) with(self$tf_graph$as_default(), expr) }, diff --git a/R/utils.R b/R/utils.R index 5c4847da..8701c648 100644 --- a/R/utils.R +++ b/R/utils.R @@ -164,6 +164,10 @@ check_tf_version <- function(alert = c("none", member <- function(x, method) eval(parse(text = paste0("x$", method))) +get_element <- function(x, element) { + x[[element]] +} + node_type <- function(node) { classes <- class(node) type <- grep("*_node", classes, value = TRUE) @@ -181,6 +185,12 @@ fl <- function(x) { tf$constant(x, dtype = tf_float()) } +# get the tensor for the batch size in the dag currently being defined (since +# it's not always possible to pass the dag in) +get_batch_size <- function() { + options()$greta_batch_size +} + # coerce an integer(ish) vector to a list as expected in tensorflow shape # arguments #' @noRd From 6a02b8daf8dc462a970902e9bc8bc12d353c404 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:49:41 +1100 Subject: [PATCH 27/38] let marginalisation return the marginalisation parameters
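marginalise() now invisibly returns a named list of greta arrays describing
the fitted marginalisation, built by the method's return_list function. A
rough sketch of the intended usage, mirroring the test changes below (y,
obs_sd, mu and sigma are placeholder greta arrays; `mean` and `sigma` are the
names defined by the laplace return_list, while the discrete method returns
`probabilities` instead):

    lik <- function(theta) {
      distribution(y) <- normal(t(theta), obs_sd)
    }
    out <- marginalise(lik,
                       multivariate_normal(mu, sigma),
                       laplace_approximation())
    out$mean   # greta array for the approximate conditional posterior mean
    out$sigma  # and its covariance matrix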
--- R/marginalise.R | 82 ++++---- R/marginalisers.R | 263 ++++++++++++++++++++------ tests/testthat/test_marginalisation.R | 11 +- 3 files changed, 252 insertions(+), 104 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 3d80f1ca..1120a10b 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -67,7 +67,7 @@ NULL marginalise <- function(fun, variable, method, ...) { # get the distribution for the random variable - distr <- get_node(variable)$distribution + distribution_node <- get_node(variable)$distribution # check the inputs if (!is.function(fun)) { @@ -82,7 +82,7 @@ marginalise <- function(fun, variable, method, ...) { stop("all arguments to 'fun' must be passed, named, to marginalise") } - if (!inherits(distr, "distribution_node")) { + if (!inherits(distribution_node, "distribution_node")) { stop("'variable' must be a variable greta array with a distribution") } @@ -92,36 +92,42 @@ marginalise <- function(fun, variable, method, ...) { } # check the distribution is compatible with the method - method$distribution_check(distr) + method$distribution_check(distribution_node) # excise the variable from the distribution - distr$remove_target() + distribution_node$remove_target() # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density (unadjusted since there will be # no new variables) - args <- c(list(variable), dots) - conditional_joint_density <- as_conditional_density(fun, args) + conditional_joint_density <- as_conditional_density(fun, c(list(variable), dots)) - # get a tf function from the method which will turn that conditional density - # function into a marginal density (conditional on the inputs to the - # distribution) to be added to the overall model density + # get a list of greta arrays for the marginalisation parameters: + parameters <- method$compute_parameters( + conditional_density_fun = conditional_joint_density, + distribution_node = distribution_node, + dots = dots, + marginaliser = method + ) - # pass the conditional density function and the marginaliser to a - # marginalisation distribution + # add on any pre-computed parameters + parameters <- c(parameters, method$other_parameters) # create the distribution - vble <- distrib( + distribution <- distrib( distribution = "marginalisation", marginaliser = method, + parameters = parameters, conditional_density_fun = conditional_joint_density, - target_distribution = distr, + distribution_node = distribution_node, dots = dots ) - # return nothing (users can't have distribution nodes on their own) - invisible(NULL) + # return a list of the greta arrays computed during the marginalisation, and any other + # things from the method + output <- method$return_list(parameters) + invisible(output) } @@ -130,19 +136,28 @@ marginalisation_distribution <- R6Class( inherit = distribution_node, public = list( - tf_marginaliser = NULL, + marginaliser = NULL, + tf_marginalisation_density = NULL, conditional_density_fun = NULL, initialize = function(marginaliser, + parameters = parameters, conditional_density_fun, - target_distribution, + distribution_node, dots) { # initialize class, and add methods super$initialize("marginalisation") - self$tf_marginaliser <- marginaliser$tf_marginaliser + self$marginaliser <- marginaliser self$conditional_density_fun <- conditional_density_fun + # add the parameters + parameter_names <- names(parameters) + for (i in seq_along(parameters)) { + 
self$add_parameter(parameters[[i]], + parameter_names[i]) + } + # add the dots (extra inputs to conditional_density_fun) as parameters dot_nodes <- lapply(dots, get_node) for (i in seq_along(dot_nodes)) { @@ -152,34 +167,29 @@ marginalisation_distribution <- R6Class( # make the distribution the target self$remove_target() - self$add_target(target_distribution) + self$add_target(distribution_node) }, tf_distrib = function(parameters, dag) { - self$target - - # build the tfp distribution for the marginalisation distribution - parameter_nodes <- self$target$parameters - distrib_constructor <- dag$get_tf_object(self$target) - tf_parameter_list <- lapply(parameter_nodes, dag$get_tf_object) - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = dag) + # unpack the parameters here + are_dots <- grepl("^input ", names(parameters)) + dots <- parameters[are_dots] + parameters <- parameters[!are_dots] # the marginal density implied by the function log_prob <- function(x) { - # x will be the target; a tf constructor function for the distribution being - # marginalised. Pass in the distribution node and the dag too, in case - # the marginalisers need them. - self$tf_marginaliser(self$conditional_density_fun, - tfp_distribution = tfp_distribution, - other_args = parameters, - dag = dag, - distribution_node = self$target) + marginal_density <- self$marginaliser$tf_marginalisation_density + marginal_density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) } + list(log_prob = log_prob) } @@ -202,7 +212,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(..., reduce = TRUE, dag = NULL) { + function(..., reduce = TRUE) { tensor_inputs <- list(...) @@ -239,7 +249,7 @@ as_conditional_density <- function(r_fun, args) { sub_tfe <- sub_dag$tf_environment # pass on the batch size, used when defining data - sub_tfe$batch_size <- dag$tf_environment$batch_size + sub_tfe$batch_size <- get_batch_size() # set the input tensors as the values for the dummy greta arrays in the new # tf_environment diff --git a/R/marginalisers.R b/R/marginalisers.R index 5bb9a1ee..eaf25df3 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -46,22 +46,16 @@ discrete_marginalisation <- function(values) { stop(msg) } - # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tfp_distribution, - other_args, - dag, - distribution_node) { - - # convert these into a list of constant tensors with the correct dimensions - # and float types - values_list <- as.list(values) - values_list <- lapply(values_list, as_2d_array) - values_list <- lapply(values_list, add_first_dim) - values_list <- lapply(values_list, fl) - - # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) 
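# editor's note on the normalisation below: the weights are normalised on the
# log scale with a log-sum-exp, the stabilised form of log(sum(exp(x))). A
# plain-R sketch of the same identity (lw is a hypothetical vector of
# unnormalised log weights, not part of this patch):
#   lw <- c(-1.2, -0.5, -3.1)
#   m <- max(lw)
#   lw_norm <- lw - (m + log(sum(exp(lw - m))))
#   sum(exp(lw_norm))  # 1: the implied probabilities now sum to one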
log_weights_list <- lapply(values_list, tfp_distribution$log_prob) log_weights_list <- lapply(log_weights_list, tf_sum) @@ -75,30 +69,69 @@ discrete_marginalisation <- function(values) { axis = 1L, keepdims = TRUE ) - log_weights <- log_weights - log_weights_sum - # 2. compute the conditional joint density for each value (passing in - # other_args and the outer dag) + log_weights - log_weights_sum + + } + + # return a named list of operation greta arrays for the marginalisation parameters + compute_parameters <- function(conditional_density_fun, + distribution_node, + dots, + marginaliser) { + + # get a list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + distribution_node, + values_list, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(values_list), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + } + + # compute the marginal joint density + tf_marginalisation_density <- function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) log_density_list <- list() for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args, dag = dag) + args <- c(list(values_list[[i]]), dots) log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) } + # expand and combine the densities log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) log_density_vec <- tf$concat(log_density_list, axis = 1L) - # 3. compute a weighted sum + # compute a weighted sum log_density_weighted_vec <- log_density_vec + log_weights - tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + } + return_list_function <- function(parameters) { + list(probabilities = exp(parameters$log_weights)) } as_marginaliser(name = "discrete", - tf_marginaliser = tf_marginaliser, - parameters = list(values = values), - distribution_check = discrete_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + return_list = return_list_function, + distribution_check = discrete_check, + other_parameters = values_list) } @@ -150,38 +183,45 @@ laplace_approximation <- function(tolerance = 1e-6, stop("'max_iterations' must be a positive, scalar integer value") } + # function to compute the parameter (log probabilities) of the marginalisation # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tfp_distribution, - other_args, - dag, - distribution_node) { + tf_compute_laplace_parameters <- function(mu, + sigma, + ..., + tf_conditional_density_fun, + diagonal_hessian) { + + dots <- list(...) # simpler interface to conditional density. If reduce = TRUE, it does # reduce_sum on component densities d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) - args <- c(list(t_z), other_args, list(reduce = reduce, dag = dag)) + args <- c(list(t_z), dots, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } # get the vectors of first and second derivatives of the conditional density # function, w.r.t. 
the variable being marginalised - derivs <- function(z) { + if (diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) - if (diagonal_hessian) { - d2 <- tf$map_fn(tf$gradients, list(d1, z))[[1]] - } else { - stop ("inference with non-diagonal hessians not yet implemented") + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] d2 <- tf$hessians(y, z)[[1]] + list(d1, d2) } - - list(d1, d2) - } # negative log-posterior for the current value of z under MVN assumption @@ -191,11 +231,7 @@ laplace_approximation <- function(tolerance = 1e-6, } # tensors for parameters of MVN distribution (make mu column vector now) - mu_node <- distribution_node$parameters$mean - mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - sigma_node <- distribution_node$parameters$sigma - sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution n <- dim(mu)[[2]] @@ -316,33 +352,132 @@ laplace_approximation <- function(tolerance = 1e-6, # lots of duplicated code; this could be tidied up, but I ran out of time! z <- tf$matmul(sigma, a) + mu + # curvature of the likelihood at the mode deriv <- derivs(z) - d1 <- deriv[[1]] d2 <- deriv[[2]] - - # curvature of the likelihood w <- -d2 rw <- sqrt(w) - # decentred values of z - cf <- z - mu - # approximate posterior covariance & cholesky factor mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) + # return a list of these things + list(z = z, + mu = mu, + a = a, + u = u) + + } + + # return a named list of operation greta arrays for the marginalisation parameters + compute_parameters <- function(conditional_density_fun, + distribution_node, + dots, + marginaliser) { + + # get greta arrays for parameters of distribution node + mean <- distribution_node$parameters$mean + sigma <- distribution_node$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mu = mean, + sigma = sigma, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + diagonal_hessian = marginaliser$other_args$diagonal_hessian + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + mu <- op("mu", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("mu")) + + u <- op("chol_sigma", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("u")) + + + # pull out the elements + list(z = z, + a = a, + mu = mu, + u = u) + + } + + tf_marginalisation_density <- function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + diagonal_hessian <- other_args$diagonal_hessian + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + mu <- parameters$mu + u <- parameters$u + z <- parameters$z + a <- parameters$a + # the approximate marginal conditional posterior - logdet <- tf_sum(tf$log(tf$matrix_diag_part(u))) + u_diag <- tf$matrix_diag_part(u) + logdet <- tf_sum(tf$log(u_diag)) nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp } + return_list_function <- function(parameters) { + list(mean = parameters$z - parameters$mu, + sigma = chol2symm(parameters$u)) + } + as_marginaliser(name = "laplace", - tf_marginaliser = tf_marginaliser, - parameters = list(), - distribution_check = multivariate_normal_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + return_list = return_list_function, + distribution_check = multivariate_normal_check, + other_args = list(diagonal_hessian = diagonal_hessian)) } @@ -366,14 +501,20 @@ multivariate_normal_check <- function(distrib) { # helper to contruct marginalisers as_marginaliser <- function(name, - tf_marginaliser, - parameters, - distribution_check) { + compute_parameters, + tf_marginalisation_density, + distribution_check, + return_list, + other_parameters = list(), + other_args = list()) { obj <- list(name = name, - tf_marginaliser = tf_marginaliser, - parameters = parameters, - distribution_check = distribution_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + distribution_check = distribution_check, + return_list = return_list, + other_args = other_args, + other_parameters = other_parameters) class_name <- paste0(name, "_marginaliser") class(obj) <- c(class_name, "marginaliser") diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 379038e9..17dc2146 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -80,7 +80,6 @@ test_that("discrete_marginalisation errors nicely", { }) - test_that("inference runs with discrete marginalisation", { skip_if_not(check_tf_version()) @@ -110,7 +109,6 @@ test_that("inference runs with discrete marginalisation", { }) - test_that("discrete marginalisation gives correct densities", { skip_if_not(check_tf_version()) @@ -148,7 +146,6 @@ test_that("discrete marginalisation gives correct densities", { }) - test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) @@ -194,7 +191,7 @@ test_that("inference runs with laplace approximation", { mu <- ones(1, 8) * int marginalise(lik, multivariate_normal(mu, sigma), - laplace_approximation()) + laplace_approximation(diagonal_hessian = TRUE)) m <- model(int, sd) @@ -251,9 +248,9 @@ test_that("laplace approximation converges on correct posterior", { sigma <- diag(8) * sd ^ 2 out <- marginalise(lik, multivariate_normal(mean, sigma), - laplace_approximation()) - theta_mu_est <- calculate(out$mean) - theta_var_est <- calculate(diag(out$Sigma)) + laplace_approximation(diagonal_hessian = TRUE)) + theta_mu_est <- t(calculate(out$mean)[[1]]) + theta_var_est <- calculate(diag(out$sigma))[[1]] analytic <- cbind(mean = 
theta_mu, sd = sqrt(theta_var)) laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) From fd532a525b53675f514b95b7999a3d9215ce66a4 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 18:06:58 +1100 Subject: [PATCH 28/38] fix lints --- R/dag_class.R | 4 ++-- R/marginalise.R | 17 +++++++------- R/marginalisers.R | 9 +++++--- man/marginalisers.Rd | 32 ++++++++++++++++++--------- tests/testthat/test_marginalisation.R | 7 +++++- tests/testthat/test_posteriors.R | 4 ++++ 6 files changed, 48 insertions(+), 25 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index e1604ac6..4b3f4dff 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,8 +54,8 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type and batch size info to options, so it can be accessed by - # nodes on definition, without clunky explicit passing + # temporarily pass float type and batch size info to options, so it can be + # accessed by nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float old_batch_size <- options()$greta_batch_size diff --git a/R/marginalise.R b/R/marginalise.R index 1120a10b..5db116b7 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -101,7 +101,8 @@ marginalise <- function(fun, variable, method, ...) { # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density (unadjusted since there will be # no new variables) - conditional_joint_density <- as_conditional_density(fun, c(list(variable), dots)) + conditional_joint_density <- as_conditional_density(fun, + c(list(variable), dots)) # get a list of greta arrays for the marginalisation parameters: parameters <- method$compute_parameters( @@ -124,8 +125,8 @@ marginalise <- function(fun, variable, method, ...) 
{ dots = dots ) - # return a list of the greta arrays computed during the marginalisation, and any other - # things from the method + # return a list of the greta arrays computed during the marginalisation, and + # any other things from the method output <- method$return_list(parameters) invisible(output) @@ -181,11 +182,11 @@ marginalisation_distribution <- R6Class( # the marginal density implied by the function log_prob <- function(x) { - marginal_density <- self$marginaliser$tf_marginalisation_density - marginal_density(parameters = parameters, - tf_conditional_density_fun = self$conditional_density_fun, - dots = dots, - other_args = self$marginaliser$other_args) + density <- self$marginaliser$tf_marginalisation_density + density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) } diff --git a/R/marginalisers.R b/R/marginalisers.R index eaf25df3..ba0f562f 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -74,13 +74,14 @@ discrete_marginalisation <- function(values) { } - # return a named list of operation greta arrays for the marginalisation parameters + # return a named list of operation greta arrays for the marginalisation + # parameters compute_parameters <- function(conditional_density_fun, distribution_node, dots, marginaliser) { - # get a list of parameters in a mock greta array (just representing a tf list) + # list of parameters in a mock greta array (just representing a tf list) args <- c(operation = "marginalisation_log_weights", distribution_node, values_list, @@ -370,7 +371,7 @@ laplace_approximation <- function(tolerance = 1e-6, } - # return a named list of operation greta arrays for the marginalisation parameters + # named list of operation greta arrays for the marginalisation parameters compute_parameters <- function(conditional_density_fun, distribution_node, dots, @@ -468,8 +469,10 @@ laplace_approximation <- function(tolerance = 1e-6, } return_list_function <- function(parameters) { + list(mean = parameters$z - parameters$mu, sigma = chol2symm(parameters$u)) + } as_marginaliser(name = "laplace", diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index b5a8fdfc..c9268f6e 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,18 +8,25 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(tolerance = 1e-06, max_iterations = 50) +laplace_approximation( + tolerance = 1e-06, + max_iterations = 50, + diagonal_hessian = FALSE +) } \arguments{ -\item{values}{an R vector giving values at which to evaluate the function for a -discrete marginalisation} +\item{values}{an R vector giving values at which to evaluate the function for +a discrete marginalisation} \item{tolerance}{the (positive) numerical convergence tolerance (in units of the log conditional posterior density) of the Newton-Raphson optimisation algorithm} -\item{max_iterations}{the (positive) integer-like maximum number of iterations of -the Newton-Raphson optimisation algorithm} +\item{max_iterations}{the (positive) integer-like maximum number of +iterations of the Newton-Raphson optimisation algorithm} + +\item{diagonal_hessian}{whether the Hessian matrix should be assumed to be +diagonal, to speed up computations. See Details.} } \value{ a \code{marginaliser} object that can be passed to @@ -50,10 +57,13 @@ Functions to set up marginalisers (which explicitly integrate the distribution, that approximation error will be minimal. 
\code{laplace_approximation} can only be used to marginalise - variables following a multivariate normal distribution. In addition, the - function to be marginalised must \emph{factorise}; ie. it must return a - vector-valued density with as many elements as the vector variable being - marginalised, and each of element of the density must depend only on the - corresponding element of the variable vector. This is the responsibility of - the user, and is not checked. + variables following a multivariate normal distribution. + + The argument \code{diagonal_hessian} can be used to state that the + conditional density factorises along the elements of the variable being + marginalised, and therefore the Hessian matrix of this function can be + assumed to be diagonal. A conditional density function factorises if each + observation in the conditional density depends only on the corresponding + element of the variable being marginalised. If this is not the case and you + set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 17dc2146..fc0520d6 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -206,10 +206,12 @@ test_that("laplace approximation converges on correct posterior", { skip_if_not(check_tf_version()) source("helpers.R") + # nolint start # test vs analytic posterior on 8 schools data with no pooling: # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) # the posterior for theta is normal, so laplace should be exact + # nolint end # eight schools data y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) @@ -221,16 +223,19 @@ test_that("laplace approximation converges on correct posterior", { # analytic solution: + # nolint start # Marginalising int we can write: # Bayes theorem gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: - # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * + # N(theta_i | mu, sd ^ 2) # which is equivalent to: # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i # conjugate prior, see Wikipedia conjugate prior table + # nolint end obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R index 71e7bbdc..d2e91c5c 100644 --- a/tests/testthat/test_posteriors.R +++ b/tests/testthat/test_posteriors.R @@ -41,9 +41,11 @@ test_that("posterior is correct (normal)", { skip_if_not(check_tf_version()) source("helpers.R") + # nolint start # test vs analytic posterior on 8 schools data with no pooling: # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) + # nolint end # eight schools data y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) @@ -53,6 +55,7 @@ test_that("posterior is correct (normal)", { mu <- rnorm(1) sd <- abs(rnorm(1)) + # nolint start # Bayes theorem gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -62,6 +65,7 @@ # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i # conjugate prior, see Wikipedia conjugate prior table + # nolint end obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) From
57047d83018cbd50cbaa91104e51ab591496c8f9 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 24 Feb 2020 16:59:15 +1100 Subject: [PATCH 29/38] fix golden section search --- R/utils.R | 93 +++++++++++++++++++++++--------------- tests/testthat/test_misc.R | 8 ++-- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/R/utils.R b/R/utils.R index 8701c648..da482f32 100644 --- a/R/utils.R +++ b/R/utils.R @@ -409,17 +409,14 @@ gss <- function(func, lower = 0, upper = 1, max_iterations = 50, - tolerance = 1e-32) { + tolerance = 1e-8) { lower <- tf$constant(lower, tf_float(), shape(1)) upper <- tf$constant(upper, tf_float(), shape(1)) maxiter <- tf$constant(as.integer(max_iterations), tf$int32) tol <- fl(tolerance) - iter <- tf$constant(1L, tf$int32) - - phi <- (1 + sqrt(5)) / 2 - phim1 <- fl(phi - 1) - twomphi <- fl(2.0 - phi) + iter <- tf$constant(0L, tf$int32) + golden_ratio <- fl(2 / (sqrt(5) + 1)) # expand out to batch dimensions if (!inherits(batch_dim, "tensorflow.tensor")) { @@ -427,56 +424,78 @@ gss <- function(func, batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) } + # initial evaluation points left <- tf$tile(lower, batch_dim) right <- tf$tile(upper, batch_dim) width <- right - left - optimum <- tf$zeros(batch_dim, tf_float()) + d <- golden_ratio * width + x1 <- right - d + x2 <- left + d - # status of the multiple consecutive searches; three columns, for left, right - # and optimum - status <- tf$stack(list(left, right, optimum), axis = 1L) + values <- list(left, right, x1, x2, width, iter) # start loop - body <- function(left, right, optimum, width, iter) { - - d <- phim1 * width - - # find evaluation points - a <- left + d - b <- right - d - - # prep reordered matrices for whether steps are above or below - # if func(a) < func(b) - above <- tf$stack(list(left, b, a), axis = 1L) - # else - not_above <- tf$stack(list(a, right, b), axis = 1L) - - status <- tf$where(tf$less(func(b), func(a)), above, not_above) + body <- function(left, right, x1, x2, width, iter) { + + # prep lists of vectors for whether steps are above x1 or below x2 + + # order: lower, upper, x1, x2, width + + # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 + below_width <- x2 - left + below <- list( + left, + x2, + x2 - golden_ratio * below_width, + x1, + below_width + ) + + # if the minimum is above x1, shift the bounds and reuse x2 as the new x1 + above_width <- right - x1 + above <- list( + x1, + right, + x2, + x1 + golden_ratio * above_width, + above_width + ) + + # convert to matrices + below <- tf$stack(below, axis = 1L) + above <- tf$stack(above, axis = 1L) + + # can we stash and reuse these and reduce the number of function evaluations? 
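# editor's aside: a minimal single-search R sketch of the golden-section
# recurrence this body implements, assuming func is unimodal on
# [lower, upper] (the TF version runs the same update for all batches at
# once inside tf$while_loop; like this sketch it re-evaluates both points
# each pass, the cost the comment above asks about, which the next patch
# removes by carrying f1 and f2 through the loop state):
#   gss_r <- function(f, lower, upper, tol = 1e-8) {
#     gr <- 2 / (sqrt(5) + 1)
#     while (upper - lower > tol) {
#       width <- upper - lower
#       x1 <- upper - gr * width
#       x2 <- lower + gr * width
#       if (f(x2) > f(x1)) upper <- x2 else lower <- x1
#     }
#     (lower + upper) / 2
#   }
#   gss_r(function(x) (x - 0.3) ^ 2, 0, 1)  # ~ 0.3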
+ f1 <- func(x1) + f2 <- func(x2) + + status <- tf$where(tf$greater(f2, f1), below, above) left <- status[, 0] right <- status[, 1] - optimum <- status[, 2] - - width <- right - left + x1 <- status[, 2] + x2 <- status[, 3] + width <- status[, 4] - list(left, right, optimum, width, iter + 1L) + list(left, right, x1, x2, width, iter + 1L) } - cond <- function(left, right, optimum, width, iter) { - err <- twomphi * tf$abs(width / optimum) - not_converged <- tf$less(tol, err) + cond <- function(left, right, x1, x2, width, iter) { + not_converged <- tf$less(tol, tf$abs(width)) not_all_converged <- tf$reduce_any(not_converged) in_time <- tf$less(iter, maxiter) in_time & not_all_converged } - values <- list(left, right, optimum, width, iter) - out <- tf$while_loop(cond, body, values) - result <- out[3:5] - names(result) <- c("minimum", "width", "iterations") - result + # get minimum value + width <- out[[5]] + min <- out[[1]] + width / fl(2) + + list(minimum = min, + width = width, + iterations = out[[6]]) } diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index d4a7590b..ef0e39e6 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -293,17 +293,17 @@ test_that("golden section search works", { n_batch <- 4 + # random optima + locs <- runif(n_batch) + # simple quadratic function with different optima for each batch func <- function(x) { (x - locs) ^ 2 } - # random optima - locs <- runif(n_batch, 0.2, 0.8) - # run the minimisation ss <- gss(func, n_batch) result <- tf$Session()$run(ss) - compare_op(result$minimum, locs, tolerance = 0.2) + compare_op(result$minimum, locs) }) From 60ba1b26bff63aacc88a6842613f3aaf67f44a6b Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 24 Feb 2020 17:09:09 +1100 Subject: [PATCH 30/38] stash function calls in gss() --- R/utils.R | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/R/utils.R b/R/utils.R index da482f32..08c75b48 100644 --- a/R/utils.R +++ b/R/utils.R @@ -409,7 +409,7 @@ gss <- function(func, lower = 0, upper = 1, max_iterations = 50, - tolerance = 1e-8) { + tolerance = .Machine$double.eps ^ 0.25) { lower <- tf$constant(lower, tf_float(), shape(1)) upper <- tf$constant(upper, tf_float(), shape(1)) @@ -431,14 +431,15 @@ gss <- function(func, d <- golden_ratio * width x1 <- right - d x2 <- left + d + f1 <- func(x1) + f2 <- func(x2) - values <- list(left, right, x1, x2, width, iter) + values <- list(left, right, x1, x2, f1, f2, width, iter) # start loop - body <- function(left, right, x1, x2, width, iter) { + body <- function(left, right, x1, x2, f1, f2, width, iter) { # prep lists of vectors for whether steps are above x1 or below x2 - # order: lower, upper, x1, x2, width # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 @@ -465,11 +466,8 @@ gss <- function(func, below <- tf$stack(below, axis = 1L) above <- tf$stack(above, axis = 1L) - # can we stash and reuse these and reduce the number of function evaluations? 
- f1 <- func(x1) - f2 <- func(x2) - - status <- tf$where(tf$greater(f2, f1), below, above) + is_below <- tf$greater(f2, f1) + status <- tf$where(is_below, below, above) left <- status[, 0] right <- status[, 1] @@ -477,10 +475,19 @@ gss <- function(func, x2 <- status[, 3] width <- status[, 4] - list(left, right, x1, x2, width, iter + 1L) + # either recompute f1 (f2) at new location, or use f2 (f1) in its place + + # this is a bit convoluted, but ensures function calls don't need to be + # duplicated, whilst maintaining the vectorisation + x_to_evaluate <- tf$where(is_below, x1, x2) + new_f <- func(x_to_evaluate) + new_f1 <- tf$where(is_below, new_f, f2) + new_f2 <- tf$where(is_below, f1, new_f) + + list(left, right, x1, x2, new_f1, new_f2, width, iter + 1L) } - cond <- function(left, right, x1, x2, width, iter) { + cond <- function(left, right, x1, x2, f1, f2, width, iter) { not_converged <- tf$less(tol, tf$abs(width)) not_all_converged <- tf$reduce_any(not_converged) in_time <- tf$less(iter, maxiter) @@ -490,12 +497,12 @@ gss <- function(func, out <- tf$while_loop(cond, body, values) # get minimum value - width <- out[[5]] + width <- out[[7]] min <- out[[1]] + width / fl(2) list(minimum = min, width = width, - iterations = out[[6]]) + iterations = out[[8]]) } From 5e3dd969e6e2174d275cebd838e1b9f67163476a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 16:23:36 +1100 Subject: [PATCH 31/38] make laplace agree with GRaF --- R/marginalisers.R | 46 +++++++++++++++++---------- tests/testthat/test_marginalisation.R | 7 ++-- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index ba0f562f..40d4a161 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -220,7 +220,7 @@ laplace_approximation <- function(tolerance = 1e-6, derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$hessians(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
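+      # why it won't work: with a batched z of shape (batch, n, 1),
+      # tf$hessians differentiates the *summed* objective and returns a
+      # tensor of shape (batch, n, 1, batch, n, 1), entangling the batch
+      # elements. A batch-aware dense Hessian would need one gradient pass
+      # per element of z; a hypothetical sketch only, since the exact
+      # indexing depends on z's shape:
+      #   d1 <- tf$gradients(y, z)[[1]]
+      #   rows <- lapply(seq_len(n), function(i)
+      #     tf$gradients(d1[, i - 1L, ], z)[[1]])
+      #   d2 <- tf$stack(rows, axis = 1L)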
list(d1, d2) } } @@ -237,13 +237,9 @@ laplace_approximation <- function(tolerance = 1e-6, # dimension of the MVN distribution n <- dim(mu)[[2]] - # randomly initialise z, and expand to batch (warm starts will need some TF - # trickery) - # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z_value <- add_first_dim(as_2d_array(rnorm(n))) - z <- tf$constant(z_value, dtype = tf_float()) + z <- tf$identity(mu) # Newton-Raphson parameters tol <- tf$constant(tolerance, tf_float(), shape(1)) @@ -253,11 +249,10 @@ laplace_approximation <- function(tolerance = 1e-6, # other objects a_value <- add_first_dim(as_2d_array(rep(0, n))) - a <- tf$constant(a_value, dtype = tf_float()) + a <- fl(a_value) u_value <- add_first_dim(diag(n)) - u <- tf$constant(u_value, tf_float()) - eye <- tf$constant(add_first_dim(diag(n)), - dtype = tf_float()) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) # match batches on everything going into the loop that will have a batch # dimension later @@ -310,11 +305,11 @@ laplace_approximation <- function(tolerance = 1e-6, s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a) + mu + z_new <- tf$matmul(sigma, a_new) + mu psi(a_new, z_new, mu) } - ls_results <- gss(psiline, batch_dim) + ls_results <- gss(psiline, batch_dim, upper = 2) stepsize <- ls_results$minimum stepsize <- tf$expand_dims(stepsize, 1L) stepsize <- tf$expand_dims(stepsize, 2L) @@ -363,11 +358,17 @@ laplace_approximation <- function(tolerance = 1e-6, mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + # return a list of these things list(z = z, mu = mu, a = a, - u = u) + u = u, + iterations = iter, + converged = converged) } @@ -423,12 +424,23 @@ laplace_approximation <- function(tolerance = 1e-6, tf_operation = "get_element", operation_args = list("u")) + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) # pull out the elements list(z = z, a = a, mu = mu, - u = u) + u = u, + iterations = iterations, + converged = converged) } @@ -470,8 +482,10 @@ laplace_approximation <- function(tolerance = 1e-6, return_list_function <- function(parameters) { - list(mean = parameters$z - parameters$mu, - sigma = chol2symm(parameters$u)) + list(mean = t(parameters$z), + sigma = chol2symm(parameters$u), + iterations = parameters$iterations, + converged = parameters$converged) } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index fc0520d6..04f43382 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -254,8 +254,9 @@ test_that("laplace approximation converges on correct posterior", { out <- marginalise(lik, multivariate_normal(mean, sigma), laplace_approximation(diagonal_hessian = TRUE)) - theta_mu_est <- t(calculate(out$mean)[[1]]) - theta_var_est <- calculate(diag(out$sigma))[[1]] + res <- calculate(mean = t(out$mean), diag_sigma = diag(out$sigma)) + theta_mu_est <- res$mean + theta_var_est <- res$diag_sigma analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) @@ -263,4 +264,6 @@ test_that("laplace approximation 
converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) + # modes are right, sds are not! + }) From 2bae9abf7dd1fc8a1a24ae1de78bb3692b22800f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 17:17:33 +1100 Subject: [PATCH 32/38] get laplace variances working --- R/marginalisers.R | 43 ++++++++++++++++++--------- tests/testthat/test_marginalisation.R | 9 ++---- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index 40d4a161..c5b3f3a9 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -354,9 +354,19 @@ laplace_approximation <- function(tolerance = 1e-6, w <- -d2 rw <- sqrt(w) - # approximate posterior covariance & cholesky factor + # approximate posterior covariance + # do we need the eye? mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - u <- tf$cholesky(mat1) + l <- tf$cholesky(mat1) + v <- tf$linalg$triangular_solve(matrix = l, + rhs = sigma * rw, + lower = TRUE, + adjoint = TRUE) + covar <- sigma - tf$linalg$matmul(v, v, transpose_b = TRUE) + + # log-determinant of l + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) # convergence information iter <- out[[7]] @@ -366,7 +376,8 @@ laplace_approximation <- function(tolerance = 1e-6, list(z = z, mu = mu, a = a, - u = u, + logdet = logdet, + covar = covar, iterations = iter, converged = converged) @@ -418,11 +429,16 @@ laplace_approximation <- function(tolerance = 1e-6, tf_operation = "get_element", operation_args = list("mu")) - u <- op("chol_sigma", - parameter_list, - dim = dim(sigma), - tf_operation = "get_element", - operation_args = list("u")) + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) iterations <- op("iterations", parameter_list, @@ -438,7 +454,8 @@ laplace_approximation <- function(tolerance = 1e-6, list(z = z, a = a, mu = mu, - u = u, + logdet = logdet, + covar = covar, iterations = iterations, converged = converged) @@ -467,14 +484,12 @@ laplace_approximation <- function(tolerance = 1e-6, } mu <- parameters$mu - u <- parameters$u + logdet <- parameters$logdet z <- parameters$z a <- parameters$a # the approximate marginal conditional posterior - u_diag <- tf$matrix_diag_part(u) - logdet <- tf_sum(tf$log(u_diag)) - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + nmcp <- psi(a, z, mu) + tf$squeeze(u_logdet, 1) -nmcp @@ -483,7 +498,7 @@ laplace_approximation <- function(tolerance = 1e-6, return_list_function <- function(parameters) { list(mean = t(parameters$z), - sigma = chol2symm(parameters$u), + sigma = parameters$covar, iterations = parameters$iterations, converged = parameters$converged) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 04f43382..72f70782 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -243,14 +243,12 @@ test_that("laplace approximation converges on correct posterior", { theta_mu <- (y * obs_prec + mu * prec) * theta_var theta_sd <- sqrt(theta_var) - # Laplace solution: - lik <- function(theta) { - distribution(y) <- normal(t(theta), obs_sd) - } - # mock up as a multivariate normal distribution mean <- ones(1, 8) * mu sigma <- diag(8) * sd ^ 2 + lik <- function(theta) { + distribution(y) <- normal(t(theta), obs_sd) + } out <- marginalise(lik, multivariate_normal(mean, 
sigma), laplace_approximation(diagonal_hessian = TRUE)) @@ -264,6 +262,5 @@ test_that("laplace approximation converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) - # modes are right, sds are not! }) From ca8920ba882b7e94e7c5e9dfcd15c2f68361be15 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 18:12:35 +1100 Subject: [PATCH 33/38] make laplace work for MVN test case --- R/marginalisers.R | 14 +++---- tests/testthat/test_marginalisation.R | 60 ++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c5b3f3a9..7dd0c37c 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -343,7 +343,7 @@ laplace_approximation <- function(tolerance = 1e-6, a <- out[[2]] - # apparently we need to redefine z and u here, or the backprop errors + # apparently we need to redefine z etc. here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! z <- tf$matmul(sigma, a) + mu @@ -353,18 +353,14 @@ laplace_approximation <- function(tolerance = 1e-6, d2 <- deriv[[2]] w <- -d2 rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) # approximate posterior covariance - # do we need the eye? - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - l <- tf$cholesky(mat1) - v <- tf$linalg$triangular_solve(matrix = l, - rhs = sigma * rw, - lower = TRUE, - adjoint = TRUE) - covar <- sigma - tf$linalg$matmul(v, v, transpose_b = TRUE) + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) l_diag <- tf$matrix_diag_part(l) logdet <- tf_sum(tf$log(l_diag)) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 72f70782..b2a5113d 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -201,7 +201,7 @@ test_that("inference runs with laplace approximation", { }) -test_that("laplace approximation converges on correct posterior", { +test_that("laplace approximation has correct posterior for univariate normal", { skip_if_not(check_tf_version()) source("helpers.R") @@ -224,7 +224,6 @@ test_that("laplace approximation converges on correct posterior", { # analytic solution: # nolint start - # Marginalising int we can write: # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -262,5 +261,62 @@ test_that("laplace approximation converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) +}) + +test_that("laplace approximation has correct posterior for multivariate normal", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # nolint start + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta ~ MVN(mu, sigma) + # the posterior for theta is multivariate normal, so laplace should be exact + # nolint end + + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for mu and sigma + mu <- rnorm(8) + sigma <- rwish(1, 9, diag(8))[1, , ] + + # analytic solution: + + # nolint start + # Bayes theorum gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * + # N(theta_i | mu, sd ^ 2) + # which is 
equivalent to:
+  # p(theta | y) \propto MVN(theta_mu, theta_sigma)
+  # theta_sigma = (sigma^-1 + diag(1 / obs_sd^2))^-1
+  # theta_mu = theta_sigma (sigma^-1 mu + diag(1 / obs_sd^2) y)
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  i_obs_sigma <- diag(1 / (obs_sd ^ 2))
+  i_sigma <- solve(sigma)
+  theta_sigma <- solve(i_sigma + i_obs_sigma)
+  theta_mu <- theta_sigma %*% (i_sigma %*% mu + i_obs_sigma %*% y)
+  theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)]
+
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  out <- marginalise(lik,
+                     multivariate_normal(t(mu), sigma),
+                     laplace_approximation(diagonal_hessian = TRUE, tolerance = 1e-32))
+  res <- calculate(mean = t(out$mean), sigma = out$sigma, iterations = out$iterations)
+  theta_mu_est <- res$mean
+  theta_sigma_est <- res$sigma
+  theta_sigma_est_flat <- theta_sigma_est[upper.tri(theta_sigma_est, diag = TRUE)]
+
+  # compare these to within a tolerance
+  compare_op(theta_mu, theta_mu_est)
+  compare_op(theta_sigma_flat, theta_sigma_est_flat)
})

From 5191f45219966f31e56392690cc4ea18bade40c3 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Thu, 27 Feb 2020 09:15:54 +1100
Subject: [PATCH 34/38] bugfix and add notes for univariate Laplace

---
 R/marginalisers.R | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/R/marginalisers.R b/R/marginalisers.R
index 7dd0c37c..9ffdd8a3 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -170,6 +170,30 @@ laplace_approximation <- function(tolerance = 1e-6,
   # factorising)
   # - handle an iid normal distribution too.

+  # the following algorithm for a univariate normal distribution is derived
+  # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior
+  # it needs checking!
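+  # a quick plain-R check of the update rule spelled out in the notes below,
+  # assuming a single observation y ~ N(z, obs_sd^2) and a prior
+  # z ~ N(mu, sd^2), so that w = 1 / obs_sd ^ 2 and d1(z) = (y - z) / obs_sd ^ 2
+  # (reading the '1 / sd' term as the prior precision, 1 / sd ^ 2):
+  #   z <- mu
+  #   z <- mu + (w * (z - mu) + (y - z) / obs_sd ^ 2) / (1 / sd ^ 2 + w)
+  # a single step lands on the conjugate posterior mean,
+  #   (y / obs_sd ^ 2 + mu / sd ^ 2) / (1 / obs_sd ^ 2 + 1 / sd ^ 2),
+  # which matches theta_mu in the univariate test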
+ + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) + + # instead work with decentred variable a where z = a * sd ^ 2 + mu + # l = sqrt(1 + w * sd) + # b = w * (z - mu) * d1(z) + # a_diff = b − (w * sd * b) / l ^ 2 - a + # a_new = a + s * a_diff + # z_new = a_new * sd ^ 2 + mu + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + if (!(is.numeric(tolerance) && is.vector(tolerance) && length(tolerance) == 1 && @@ -288,7 +312,8 @@ laplace_approximation <- function(tolerance = 1e-6, # decentred values of z cf <- z - mu - # approximate posterior covariance & cholesky factor + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) l <- tf_transpose(u) @@ -485,7 +510,7 @@ laplace_approximation <- function(tolerance = 1e-6, a <- parameters$a # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(u_logdet, 1) + nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp From f8b4755a2c1291e2fc91596458bfe85af379d55c Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 11:38:51 +1100 Subject: [PATCH 35/38] add failing test for univariate normal laplace --- tests/testthat/test_marginalisation.R | 79 +++++++++++++-------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index b2a5113d..08d2f521 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -211,19 +211,6 @@ test_that("laplace approximation has correct posterior for univariate normal", { # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) # the posterior for theta is normal, so laplace should be exact - # nolint end - - # eight schools data - y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) - obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) - - # prior parameters for int, and fixed variance of theta - mu <- rnorm(1) - sd <- abs(rnorm(1)) - - # analytic solution: - - # nolint start # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -236,30 +223,43 @@ test_that("laplace approximation has correct posterior for univariate normal", { # conjugate prior, see Wikipedia conjugate prior table # nolint end + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # analytic solution obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) theta_var <- 1 / (obs_prec + prec) theta_mu <- (y * obs_prec + mu * prec) * theta_var theta_sd <- sqrt(theta_var) + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) - # mock up as a multivariate normal distribution - mean <- ones(1, 8) * mu - sigma <- diag(8) * sd ^ 2 lik <- function(theta) { distribution(y) <- normal(t(theta), obs_sd) } + + # analyse as univariate normal out <- marginalise(lik, - multivariate_normal(mean, sigma), + normal(mu, sd, dim = 8), laplace_approximation(diagonal_hessian = TRUE)) - res <- calculate(mean = t(out$mean), diag_sigma = diag(out$sigma)) - theta_mu_est <- res$mean - 
theta_var_est <- res$diag_sigma + res <- do.call(calculate, out) + laplace_uni <- cbind(mean = res$mean, sd = res$sd) + compare_op(analytic, laplace_uni) - analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) - laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) - - # compare these to within a tolerance - compare_op(analytic, laplace) + # analyse as multivariate normal + mean <- ones(1, 8) * mu + sigma <- diag(8) * sd ^ 2 + out <- marginalise(lik, + multivariate_normal(mean, sigma), + laplace_approximation(diagonal_hessian = TRUE)) + res <- do.call(calculate, out) + laplace_multi <- cbind(mean = res$mean[1, ], sd = sqrt(diag(res$sigma))) + compare_op(analytic, laplace_multi) }) @@ -273,19 +273,6 @@ test_that("laplace approximation has correct posterior for multivariate normal", # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta ~ MVN(mu, sigma) # the posterior for theta is multivariate normal, so laplace should be exact - # nolint end - - # eight schools data - y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) - obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) - - # prior parameters for mu and sigma - mu <- rnorm(8) - sigma <- rwish(1, 9, diag(8))[1, , ] - - # analytic solution: - - # nolint start # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -298,10 +285,18 @@ test_that("laplace approximation has correct posterior for multivariate normal", # conjugate prior, see Wikipedia conjugate prior table # nolint end - i_obs_sigma <- diag(1 / (obs_sd ^ 2)) - i_sigma <- solve(sigma) - theta_sigma <- solve(i_sigma + i_obs_sigma) - theta_mu <- theta_sigma %*% (i_sigma %*% mu + i_obs_sigma %*% y) + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for mu and sigma + mu <- rnorm(8) + sigma <- rwish(1, 9, diag(8))[1, , ] + + obs_prec <- diag(1 / (obs_sd ^ 2)) + prec <- solve(sigma) + theta_sigma <- solve(prec + obs_prec) + theta_mu <- theta_sigma %*% (prec %*% mu + obs_prec %*% y) theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)] lik <- function(theta) { From fc8fe92688b41fab84edc0329b1944204cf8ff73 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 12:46:32 +1100 Subject: [PATCH 36/38] switch marginalisers to R6 classes --- R/marginalise.R | 17 +- R/marginalisers.R | 1046 +++++++++++++------------ tests/testthat/test_marginalisation.R | 45 +- 3 files changed, 578 insertions(+), 530 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 5db116b7..34b393b4 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -91,8 +91,12 @@ marginalise <- function(fun, variable, method, ...) { "See ?marginalise for options") } + # construct the marginaliser + method_args <- method[which(names(method) != "class")] + marginaliser <- do.call(method$class$new, method_args) + # check the distribution is compatible with the method - method$distribution_check(distribution_node) + marginaliser$distribution_check(distribution_node) # excise the variable from the distribution distribution_node$remove_target() @@ -105,20 +109,19 @@ marginalise <- function(fun, variable, method, ...) 
{ c(list(variable), dots)) # get a list of greta arrays for the marginalisation parameters: - parameters <- method$compute_parameters( + parameters <- marginaliser$compute_parameters( conditional_density_fun = conditional_joint_density, distribution_node = distribution_node, - dots = dots, - marginaliser = method + dots = dots ) # add on any pre-computed parameters - parameters <- c(parameters, method$other_parameters) + parameters <- c(parameters, marginaliser$other_parameters) # create the distribution distribution <- distrib( distribution = "marginalisation", - marginaliser = method, + marginaliser = marginaliser, parameters = parameters, conditional_density_fun = conditional_joint_density, distribution_node = distribution_node, @@ -127,7 +130,7 @@ marginalise <- function(fun, variable, method, ...) { # return a list of the greta arrays computed during the marginalisation, and # any other things from the method - output <- method$return_list(parameters) + output <- marginaliser$return_list(parameters) invisible(output) } diff --git a/R/marginalisers.R b/R/marginalisers.R index 9ffdd8a3..d33979bf 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -1,5 +1,538 @@ # marginaliser constructors +# add a marginaliser R6 class + +marginaliser <- R6Class( + "marginaliser", + public = list( + + name = "marginaliser", + + other_parameters = list(), + other_args = list(), + + initialize = function(...) { + invisible(NULL) + }, + + return_list = function(parameters) { + list() + } + + ) +) + +discrete_marginaliser <- R6Class( + "discrete marginaliser", + inherit = marginaliser, + public = list( + + initialize = function(values) { + + super$initialize() + + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop(msg) + } + + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + self$other_parameters <- values_list + + }, + + # return a named list of operation greta arrays for the marginalisation + # parameters + compute_parameters = function(conditional_density_fun, + distribution_node, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) 
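+        # what these weights are for: the marginal density later needs
+        # log p(y) = logsumexp_k [ log p(y | v_k) + log p(v_k) ], and this
+        # function returns the normalised log p(v_k) terms. A tiny plain-R
+        # analogue, assuming values 0:10 under a poisson(3) variable:
+        #   lw <- dpois(0:10, lambda = 3, log = TRUE)
+        #   lw <- lw - log(sum(exp(lw)))  # naive logsumexp normalisation
+        #   sum(exp(lw))                  # 1, as required
+        # the tensorflow code below does the same, batched, with
+        # reduce_logsumexp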
+ log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + + log_weights - log_weights_sum + + } + + # list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + distribution_node, + self$other_parameters, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(self$other_parameters), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + }, + + # compute the marginal joint density + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), dots) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + # expand and combine the densities + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + }, + + return_list = function(parameters) { + list(probabilities = exp(parameters$log_weights)) + }, + + # check that the distribution is discrete + distribution_check = function(distrib) { + if (!distrib$discrete) { + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } + } + + ) +) + +laplace_marginaliser <- R6Class( + "laplace marginaliser", + inherit = marginaliser, + public = list( + + tolerance = NULL, + max_iterations = NULL, + diagonal_hessian = NULL, + multivariate = FALSE, + + initialize = function(tolerance, + max_iterations, + diagonal_hessian) { + + super$initialize() + + # in future: + # - enable warm starts for subsequent steps of the outer inference algorithm + # - enable a non-factorising version (have the user say whether it is + # factorising) + # - handle an iid normal distribution too. + + # the following algorithm for a univariate normal distribution is derived + # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior + # it neeeds checking! 
+ + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) + + # instead work with decentred variable a where z = a * sd ^ 2 + mu + # l = sqrt(1 + w * sd) + # b = w * (z - mu) * d1(z) + # a_diff = b − (w * sd * b) / l ^ 2 - a + # a_new = a + s * a_diff + # z_new = a_new * sd ^ 2 + mu + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + + self$other_args <- list(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters = function(conditional_density_fun, + distribution_node, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + # define the marginalisation function + tf_compute_laplace_parameters = function(mu, + sigma, + ..., + tf_conditional_density_fun, + diagonal_hessian, + tolerance, + max_iterations) { + + dots <- list(...) + + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
+ list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # tensors for parameters of MVN distribution (make mu column vector now) + mu <- tf_transpose(mu) + + # dimension of the MVN distribution + n <- dim(mu)[[2]] + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mu) + + # Newton-Raphson parameters + tol <- tf$constant(tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(max_iterations) + + # other objects + a_value <- add_first_dim(as_2d_array(rep(0, n))) + a <- fl(a_value) + u_value <- add_first_dim(diag(n)) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mu, sigma, z, a, u, obj_old, tol) + objects <- greta:::match_batches(objects) + mu <- objects[[1]] + sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + u <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + rw <- sqrt(w) + + # decentred values of z + cf <- z - mu + + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + l <- tf_transpose(u) + + # compute Newton-Raphson update direction + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + a_new <- a + s * adiff + z_new <- tf$matmul(sigma, a_new) + mu + psi(a_new, z_new, mu) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + a_new <- a + stepsize * adiff + z_new <- tf$matmul(sigma, a_new) + mu + obj <- psi(a_new, z_new, mu) + + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + a, + u, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + a <- out[[2]] + + # apparently we need to redefine z etc. here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! 
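+      # one possible tidy-up, sketched under the same constraint: a local
+      # helper shared by the loop body and this post-loop block, e.g. a
+      # hypothetical
+      #   mode_state <- function(a) {
+      #     z <- tf$matmul(sigma, a) + mu
+      #     w <- -derivs(z)[[2]]
+      #     list(z = z, w = w, rw = sqrt(w))
+      #   }
+      # would cut the repetition while still rebuilding the tensors outside
+      # the while loop, avoiding the backprop issue noted above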
+ z <- tf$matmul(sigma, a) + mu + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) + + # approximate posterior covariance + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) + + # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) + + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + mu = mu, + a = a, + logdet = logdet, + covar = covar, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- distribution_node$parameters$mean + sigma <- distribution_node$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mu = mean, + sigma = sigma, + dots, + list( + operation_args = c( + tf_conditional_density_fun = conditional_density_fun, + self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + mu <- op("mu", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("mu")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + a = a, + mu = mu, + logdet = logdet, + covar = covar, + iterations = iterations, + converged = converged) + + }, + + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + diagonal_hessian <- other_args$diagonal_hessian + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + mu <- parameters$mu + logdet <- parameters$logdet + z <- parameters$z + a <- parameters$a + + # the approximate marginal conditional posterior + nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + + -nmcp + + }, + + return_list = function(parameters) { + + list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + }, + + # check that the distribution is normal + distribution_check = function(distrib) { + name <- distrib$distribution_name + if (!name %in% c("multivariate_normal")) { + stop("the Laplace approximation can only be used ", + "with a multivariate normal distribution", + call. = FALSE) + } + } + + ) +) + #' @name marginalisers #' #' @title marginalisation methods @@ -36,103 +569,8 @@ NULL #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { - if (!is.vector(values) | !is.numeric(values)) { - msg <- "'values' must be an R numeric vector" - - if (inherits(values, "greta_array")) { - msg <- paste0(msg, ", not a greta array") - } - - stop(msg) - } - - # convert values to a list of data greta arrays - values_list <- as.list(values) - values_list <- lapply(values_list, as_data) - names(values_list) <- paste0("value_", seq_along(values_list)) - - # function to compute the parameter (log probabilities) of the marginalisation - tf_compute_log_weights <- function(tfp_distribution, ...) { - - # and compute log probabilities for the values - values_list <- list(...) 
- log_weights_list <- lapply(values_list, tfp_distribution$log_prob) - log_weights_list <- lapply(log_weights_list, tf_sum) - - # convert to a vector of discrete probabilities and make them sum to 1, - # whilst staying on the log scale - log_weights <- tf$concat(log_weights_list, axis = 1L) - - # normalise weights on log scale - log_weights_sum <- tf$reduce_logsumexp( - log_weights, - axis = 1L, - keepdims = TRUE - ) - - log_weights - log_weights_sum - - } - - # return a named list of operation greta arrays for the marginalisation - # parameters - compute_parameters <- function(conditional_density_fun, - distribution_node, - dots, - marginaliser) { - - # list of parameters in a mock greta array (just representing a tf list) - args <- c(operation = "marginalisation_log_weights", - distribution_node, - values_list, - tf_operation = "tf_compute_log_weights", - list(dim = c(length(values_list), 1L))) - log_weights <- do.call(op, args) - - list(log_weights = log_weights) - - } - - # compute the marginal joint density - tf_marginalisation_density <- function(parameters, - tf_conditional_density_fun, - dots, - other_args) { - - # unpack the parameters - log_weights <- parameters$log_weights - values_list <- parameters[names(parameters) != "log_weights"] - - # compute the conditional joint density for each value (passing in - # dots too) - log_density_list <- list() - for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), dots) - log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) - } - - # expand and combine the densities - log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) - log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) - log_density_vec <- tf$concat(log_density_list, axis = 1L) - - # compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights - density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - density - - } - - return_list_function <- function(parameters) { - list(probabilities = exp(parameters$log_weights)) - } - - as_marginaliser(name = "discrete", - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - return_list = return_list_function, - distribution_check = discrete_check, - other_parameters = values_list) + as_marginaliser(values = values, + class = discrete_marginaliser) } @@ -164,420 +602,24 @@ laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50, diagonal_hessian = FALSE) { - # in future: - # - enable warm starts for subsequent steps of the outer inference algorithm - # - enable a non-factorising version (have the user say whether it is - # factorising) - # - handle an iid normal distribution too. - - # the following algorithm for a univariate normal distribution is derived - # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior - # it neeeds checking! 
- - # nolint start - - # with univariate normal and diagonal hessian, iteration is: - # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) - - # instead work with decentred variable a where z = a * sd ^ 2 + mu - # l = sqrt(1 + w * sd) - # b = w * (z - mu) * d1(z) - # a_diff = b − (w * sd * b) / l ^ 2 - a - # a_new = a + s * a_diff - # z_new = a_new * sd ^ 2 + mu - - # estimated sd at mode is: - # sqrt(1 / (1 / sd + w)) - - # the approximated negative marginal density is: - # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) - - # nolint end - - if (!(is.numeric(tolerance) && - is.vector(tolerance) && - length(tolerance) == 1 && - tolerance > 0)) { - stop("'tolerance' must be a positive, scalar numeric value") - } - - max_iterations <- as.integer(max_iterations) - if (!(is.vector(max_iterations) && - length(max_iterations) == 1 && - max_iterations > 0)) { - stop("'max_iterations' must be a positive, scalar integer value") - } - - # function to compute the parameter (log probabilities) of the marginalisation - # define the marginalisation function - tf_compute_laplace_parameters <- function(mu, - sigma, - ..., - tf_conditional_density_fun, - diagonal_hessian) { - - dots <- list(...) - - # simpler interface to conditional density. If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } - - # get the vectors of first and second derivatives of the conditional density - # function, w.r.t. the variable being marginalised - if (diagonal_hessian) { - derivs <- function(z) { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$gradients(d1, z)[[1]] - list(d1, d2) - } - } else { - - stop("inference with non-diagonal hessians is not yet implemented", - call. = FALSE) + as_marginaliser(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + class = laplace_marginaliser) - derivs <- function(z) { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$hessians(y, z)[[1]] # this won't work! 
- list(d1, d2) - } - } - - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } - - # tensors for parameters of MVN distribution (make mu column vector now) - mu <- tf_transpose(mu) - - # dimension of the MVN distribution - n <- dim(mu)[[2]] - - # here z is a *column vector* to simplify later calculations, it needs to be - # transposed to a row vector before feeding into the likelihood function(s) - z <- tf$identity(mu) - - # Newton-Raphson parameters - tol <- tf$constant(tolerance, tf_float(), shape(1)) - obj_old <- tf$constant(Inf, tf_float(), shape(1)) - iter <- tf$constant(0L) - maxiter <- tf$constant(max_iterations) - - # other objects - a_value <- add_first_dim(as_2d_array(rep(0, n))) - a <- fl(a_value) - u_value <- add_first_dim(diag(n)) - u <- fl(u_value) - eye <- fl(add_first_dim(diag(n))) - - # match batches on everything going into the loop that will have a batch - # dimension later - objects <- list(mu, sigma, z, a, u, obj_old, tol) - objects <- greta:::match_batches(objects) - mu <- objects[[1]] - sigma <- objects[[2]] - z <- objects[[3]] - a <- objects[[4]] - u <- objects[[5]] - obj_old <- objects[[6]] - tol <- objects[[7]] - - obj <- -d0(z) - - # get the batch dim explicitly, for the step size optimisation - batch_dim <- tf$shape(mu)[[0]] - batch_dim <- tf$expand_dims(batch_dim, 0L) - - # tensorflow while loop to do Newton-Raphson iterations - body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { - - obj_old <- obj - - deriv <- derivs(z) - d1 <- deriv[[1]] - d2 <- deriv[[2]] - - # curvature of the likelihood - w <- -d2 - rw <- sqrt(w) - - # decentred values of z - cf <- z - mu - - # approximate posterior covariance & cholesky factor (using the matrix inverse - # lemma for numerical stability) - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - u <- tf$cholesky(mat1) - l <- tf_transpose(u) - - # compute Newton-Raphson update direction - b <- w * cf + d1 - mat2 <- rw * tf$matmul(sigma, b) - mat3 <- tf$matrix_triangular_solve(u, mat2) - adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a - - # use golden section search to find the optimum distance to step in this - # direction, for each batch simultaneously - psiline <- function(s) { - s <- tf$expand_dims(s, 1L) - s <- tf$expand_dims(s, 2L) - a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a_new) + mu - psi(a_new, z_new, mu) - } - - ls_results <- gss(psiline, batch_dim, upper = 2) - stepsize <- ls_results$minimum - stepsize <- tf$expand_dims(stepsize, 1L) - stepsize <- tf$expand_dims(stepsize, 2L) - - # do the update and compute new z and objective - a_new <- a + stepsize * adiff - z_new <- tf$matmul(sigma, a_new) + mu - obj <- psi(a_new, z_new, mu) - - list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) - - } - - cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { - not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) - in_time <- tf$less(iter, maxiter) - in_time & not_all_converged - } - - values <- list(z, - a, - u, - obj_old, - obj, - tol, - iter, - maxiter) - - # run the Newton-Raphson optimisation to find the posterior mode of z - out <- tf$while_loop(cond, body, values) - - a <- out[[2]] - - # apparently we need to redefine z etc. here, or the backprop errors - - # lots of duplicated code; this could be tidied up, but I ran out of time! 
- z <- tf$matmul(sigma, a) + mu - - # curvature of the likelihood at the mode - deriv <- derivs(z) - d2 <- deriv[[2]] - w <- -d2 - rw <- sqrt(w) - hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) - - # approximate posterior covariance - covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) - - # log-determinant of l - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - l <- tf$cholesky(mat1) - l_diag <- tf$matrix_diag_part(l) - logdet <- tf_sum(tf$log(l_diag)) - - # convergence information - iter <- out[[7]] - converged <- tf$less(iter, maxiter) - - # return a list of these things - list(z = z, - mu = mu, - a = a, - logdet = logdet, - covar = covar, - iterations = iter, - converged = converged) - - } - - # named list of operation greta arrays for the marginalisation parameters - compute_parameters <- function(conditional_density_fun, - distribution_node, - dots, - marginaliser) { - - # get greta arrays for parameters of distribution node - mean <- distribution_node$parameters$mean - sigma <- distribution_node$parameters$sigma - - # run the laplace approximation fitting and get a list of parameters in a - # mock greta array (just representing a tf list) - args <- c(operation = "marginalisation_parameters", - mu = mean, - sigma = sigma, - dots, - list( - operation_args = list( - tf_conditional_density_fun = conditional_density_fun, - diagonal_hessian = marginaliser$other_args$diagonal_hessian - ), - tf_operation = "tf_compute_laplace_parameters", - dim = 1 - )) - - parameter_list <- do.call(op, args) - - # extract the elements to operation greta arrays with the correct shapes - z <- op("z", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("z")) - - a <- op("a", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("a")) - - mu <- op("mu", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("mu")) - - logdet <- op("log determinant", - parameter_list, - tf_operation = "get_element", - operation_args = list("logdet")) - - covar <- op("covar", - parameter_list, - dim = dim(sigma), - tf_operation = "get_element", - operation_args = list("covar")) - - iterations <- op("iterations", - parameter_list, - tf_operation = "get_element", - operation_args = list("iterations")) - - converged <- op("converged", - parameter_list, - tf_operation = "get_element", - operation_args = list("converged")) - - # pull out the elements - list(z = z, - a = a, - mu = mu, - logdet = logdet, - covar = covar, - iterations = iterations, - converged = converged) - - } - - tf_marginalisation_density <- function(parameters, - tf_conditional_density_fun, - dots, - other_args) { - - diagonal_hessian <- other_args$diagonal_hessian - - # simpler interface to conditional density. 
If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } - - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } - - mu <- parameters$mu - logdet <- parameters$logdet - z <- parameters$z - a <- parameters$a - - # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) - - -nmcp - - } - - return_list_function <- function(parameters) { - - list(mean = t(parameters$z), - sigma = parameters$covar, - iterations = parameters$iterations, - converged = parameters$converged) - - } - - as_marginaliser(name = "laplace", - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - return_list = return_list_function, - distribution_check = multivariate_normal_check, - other_args = list(diagonal_hessian = diagonal_hessian)) - -} - -# check that the distribution is discrete -discrete_check <- function(distrib) { - if (!distrib$discrete) { - stop("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) - } -} - -# check that the distribution is multiariate normal -multivariate_normal_check <- function(distrib) { - if (distrib$distribution_name != "multivariate_normal") { - stop("the Laplace approximation can only be used ", - "with a multivariate normal distribution", - call. = FALSE) - } } # helper to contruct marginalisers -as_marginaliser <- function(name, - compute_parameters, - tf_marginalisation_density, - distribution_check, - return_list, - other_parameters = list(), - other_args = list()) { - - obj <- list(name = name, - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - distribution_check = distribution_check, - return_list = return_list, - other_args = other_args, - other_parameters = other_parameters) - - class_name <- paste0(name, "_marginaliser") +as_marginaliser <- function(..., class) { + + obj <- list(..., class = class) + class_name <- class$classname class(obj) <- c(class_name, "marginaliser") obj } -#' @noRd -#' @export print.marginaliser <- function(x, ...) 
{ - msg <- paste(x$name, "marginaliser object") + msg <- paste(x$class$classname, "object") cat(msg) } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 08d2f521..23230f58 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -60,15 +60,24 @@ test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") + y <- 1:3 + fun <- function(x) { + distribution(y) <- normal(x) + } + # greta array, not a numeric expect_error( - discrete_marginalisation(values = variable()), + marginalise(fun, + poisson(3), + discrete_marginalisation(values = variable())), "must be an R numeric vector, not a greta array" ) # not a numeric expect_error( - discrete_marginalisation(values = c("apple", "banana")), + marginalise(fun, + poisson(3), + discrete_marginalisation(values = c("apple", "banana"))), "must be an R numeric vector$" ) @@ -151,21 +160,30 @@ test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") + y <- 1:3 + fun <- function(x) { + distribution(y) <- normal(x) + } + # bad tolerance expect_error( - laplace_approximation(tolerance = -1), + marginalise(fun, + normal(0, 1, dim = 3), + laplace_approximation(tolerance = -1)), "must be a positive, scalar numeric value" ) # bad max iterations expect_error( - laplace_approximation(max_iterations = 0), + marginalise(fun, + normal(0, 1, dim = 3), + laplace_approximation(max_iterations = 0)), "must be a positive, scalar integer value" ) # mismatch with distribution expect_error( - marginalise(I, normal(0, 1), laplace_approximation()), + marginalise(I, beta(2, 2), laplace_approximation()), "can only be used with a multivariate normal distribution" ) @@ -239,19 +257,10 @@ test_that("laplace approximation has correct posterior for univariate normal", { theta_sd <- sqrt(theta_var) analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + # analyse as multivariate normal lik <- function(theta) { distribution(y) <- normal(t(theta), obs_sd) } - - # analyse as univariate normal - out <- marginalise(lik, - normal(mu, sd, dim = 8), - laplace_approximation(diagonal_hessian = TRUE)) - res <- do.call(calculate, out) - laplace_uni <- cbind(mean = res$mean, sd = res$sd) - compare_op(analytic, laplace_uni) - - # analyse as multivariate normal mean <- ones(1, 8) * mu sigma <- diag(8) * sd ^ 2 out <- marginalise(lik, @@ -273,12 +282,6 @@ test_that("laplace approximation has correct posterior for multivariate normal", # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta ~ MVN(mu, sigma) # the posterior for theta is multivariate normal, so laplace should be exact - # Bayes theorum gives: - # p(theta | y) \propto p(y|theta) p(theta) - # which with normal densities is: - # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * - # N(theta_i | mu, sd ^ 2) - # which is equivalent to: # p(theta | y) \propto MNN(theta_mu, theta_sigma) # theta_sigma = (sigma^-1 + diag(1 /obs_sd^2))^-1 # theta_mu = theta_sigma (sigma^-1 mu + diag(1 /obs_sd^2) mean(y))^-1 From 0780a318c8aa0b99f469cee46e986aeed49c7b05 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 13:44:09 +1100 Subject: [PATCH 37/38] pass the distribution to the marginaliser and check inputs early --- R/marginalise.R | 19 ++--- R/marginalisers.R | 105 ++++++++++++-------------- tests/testthat/test_marginalisation.R | 39 ++++------ 3 files changed, 71 insertions(+), 92 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 34b393b4..7ec07680 
diff --git a/R/marginalise.R b/R/marginalise.R
index 34b393b4..7ec07680 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -91,27 +91,22 @@ marginalise <- function(fun, variable, method, ...) {
          "See ?marginalise for options")
   }
 
-  # construct the marginaliser
-  method_args <- method[which(names(method) != "class")]
-  marginaliser <- do.call(method$class$new, method_args)
-
-  # check the distribution is compatible with the method
-  marginaliser$distribution_check(distribution_node)
-
   # excise the variable from the distribution
   distribution_node$remove_target()
 
+  # construct the marginaliser
+  marginaliser <- do.call(method$class$new, c(distribution_node, method$args))
+
   # turn the greta function into a TF conditional density function; doing
   # something very similar to as_tf_function(), but giving a function that
   # returns a tensorflow scalar for the density (unadjusted since there will be
   # no new variables)
-  conditional_joint_density <- as_conditional_density(fun,
-                                                      c(list(variable), dots))
+  args <- c(list(variable), dots)
+  conditional_joint_density <- as_conditional_density(fun, args)
 
   # get a list of greta arrays for the marginalisation parameters:
   parameters <- marginaliser$compute_parameters(
     conditional_density_fun = conditional_joint_density,
-    distribution_node = distribution_node,
     dots = dots
   )
 
@@ -124,7 +119,6 @@ marginalise <- function(fun, variable, method, ...) {
     marginaliser = marginaliser,
     parameters = parameters,
     conditional_density_fun = conditional_joint_density,
-    distribution_node = distribution_node,
     dots = dots
   )
 
@@ -147,7 +141,6 @@ marginalisation_distribution <- R6Class(
     initialize = function(marginaliser,
                           parameters = parameters,
                           conditional_density_fun,
-                          distribution_node,
                           dots) {
 
       # initialize class, and add methods
@@ -171,7 +164,7 @@ marginalisation_distribution <- R6Class(
 
       # make the distribution the target
       self$remove_target()
-      self$add_target(distribution_node)
+      self$add_target(marginaliser$distribution)
 
     },
 
diff --git a/R/marginalisers.R b/R/marginalisers.R
index d33979bf..87355875 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -7,12 +7,13 @@ marginaliser <- R6Class(
   public = list(
 
     name = "marginaliser",
+    distribution = NULL,
     other_parameters = list(),
     other_args = list(),
 
-    initialize = function(...) {
-      invisible(NULL)
+    initialize = function(distribution, ...) {
+      self$distribution <- distribution
     },
 
     return_list = function(parameters) {
@@ -27,20 +28,16 @@ discrete_marginaliser <- R6Class(
   inherit = marginaliser,
   public = list(
 
-    initialize = function(values) {
+    initialize = function(distribution, values) {
 
-      super$initialize()
-
-      if (!is.vector(values) | !is.numeric(values)) {
-        msg <- "'values' must be an R numeric vector"
-
-        if (inherits(values, "greta_array")) {
-          msg <- paste0(msg, ", not a greta array")
-        }
-
-        stop(msg)
+      if (!distribution$discrete) {
+        stop("this marginalisation method can only be used ",
+             "with discrete distributions",
+             call. = FALSE)
       }
 
+      super$initialize(distribution)
+
       # convert values to a list of data greta arrays
       values_list <- as.list(values)
       values_list <- lapply(values_list, as_data)
@@ -52,7 +49,6 @@ discrete_marginaliser <- R6Class(
     # return a named list of operation greta arrays for the marginalisation
     # parameters
     compute_parameters = function(conditional_density_fun,
-                                  distribution_node,
                                   dots) {
 
       # function to compute the parameter (log probabilities) of the marginalisation
@@ -80,7 +76,7 @@ discrete_marginaliser <- R6Class(
       # list of parameters in a mock greta array (just representing a tf list)
       args <- c(operation = "marginalisation_log_weights",
-                distribution_node,
+                self$distribution,
                 self$other_parameters,
                 tf_operation = "tf_compute_log_weights",
                 list(dim = c(length(self$other_parameters), 1L)))
@@ -122,15 +118,6 @@ discrete_marginaliser <- R6Class(
 
     return_list = function(parameters) {
       list(probabilities = exp(parameters$log_weights))
-    },
-
-    # check that the distribution is discrete
-    distribution_check = function(distrib) {
-      if (!distrib$discrete) {
-        stop("this marginalisation method can only be used ",
-             "with discrete distributions",
-             call. = FALSE)
-      }
     }
 
   )
@@ -146,11 +133,19 @@ laplace_marginaliser <- R6Class(
     diagonal_hessian = NULL,
     multivariate = FALSE,
 
-    initialize = function(tolerance,
+    initialize = function(distribution,
+                          tolerance,
                           max_iterations,
                           diagonal_hessian) {
 
-      super$initialize()
+      name <- distribution$distribution_name
+      if (!name %in% c("normal", "multivariate_normal")) {
+        stop("the Laplace approximation can only be used ",
+             "with a normal or multivariate normal distribution",
+             call. = FALSE)
+      }
+
+      super$initialize(distribution)
 
       # in future:
       # - enable warm starts for subsequent steps of the outer inference algorithm
@@ -182,20 +177,6 @@ laplace_marginaliser <- R6Class(
 
       # nolint end
 
-      if (!(is.numeric(tolerance) &&
-            is.vector(tolerance) &&
-            length(tolerance) == 1 &&
-            tolerance > 0)) {
-        stop("'tolerance' must be a positive, scalar numeric value")
-      }
-
-      max_iterations <- as.integer(max_iterations)
-      if (!(is.vector(max_iterations) &&
-            length(max_iterations) == 1 &&
-            max_iterations > 0)) {
-        stop("'max_iterations' must be a positive, scalar integer value")
-      }
-
       self$other_args <- list(tolerance = tolerance,
                               max_iterations = max_iterations,
                               diagonal_hessian = diagonal_hessian)
@@ -203,9 +184,7 @@ laplace_marginaliser <- R6Class(
     },
 
     # named list of operation greta arrays for the marginalisation parameters
-    compute_parameters = function(conditional_density_fun,
-                                  distribution_node,
-                                  dots) {
+    compute_parameters = function(conditional_density_fun, dots) {
 
       # function to compute the parameters of the marginalisation
       # define the marginalisation function
@@ -406,8 +385,8 @@ laplace_marginaliser <- R6Class(
       }
 
       # get greta arrays for parameters of distribution node
-      mean <- distribution_node$parameters$mean
-      sigma <- distribution_node$parameters$sigma
+      mean <- self$distribution$parameters$mean
+      sigma <- self$distribution$parameters$sigma
 
       # run the laplace approximation fitting and get a list of parameters in a
       # mock greta array (just representing a tf list)
@@ -518,16 +497,6 @@ laplace_marginaliser <- R6Class(
            iterations = parameters$iterations,
            converged = parameters$converged)
 
-    },
-
-    # check that the distribution is normal
-    distribution_check = function(distrib) {
-      name <- distrib$distribution_name
-      if (!name %in% c("multivariate_normal")) {
-        stop("the Laplace approximation can only be used ",
distribution", - call. = FALSE) - } } ) @@ -569,6 +538,16 @@ NULL #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop(msg) + } + as_marginaliser(values = values, class = discrete_marginaliser) @@ -602,6 +581,20 @@ laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50, diagonal_hessian = FALSE) { + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + as_marginaliser(tolerance = tolerance, max_iterations = max_iterations, diagonal_hessian = diagonal_hessian, @@ -612,7 +605,7 @@ laplace_approximation <- function(tolerance = 1e-6, # helper to contruct marginalisers as_marginaliser <- function(..., class) { - obj <- list(..., class = class) + obj <- list(class = class, args = list(...)) class_name <- class$classname class(obj) <- c(class_name, "marginaliser") obj diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 23230f58..7377c31f 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -60,24 +60,15 @@ test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") - y <- 1:3 - fun <- function(x) { - distribution(y) <- normal(x) - } - # greta array, not a numeric expect_error( - marginalise(fun, - poisson(3), - discrete_marginalisation(values = variable())), + discrete_marginalisation(values = variable()), "must be an R numeric vector, not a greta array" ) # not a numeric expect_error( - marginalise(fun, - poisson(3), - discrete_marginalisation(values = c("apple", "banana"))), + discrete_marginalisation(values = c("apple", "banana")), "must be an R numeric vector$" ) @@ -160,31 +151,22 @@ test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") - y <- 1:3 - fun <- function(x) { - distribution(y) <- normal(x) - } - # bad tolerance expect_error( - marginalise(fun, - normal(0, 1, dim = 3), - laplace_approximation(tolerance = -1)), + laplace_approximation(tolerance = -1), "must be a positive, scalar numeric value" ) # bad max iterations expect_error( - marginalise(fun, - normal(0, 1, dim = 3), - laplace_approximation(max_iterations = 0)), + laplace_approximation(max_iterations = 0), "must be a positive, scalar integer value" ) # mismatch with distribution expect_error( marginalise(I, beta(2, 2), laplace_approximation()), - "can only be used with a multivariate normal distribution" + "can only be used with a normal or multivariate normal distribution" ) }) @@ -257,6 +239,17 @@ test_that("laplace approximation has correct posterior for univariate normal", { theta_sd <- sqrt(theta_var) analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + # analyse as univariate normal + lik <- function(theta) { + distribution(y) <- normal(theta, obs_sd) + } + out <- marginalise(lik, + normal(mu, sd, dim = 8), + laplace_approximation(diagonal_hessian = TRUE)) + res <- do.call(calculate, out) + laplace_uni <- cbind(mean = res$mean, sd = 

From fe40342cc53c3f42c90be0d0a4cffaa5a86b94ab Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Fri, 28 Feb 2020 15:22:32 +1100
Subject: [PATCH 38/38] get univariate laplace running (with incorrect results)

---
 R/marginalisers.R | 412 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 324 insertions(+), 88 deletions(-)

diff --git a/R/marginalisers.R b/R/marginalisers.R
index 87355875..3e33c4dc 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -138,6 +138,9 @@ laplace_marginaliser <- R6Class(
                           max_iterations,
                           diagonal_hessian) {
 
+
+      # check the distribution and determine whether we are working with a
+      # multivariate or univariate normal
       name <- distribution$distribution_name
       if (!name %in% c("normal", "multivariate_normal")) {
         stop("the Laplace approximation can only be used ",
@@ -145,56 +148,36 @@ laplace_marginaliser <- R6Class(
              call. = FALSE)
       }
 
-      super$initialize(distribution)
-
-      # in future:
-      # - enable warm starts for subsequent steps of the outer inference algorithm
-      # - enable a non-factorising version (have the user say whether it is
-      #   factorising)
-      # - handle an iid normal distribution too.
-
-      # the following algorithm for a univariate normal distribution is derived
-      # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior
-      # it neeeds checking!
-
-      # nolint start
-
-      # with univariate normal and diagonal hessian, iteration is:
-      # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w)
-
-      # instead work with decentred variable a where z = a * sd ^ 2 + mu
-      # l = sqrt(1 + w * sd)
-      # b = w * (z - mu) * d1(z)
-      # a_diff = b − (w * sd * b) / l ^ 2 - a
-      # a_new = a + s * a_diff
-      # z_new = a_new * sd ^ 2 + mu
+      self$multivariate <- name == "multivariate_normal"
 
-      # estimated sd at mode is:
-      # sqrt(1 / (1 / sd + w))
-
-      # the approximated negative marginal density is:
-      # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd))
-
-      # nolint end
+      super$initialize(distribution)
 
       self$other_args <- list(tolerance = tolerance,
                               max_iterations = max_iterations,
-                              diagonal_hessian = diagonal_hessian)
+                              diagonal_hessian = diagonal_hessian,
+                              multivariate = self$multivariate)
 
     },
 
     # named list of operation greta arrays for the marginalisation parameters
     compute_parameters = function(conditional_density_fun, dots) {
+      if (self$multivariate) {
+        self$compute_parameters_multivariate(conditional_density_fun, dots)
+      } else {
+        self$compute_parameters_univariate(conditional_density_fun, dots)
+      }
+    },
+
+    # named list of operation greta arrays for the marginalisation parameters
+    compute_parameters_multivariate = function(conditional_density_fun, dots) {
 
       # function to compute the parameters of the marginalisation
       # define the marginalisation function
-      tf_compute_laplace_parameters = function(mu,
+      tf_compute_laplace_parameters = function(mean,
                                                sigma,
                                                ...,
                                                tf_conditional_density_fun,
-                                               diagonal_hessian,
-                                               tolerance,
-                                               max_iterations) {
+                                               other_args) {
 
         dots <- list(...)
 
        # get the vectors of first and second derivatives of the conditional density
        # function, w.r.t.
the variable being marginalised - if (diagonal_hessian) { + if (other_args$diagonal_hessian) { derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] @@ -230,26 +213,26 @@ laplace_marginaliser <- R6Class( } # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) + psi <- function(a, z, mean) { + p1 <- tf$matmul(tf_transpose(a), z - mean) fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } - # tensors for parameters of MVN distribution (make mu column vector now) - mu <- tf_transpose(mu) + # tensors for parameters of MVN distribution (make mean column vector now) + mean <- tf_transpose(mean) # dimension of the MVN distribution - n <- dim(mu)[[2]] + n <- dim(mean)[[2]] # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z <- tf$identity(mu) + z <- tf$identity(mean) # Newton-Raphson parameters - tol <- tf$constant(tolerance, tf_float(), shape(1)) + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) obj_old <- tf$constant(Inf, tf_float(), shape(1)) iter <- tf$constant(0L) - maxiter <- tf$constant(max_iterations) + maxiter <- tf$constant(other_args$max_iterations) # other objects a_value <- add_first_dim(as_2d_array(rep(0, n))) @@ -260,9 +243,9 @@ laplace_marginaliser <- R6Class( # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, sigma, z, a, u, obj_old, tol) + objects <- list(mean, sigma, z, a, u, obj_old, tol) objects <- greta:::match_batches(objects) - mu <- objects[[1]] + mean <- objects[[1]] sigma <- objects[[2]] z <- objects[[3]] a <- objects[[4]] @@ -273,7 +256,7 @@ laplace_marginaliser <- R6Class( obj <- -d0(z) # get the batch dim explicitly, for the step size optimisation - batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$shape(mean)[[0]] batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations @@ -290,7 +273,7 @@ laplace_marginaliser <- R6Class( rw <- sqrt(w) # decentred values of z - cf <- z - mu + cf <- z - mean # approximate posterior covariance & cholesky factor (using the matrix inverse # lemma for numerical stability) @@ -310,8 +293,8 @@ laplace_marginaliser <- R6Class( s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a_new) + mu - psi(a_new, z_new, mu) + z_new <- tf$matmul(sigma, a_new) + mean + psi(a_new, z_new, mean) } ls_results <- gss(psiline, batch_dim, upper = 2) @@ -321,8 +304,8 @@ laplace_marginaliser <- R6Class( # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- tf$matmul(sigma, a_new) + mu - obj <- psi(a_new, z_new, mu) + z_new <- tf$matmul(sigma, a_new) + mean + obj <- psi(a_new, z_new, mean) list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) @@ -351,7 +334,7 @@ laplace_marginaliser <- R6Class( # apparently we need to redefine z etc. here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! 
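To make the structure of that loop easier to follow, here is a plain-R sketch of one line-searched Newton update on the decentred variable a (where z = sigma a + mean), following the Rasmussen & Williams mode-finding scheme the comments cite (algorithm 3.1 of GPML, generalised to a non-zero mean). The functions d0, d1 and d2 are assumed stand-ins for the conditional log density and its first two (diagonal) derivatives, and stats::optimize() takes the place of the batched golden-section helper gss():

# one line-searched Newton update for the multivariate Laplace mode search
newton_step <- function(z, a, mean, sigma, d0, d1, d2) {

  # curvature of the likelihood, and the Newton direction computed via the
  # matrix inverse lemma, with B = I + rW Sigma rW as in the code above
  w <- -d2(z)
  rw <- sqrt(w)
  b <- w * (z - mean) + d1(z)
  B <- diag(length(z)) + outer(rw, rw) * sigma
  adiff <- b - rw * solve(B, rw * drop(sigma %*% b)) - a

  # negative log posterior of the decentred variable, as in psi() above
  psi <- function(a) {
    z <- drop(sigma %*% a) + mean
    0.5 * sum(a * (z - mean)) - d0(z)
  }

  # line search along adiff, as gss(psiline, upper = 2) does above
  s <- optimize(function(s) psi(a + s * adiff), c(0, 2))$minimum

  a_new <- a + s * adiff
  list(a = a_new, z = drop(sigma %*% a_new) + mean)
}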
-        z <- tf$matmul(sigma, a) + mu
+        z <- tf$matmul(sigma, a) + mean
 
         # curvature of the likelihood at the mode
         deriv <- derivs(z)
@@ -368,6 +351,7 @@ laplace_marginaliser <- R6Class(
         l <- tf$cholesky(mat1)
         l_diag <- tf$matrix_diag_part(l)
         logdet <- tf_sum(tf$log(l_diag))
+        logdet <- tf$squeeze(logdet, 1L)
 
         # convergence information
         iter <- out[[7]]
@@ -375,8 +359,9 @@ laplace_marginaliser <- R6Class(
 
         # return a list of these things
         list(z = z,
-             mu = mu,
+             mean = mean,
              a = a,
+             sigma = sigma,
              logdet = logdet,
              covar = covar,
              iterations = iter,
@@ -391,13 +376,13 @@ laplace_marginaliser <- R6Class(
 
       # run the laplace approximation fitting and get a list of parameters in a
       # mock greta array (just representing a tf list)
       args <- c(operation = "marginalisation_parameters",
-                mu = mean,
+                mean = mean,
                 sigma = sigma,
                 dots,
                 list(
-                  operation_args = c(
+                  operation_args = list(
                     tf_conditional_density_fun = conditional_density_fun,
-                    self$other_args
+                    other_args = self$other_args
                   ),
                   tf_operation = "tf_compute_laplace_parameters",
                   dim = 1
                 ))
@@ -418,12 +403,6 @@ laplace_marginaliser <- R6Class(
               tf_operation = "get_element",
               operation_args = list("a"))
 
-      mu <- op("mu",
-               parameter_list,
-               dim = dim(mean),
-               tf_operation = "get_element",
-               operation_args = list("mu"))
-
       logdet <- op("log determinant",
                    parameter_list,
                    tf_operation = "get_element",
                    operation_args = list("logdet"))
@@ -448,7 +427,7 @@ laplace_marginaliser <- R6Class(
       # pull out the elements
       list(z = z,
            a = a,
-           mu = mu,
+           mean = mean,
            logdet = logdet,
            covar = covar,
            iterations = iterations,
            converged = converged)
 
     },
 
+    # named list of operation greta arrays for the marginalisation parameters
+    compute_parameters_univariate = function(conditional_density_fun, dots) {
+
+      # function to compute the parameters of the marginalisation
+      tf_compute_laplace_parameters = function(mean,
+                                               sd,
+                                               ...,
+                                               tf_conditional_density_fun,
+                                               other_args) {
+
+
+        # the following algorithm for a univariate normal distribution is derived
+        # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior.
+        # It needs checking!
+
+        # nolint start
+
+        # with univariate normal and diagonal hessian, iteration is:
+        # z_diff = mean + (w * (z - mean) + d1(z)) / (1 / sd + w) - z
+        # z_new = z + s * z_diff
+
+        # # instead work with decentred variable a where z = a * sd ^ 2 + mean
+        # # l = sqrt(1 + w * sd)
+        # # b = w * (z - mean) * d1(z)
+        # # a_diff = b − (w * sd * b) / l ^ 2 - a
+        # # a_new = a + s * a_diff
+        # # z_new = a_new * sd ^ 2 + mean
+
+        # estimated sd at mode is:
+        # sqrt(1 / (1 / sd + w))
+
+        # the approximated negative marginal density is:
+        # nmcp = 0.5 * sum((z - mean) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd))
+
+        # nolint end
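A plain-R rendering of the iteration in that comment block may help. This is a minimal sketch that drops the line search and batching, assumes d0, d1 and d2 are the conditional log density and its derivatives, and uses sd exactly as the comments do (the author flags the algebra as unchecked, and the commit message notes the results are not yet correct):

# Newton-Raphson mode search for the univariate Laplace approximation
laplace_mode_univariate <- function(d0, d1, d2, mean, sd,
                                    tolerance = 1e-6, max_iterations = 50) {
  z <- mean
  obj_old <- Inf
  for (iter in seq_len(max_iterations)) {
    w <- -d2(z)
    # the commented iteration, with step size fixed at 1
    z <- mean + (w * (z - mean) + d1(z)) / (1 / sd + w)
    obj <- 0.5 * (z - mean) ^ 2 / sd - d0(z)
    if (abs(obj_old - obj) < tolerance) break
    obj_old <- obj
  }
  list(mode = z,
       sd_at_mode = sqrt(1 / (1 / sd + w)),
       # approximated negative marginal density, as in the comments
       nmcp = obj + 0.5 * log1p(w * sd),
       iterations = iter)
}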
+        dots <- list(...)
+
+        # simpler interface to conditional density. If reduce = TRUE, it does
+        # reduce_sum on component densities
+        d0 <- function(z, reduce = TRUE) {
+          # transpose z to a row vector, which the dag is expecting
+          t_z <- tf_transpose(z)
+          args <- c(list(t_z), dots, list(reduce = reduce))
+          do.call(tf_conditional_density_fun, args)
+        }
+
+        # get the vectors of first and second derivatives of the conditional density
+        # function, w.r.t. the variable being marginalised
+        if (other_args$diagonal_hessian) {
+          derivs <- function(z) {
+            y <- d0(z, reduce = FALSE)
+            d1 <- tf$gradients(y, z)[[1]]
+            d2 <- tf$gradients(d1, z)[[1]]
+            list(d1, d2)
+          }
+        } else {
+
+          stop("inference with non-diagonal hessians is not yet implemented",
+               call. = FALSE)
+
+          derivs <- function(z) {
+            y <- d0(z, reduce = FALSE)
+            d1 <- tf$gradients(y, z)[[1]]
+            d2 <- tf$hessians(y, z)[[1]]  # this won't work!
+            list(d1, d2)
+          }
+        }
+
+        # negative log-posterior for the current value of z under the normal
+        # assumption
+        psi <- function(z, mean, sd) {
+          p1 <- tf_sum((z - mean) ^ 2 / sd)
+          fl(0.5) * tf$squeeze(p1, 1:2) - d0(z)
+        }
+
+        # here z is a *column vector* to simplify later calculations, it needs to be
+        # transposed to a row vector before feeding into the likelihood function(s)
+        z <- tf$identity(mean)
+
+        # Newton-Raphson parameters
+        tol <- tf$constant(other_args$tolerance, tf_float(), shape(1))
+        obj_old <- tf$constant(Inf, tf_float(), shape(1))
+        iter <- tf$constant(0L)
+        maxiter <- tf$constant(other_args$max_iterations)
+
+        # match batches on everything going into the loop that will have a batch
+        # dimension later
+        objects <- list(mean, sd, z, obj_old, tol)
+        objects <- greta:::match_batches(objects)
+        mean <- objects[[1]]
+        sd <- objects[[2]]
+        z <- objects[[3]]
+        obj_old <- objects[[4]]
+        tol <- objects[[5]]
+
+        obj <- -d0(z)
+
+        # get the batch dim explicitly, for the step size optimisation
+        batch_dim <- tf$shape(mean)[[0]]
+        batch_dim <- tf$expand_dims(batch_dim, 0L)
+
+        # tensorflow while loop to do Newton-Raphson iterations
+        body <- function(z, obj_old, obj, tol, iter, maxiter) {
+
+          obj_old <- obj
+
+          deriv <- derivs(z)
+          d1 <- deriv[[1]]
+          d2 <- deriv[[2]]
+
+          # curvature of the likelihood
+          w <- -d2
+
+          # with univariate normal and diagonal hessian, iteration is:
+          z_diff <- mean + (w * (z - mean) + d1) / (fl(1) / sd + w) - z
+
+          # use golden section search to find the optimum distance to step in this
+          # direction, for each batch simultaneously
+          psiline <- function(s) {
+            s <- tf$expand_dims(s, 1L)
+            s <- tf$expand_dims(s, 2L)
+            z_new <- z + s * z_diff
+            psi(z_new, mean, sd)
+          }
+
+          ls_results <- gss(psiline, batch_dim, upper = 2)
+          stepsize <- ls_results$minimum
+          stepsize <- tf$expand_dims(stepsize, 1L)
+          stepsize <- tf$expand_dims(stepsize, 2L)
+
+          # do the update and compute new z and objective
+          z_new <- z + stepsize * z_diff
+          obj <- psi(z_new, mean, sd)
+
+          list(z_new, obj_old, obj, tol, iter + 1L, maxiter)
+
+        }
+
+        cond <- function(z, obj_old, obj, tol, iter, maxiter) {
+          not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj))
+          in_time <- tf$less(iter, maxiter)
+          in_time & not_all_converged
+        }
+
+        values <- list(z,
+                       obj_old,
+                       obj,
+                       tol,
+                       iter,
+                       maxiter)
+
+        # run the Newton-Raphson optimisation to find the posterior mode of z
+        out <- tf$while_loop(cond, body, values)
+
+        # we might need to redefine z etc. if the backprop errors
+        z <- out[[1]]
+
+        # curvature of the likelihood at the mode
+        deriv <- derivs(z)
+        d2 <- deriv[[2]]
+        w <- -d2
+
+        # approximate posterior variance
+        var <- fl(1) / (fl(1) / sd + w)
+
+        # log-determinant of l
+        logdet <- fl(0.5) * tf_sum(tf$math$log1p(w * sd))
+
+        # convergence information
+        iter <- out[[5]]
+        converged <- tf$less(iter, maxiter)
+
+        # return a list of these things
+        list(z = z,
+             var = var,
+             logdet = logdet,
+             iterations = iter,
+             converged = converged)
+
+      }
+
+      # get greta arrays for parameters of distribution node
+      mean <- self$distribution$parameters$mean
+      sd <- self$distribution$parameters$sd
+
+      # run the laplace approximation fitting and get a list of parameters in a
+      # mock greta array (just representing a tf list)
+      args <- c(operation = "marginalisation_parameters",
+                mean = mean,
+                sd = sd,
+                dots,
+                list(
+                  operation_args = list(
+                    tf_conditional_density_fun = conditional_density_fun,
+                    other_args = self$other_args
+                  ),
+                  tf_operation = "tf_compute_laplace_parameters",
+                  dim = 1
+                ))
+
+      parameter_list <- do.call(op, args)
+
+      # extract the elements to operation greta arrays with the correct shapes
+      z <- op("z",
+              parameter_list,
+              dim = self$distribution$dim,
+              tf_operation = "get_element",
+              operation_args = list("z"))
+
+      var <- op("var",
+                parameter_list,
+                dim = self$distribution$dim,
+                tf_operation = "get_element",
+                operation_args = list("var"))
+
+      logdet <- op("log determinant",
+                   parameter_list,
+                   tf_operation = "get_element",
+                   operation_args = list("logdet"))
+
+      iterations <- op("iterations",
+                       parameter_list,
+                       tf_operation = "get_element",
+                       operation_args = list("iterations"))
+
+      converged <- op("converged",
+                      parameter_list,
+                      tf_operation = "get_element",
+                      operation_args = list("converged"))
+
+      # pull out the elements
+      list(z = z,
+           var = var,
+           mean = mean,
+           sd = sd,
+           logdet = logdet,
+           iterations = iterations,
+           converged = converged)
+
+    },
 
     tf_marginalisation_density = function(parameters,
                                           tf_conditional_density_fun,
                                           dots,
                                           other_args) {
 
-      diagonal_hessian <- other_args$diagonal_hessian
+      z <- parameters$z
+      mean <- parameters$mean
+      logdet <- parameters$logdet
 
-      # simpler interface to conditional density.
If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } + # evaluate conditional density at posterior mode of z + args <- c(list(tf_transpose(z)), + dots, + list(reduce = TRUE)) + d0 <- do.call(tf_conditional_density_fun, args) - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } + # the approximate marginal conditional posterior + if (other_args$multivariate) { - mu <- parameters$mu - logdet <- parameters$logdet - z <- parameters$z - a <- parameters$a + a <- parameters$a + mean <- tf_transpose(mean) + p1 <- tf$matmul(tf_transpose(a), z - mean) + p1 <- tf$squeeze(p1, 1:2) - # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + } else { + + sd <- parameters$sd + p1 <- tf_sum((z - mean) ^ 2 / sd) + } + + nmcp <- fl(0.5) * p1 - d0 + logdet -nmcp }, return_list = function(parameters) { - list(mean = t(parameters$z), - sigma = parameters$covar, - iterations = parameters$iterations, - converged = parameters$converged) + if (self$multivariate) { + + result <- list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + } else { + + result <- list(mean = parameters$z, + sd = sqrt(parameters$var), + iterations = parameters$iterations, + converged = parameters$converged) + + } + + result } @@ -612,6 +846,8 @@ as_marginaliser <- function(..., class) { } +#' @export +#' @noRd print.marginaliser <- function(x, ...) { msg <- paste(x$class$classname, "object") cat(msg)
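 }

With this final patch the univariate path can be exercised end to end. A usage sketch mirroring the univariate test added above (the fixtures here are illustrative, and per the commit message the numbers it produces are not yet correct):

y <- rnorm(8)
obs_sd <- 1

lik <- function(theta) {
  distribution(y) <- normal(theta, obs_sd)
}

out <- marginalise(lik,
                   normal(0, 2, dim = 8),
                   laplace_approximation(diagonal_hessian = TRUE))

# out holds greta arrays for the approximation's mean and sd, plus
# iteration and convergence information (see return_list() above);
# calculate() evaluates them
res <- do.call(calculate, out)
cbind(mean = res$mean, sd = res$sd)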