From 312347fba94648a056847e583e19fe9cda999970 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Mon, 7 Jan 2019 12:08:25 +1100
Subject: [PATCH 01/38] flesh out marginalisation interface

---
 DESCRIPTION            |   3 +-
 NAMESPACE              |   3 +
 R/marginalise.R        | 151 +++++++++++++++++++++++++++++++++++++++++
 man/marginalisation.Rd |  83 ++++++++++++++++++++++
 4 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 R/marginalise.R
 create mode 100644 man/marginalisation.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 6e05812f..64ebfb03 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -61,6 +61,7 @@ Collate:
    'internals.R'
    'calculate.R'
    'callbacks.R'
+    'marginalise.R'
 Imports:
    R6,
    tensorflow,
@@ -86,4 +87,4 @@ Suggests:
    MASS,
    abind
 VignetteBuilder: knitr
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
index 07c1b749..24e7e5c3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -85,6 +85,7 @@ S3method(plot,greta_model)
S3method(print,greta_array)
S3method(print,greta_model)
S3method(print,initials)
+S3method(print,marginaliser)
S3method(print,optimiser)
S3method(print,sampler)
S3method(print,unknowns)
@@ -141,6 +142,7 @@ export(cov2cor)
export(diag)
export(dirichlet)
export(dirichlet_multinomial)
+export(discrete_marginalisation)
export(distribution)
export(eigen)
export(exponential)
@@ -169,6 +171,7 @@ export(lkj_correlation)
export(log1pe)
export(logistic)
export(lognormal)
+export(marginalise)
export(mcmc)
export(mixture)
export(model)
diff --git a/R/marginalise.R b/R/marginalise.R
new file mode 100644
index 00000000..42f3af61
--- /dev/null
+++ b/R/marginalise.R
@@ -0,0 +1,151 @@
+# marginalising random variables

#' @name marginalisation
#'
#' @title direct marginalisation of random variables
#' @description Inference on many statistical models requires marginalisation of
#' (i.e. integration of functions over) random variables. In addition to
#' general purpose marginalisation methods like MCMC that act on an entire
#' model, it is often useful to directly marginalise a single random variable
#' within a larger model; e.g. for random variables where marginalisation by
#' MCMC is inefficient or impossible, or when an approximate marginalisation
#' is substantially more efficient. \code{marginalise()} performs direct
#' marginalisation within a greta model, using one of the available marginalisation methods.
#'
#' @return \code{discrete_marginalisation} - a \code{marginaliser} object that
#' can be passed to \code{marginalise}.
#'
#' @examples
#' \dontrun{
#' # marginalise a discrete random variable and carry out HMC on the
#' # continuous variables:
#' y <- rnorm(100)
#'
#' lambda <- lognormal(0, 1)
#' theta <- normal(0, 1)
#' sd <- lognormal(0, 1)
#'
#' # if discrete variables could be sampled by MCMC in greta,
#' # we would be able to simply do:
#' # n <- poisson(lambda)
#' # mu <- theta ^ n
#' # distribution(y) <- normal(mu, sd)
#'
#' # the function to marginalise over
#' # (only the variable to marginalise is passed in)
#' fun <- function (n) {
#'   mu <- theta ^ n
#'   distribution(y) <- normal(mu, sd)
#' }
#'
#' # integrate the function values w.r.t. the poisson distribution
#' marginalise(fun,
#'             poisson(lambda),
#'             method = discrete_marginalisation(values = 0:10))
#'
#' m <- model(lambda)
#' draws <- mcmc(m, hmc())
#' }
NULL
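As a plain-R companion to the example above, this is the quantity the call is asking for: the marginal likelihood of y, summing the conditional likelihood over a truncated poisson support, weighted by the pmf. This is an editor's sketch with made-up fixed parameter values, not code from the patch:

y <- rnorm(100)
lambda <- 1.3; theta <- 0.8; sd <- 1.1  # hypothetical fixed values
values <- 0:10

# pmf weights, renormalised over the truncated support
log_w <- dpois(values, lambda, log = TRUE)
log_w <- log_w - log(sum(exp(log_w)))

# conditional log likelihood of y at each candidate value of n
log_lik <- vapply(values,
                  function(n) sum(dnorm(y, theta ^ n, sd, log = TRUE)),
                  numeric(1))

# log marginal likelihood, combined stably via log-sum-exp
lse <- function(x) max(x) + log(sum(exp(x - max(x))))
lse(log_w + log_lik)

#' @rdname marginalisation
#' @export
#'
#' @param fun an R function to integrate with respect to the random variable.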
+#' Must take a single argument - a greta array giving the value of the random +#' variable +#' @param variable a variable greta array with a distribution, representing the +#' random variable to marginalise +#' @param method a \code{marginaliser} object giving the method for carrying out +#' the marginalisation +marginalise <- function(fun, variable, method) { + + distrib <- get_node(variable)$distribution + + # check the inputs + if (!is.function(fun)) { + stop ("'fun' must be an R function") + } + + if (!inherits(distrib, "distribution_node")) { + stop ("'variable' must be a variable greta array with a distribution") + } + + # handle case when the method is passed like: `laplace_approximation`, rather + # than `laplace_approximation()` + if (is.function(method)) { + method <- method() + } + + if (!inherits(method, "marginalisation_method")) { + stop ("'method' must be a valid marginalisation method. ", + "See ?marginalise for options") + } + + # check the distribution is compatible with the method + method$distribution_check(distrib) + + stop ("not yet implemented") + +} + +#' @rdname marginalisation +#' @export +#' +#' @param fun an R vector giving values at which to evaluate the function for a +#' discrete marginalisation +#' +#' @details \code{discrete_marginalisation} can only be used with discrete +#' probability distributions, e.g. those defined with \code{poisson()} and +#' \code{binomial()}. For discrete distributions with finite support (such as +#' \code{bernoulli()}) the marginalisation will be exact, so long as +#' \code{values} includes all possible values of the variable. For discrete +#' distributions with non-finite support (such as \code{poisson()}, which has +#' no upper bound), the marginalisation can only ever be approximate. However +#' if \code{values} cover a range of values with sufficiently high support in +#' the distribution, that approximation error will be minimal. +discrete_marginalisation <- function(values) { + + if (inherits(values, "greta_array")) { + stop ("'values' must be an R numeric vector, not a greta array") + } + + # define the marginalisation function + marginalisation_function <- function() { + stop ("not yet implemented") + } + + as_marginaliser(name = "discrete", + method = marginalisation_function, + parameters = list(values = values), + distribution_check = discrete_check) + +} + +# check that the distribution is discrete +discrete_check <- function(node) { + if (!distrib$discrete) { + stop ("this marginalisation method can only be used with discrete distributions") + } +} + +# helper to contruct marginalisers +as_marginaliser <- function (name, method, parameters, distribution_check) { + + obj <- list(name = name, + method = method, + parameters = parameters, + distribution_check = distribution_check) + + class_name <- paste0(name, "_marginaliser") + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @noRd +#' @export +print.marginaliser <- function(x, ...) 
{
  msg <- paste(x$name, "marginaliser object")
  cat(msg)
}
diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd
new file mode 100644
index 00000000..5909725b
--- /dev/null
+++ b/man/marginalisation.Rd
@@ -0,0 +1,83 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/marginalise.R
\name{marginalisation}
\alias{marginalisation}
\alias{marginalise}
\alias{discrete_marginalisation}
\title{direct marginalisation of random variables}
\usage{
marginalise(fun, variable, method)

discrete_marginalisation(values)
}
\arguments{
\item{fun}{an R function to integrate with respect to the random variable.
Must take a single argument - a greta array giving the value of the random
variable}

\item{variable}{a variable greta array with a distribution, representing the
random variable to marginalise}

\item{method}{a \code{marginaliser} object giving the method for carrying out
the marginalisation}

\item{fun}{an R vector giving values at which to evaluate the function for a
discrete marginalisation}
}
\value{
\code{discrete_marginalisation} - a \code{marginaliser} object that
 can be passed to \code{marginalise}.
}
\description{
Inference on many statistical models requires marginalisation of
 (i.e. integration of functions over) random variables. In addition to
 general purpose marginalisation methods like MCMC that act on an entire
 model, it is often useful to directly marginalise a single random variable
 within a larger model; e.g. for random variables where marginalisation by
 MCMC is inefficient or impossible, or when an approximate marginalisation
 is substantially more efficient. \code{marginalise()} performs direct
 marginalisation within a greta model, using one of the available marginalisation methods.
}
\details{
\code{discrete_marginalisation} can only be used with discrete
 probability distributions, e.g. those defined with \code{poisson()} and
 \code{binomial()}. For discrete distributions with finite support (such as
 \code{bernoulli()}) the marginalisation will be exact, so long as
 \code{values} includes all possible values of the variable. For discrete
 distributions with non-finite support (such as \code{poisson()}, which has
 no upper bound), the marginalisation can only ever be approximate. However
 if \code{values} cover a range of values with sufficiently high support in
 the distribution, that approximation error will be minimal.
}
\examples{
\dontrun{
# marginalise a discrete random variable and carry out HMC on the
# continuous variables:
y <- rnorm(100)

lambda <- lognormal(0, 1)
theta <- normal(0, 1)
sd <- lognormal(0, 1)

# if discrete variables could be sampled by MCMC in greta,
# we would be able to simply do:
# n <- poisson(lambda)
# mu <- theta ^ n
# distribution(y) <- normal(mu, sd)

# the function to marginalise over
# (only the variable to marginalise is passed in)
fun <- function (n) {
  mu <- theta ^ n
  distribution(y) <- normal(mu, sd)
}
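# editor's note, not generated output: the truncation 0:10 is only safe if
# the poisson mass above 10 is negligible for plausible values of lambda;
# e.g. ppois(10, lambda = 5) is ~0.986, so widen 'values' for larger rates
ppois(10, lambda = 5)

# integrate the function values w.r.t. 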
the poisson distribution +marginalise(fun, + poisson(lambda), + method = discrete_marginalisation(values = 0:10)) + +m <- model(lambda) +draws <- mcmc(m, hmc()) +} +} From 1e5e9077c65a3a1df9d13501b2a83a801d38a459 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 14:40:31 +1100 Subject: [PATCH 02/38] add tests for marginalisation error messages --- R/marginalise.R | 24 +++++------ tests/testthat/test_marginalisation.R | 57 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 11 deletions(-) create mode 100644 tests/testthat/test_marginalisation.R diff --git a/R/marginalise.R b/R/marginalise.R index 42f3af61..38637461 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -71,13 +71,7 @@ marginalise <- function(fun, variable, method) { stop ("'variable' must be a variable greta array with a distribution") } - # handle case when the method is passed like: `laplace_approximation`, rather - # than `laplace_approximation()` - if (is.function(method)) { - method <- method() - } - - if (!inherits(method, "marginalisation_method")) { + if (!inherits(method, "marginaliser")) { stop ("'method' must be a valid marginalisation method. ", "See ?marginalise for options") } @@ -106,8 +100,14 @@ marginalise <- function(fun, variable, method) { #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { - if (inherits(values, "greta_array")) { - stop ("'values' must be an R numeric vector, not a greta array") + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop (msg) } # define the marginalisation function @@ -123,9 +123,11 @@ discrete_marginalisation <- function(values) { } # check that the distribution is discrete -discrete_check <- function(node) { +discrete_check <- function(distrib) { if (!distrib$discrete) { - stop ("this marginalisation method can only be used with discrete distributions") + stop ("this marginalisation method can only be used ", + "with discrete distributions", + call. 
= FALSE) } } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R new file mode 100644 index 00000000..1558dbb3 --- /dev/null +++ b/tests/testthat/test_marginalisation.R @@ -0,0 +1,57 @@ +context("marginalisation") + +test_that("marginalise errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # not a function + expect_error( + marginalise("apple", normal(0, 1), discrete_marginalisation()), + "must be an R function" + ) + + # not a greta array + expect_error( + marginalise(I, 1:5, discrete_marginalisation()), + "must be a variable greta array with a distribution" + ) + + # greta array but no distribution + expect_error( + marginalise(I, variable(), discrete_marginalisation()), + "must be a variable greta array with a distribution" + ) + + # not a marginaliser + expect_error( + marginalise(I, normal(0, 1), mean), + "'method' must be a valid marginalisation method" + ) + +}) + +test_that("discrete_marginalisation errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # greta array, not a numeric + expect_error( + discrete_marginalisation(values = variable()), + "must be an R numeric vector, not a greta array" + ) + + # not a numeric + expect_error( + discrete_marginalisation(values = c("apple", "banana")), + "must be an R numeric vector$" + ) + + # mismatch with distribution + expect_error( + marginalise(I, normal(0, 1), discrete_marginalisation(values = 1:5)), + "can only be used with discrete distributions" + ) + +}) From 3b56a6cddf54cc0a6001d7380c5fc67768c5a86e Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 15:07:17 +1100 Subject: [PATCH 03/38] fix typo in marginalisation docs --- R/marginalise.R | 2 +- man/marginalisation.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 38637461..9c8bb37e 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -86,7 +86,7 @@ marginalise <- function(fun, variable, method) { #' @rdname marginalisation #' @export #' -#' @param fun an R vector giving values at which to evaluate the function for a +#' @param values an R vector giving values at which to evaluate the function for a #' discrete marginalisation #' #' @details \code{discrete_marginalisation} can only be used with discrete diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index 5909725b..fb05930f 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -21,7 +21,7 @@ random variable to marginalise} \item{method}{a \code{marginaliser} object giving the method for carrying out the marginalisation} -\item{fun}{an R vector giving values at which to evaluate the function for a +\item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} } \value{ From 0c766e6e0dc4a3947dcbba62f51edc891fcffcfe Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 7 Jan 2019 16:52:42 +1100 Subject: [PATCH 04/38] flesh out internals of marginalisation --- R/marginalise.R | 128 ++++++++++++++++++++++++++++++++++++++--- man/marginalisation.Rd | 19 +++--- 2 files changed, 131 insertions(+), 16 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 9c8bb37e..d018348c 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -31,9 +31,9 @@ #' # mu <- theta ^ n #' # distribution(y) <- normal(mu, sd) #' -#' # the function to marginalise over -#' # (only the variable to marginalise is passed in) -#' fun <- function (n) { +#' # the function to marginalise over - the variable to marginalise 
+#' # must be first, others must be passed to marginalise() +#' fun <- function (n, theta, mu, sd) { #' mu <- theta ^ n #' distribution(y) <- normal(mu, sd) #' } @@ -41,7 +41,10 @@ #' # integrate the function values w.r.t. the poisson distribution #' marginalise(fun, #' poisson(lambda), -#' method = discrete_marginalisation(values = 0:10)) +#' method = discrete_marginalisation(values = 0:10), +#' theta = theta, +#' mu = mu, +#' sd = sd) #' #' m <- model(lambda) #' draws <- mcmc(m, hmc()) @@ -52,14 +55,16 @@ NULL #' @export #' #' @param fun an R function to integrate with respect to the random variable. -#' Must take a single argument - a greta array giving the value of the random -#' variable +#' The first argument must be - a greta array giving the value of the random +#' variable, any subsequent arguments must passed in via the \dots argument. #' @param variable a variable greta array with a distribution, representing the #' random variable to marginalise #' @param method a \code{marginaliser} object giving the method for carrying out #' the marginalisation -marginalise <- function(fun, variable, method) { +#' @param \dots named greta arrays to be passed to \code{fun} +marginalise <- function(fun, variable, method, ...) { + # get the distribution for the random variable distrib <- get_node(variable)$distribution # check the inputs @@ -67,6 +72,14 @@ marginalise <- function(fun, variable, method) { stop ("'fun' must be an R function") } + # check the additional arguments to fun are given in dots + dots <- list(...) + dot_names <- names(dots) + expected_names <- names(formals(fun)[-1]) + if (!all(expected_names %in% dot_names)) { + stop ("all arguments to 'fun' must be passed, named, to marginalise") + } + if (!inherits(distrib, "distribution_node")) { stop ("'variable' must be a variable greta array with a distribution") } @@ -79,10 +92,98 @@ marginalise <- function(fun, variable, method) { # check the distribution is compatible with the method method$distribution_check(distrib) + # excise the variable from the distribution + distrib$remove_target() + stop ("not yet implemented") + # turn the greta function into a TF conditional density function; doing + # something very similar to as_tf_function(), but giving a function that + # returns a tensorflow scalar for the density + conditional_joint_density <- as_conditional_density(fun) + + # get a tf function from the method which will turn that conditional density + # function into a marginal density (conditional on the inputs to the + # distribution) to be added to the overall model density + method$tf_marginaliser + + # pass the conditional density function and the marginaliser TF function to a + # marginalisation distribution (needs an R6 class) + + # create the distribution + vble <- distrib( + "marginalisation", + marginaliser = method, + conditional_density_fun = conditional_joint_density, + distribution = distrib, + dots = dots + ) + + # excise the variable from the marginalisation distribution + distrib <- get_node(variable)$distribution + distrib$remove_target() + + # return nothing + invisible(NULL) + } +marginalisation_distribution <- R6Class( + "marginalisation_distribution", + inherit = distribution_node, + public = list( + + tf_marginaliser = NULL, + conditional_density_fun = NULL, + + initialize = function(marginaliser, + conditional_density_fun, + distribution, + dots) { + + # initialize class, and add methods + super$initialize("marginalisation") + self$tf_marginaliser <- marginaliser$tf_marginaliser + 
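      # store the traced conditional density function (built in marginalise()
      # via as_conditional_density()) so that tf_distrib() can evaluate it
      # against the marginalised distribution when the model graph is defined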
self$conditional_density_fun <- conditional_density_fun + + # add the dots (extra inputs to conditional_density_fun) as parameters + dot_nodes <- lapply(dots, get_node) + for (i in seq_len(dot_nodes)) { + self$add_parameter(dot_nodes[[i]], + paste("input", i)) + } + + # add the distribution as a parameter + self$add_parameter(distribution, "distribution") + + }, + + tf_distrib = function(parameters, dag) { + + # do the marginalisation + + # but it has no target!! + # should we handle distributions with no targets as a special case? + # or should this not be a distribution after all? + + log_prob <- function(x) { + + self$tf_marginaliser(self$conditional_density_fun, + parameters$distribution, + parameters[names(parameters) != "distribution"]) + + } + + list(log_prob = log_prob, cdf = NULL, log_cdf = NULL) + + }, + + tf_cdf_function = NULL, + tf_log_cdf_function = NULL + + ) +) + #' @rdname marginalisation #' @export #' @@ -111,12 +212,21 @@ discrete_marginalisation <- function(values) { } # define the marginalisation function - marginalisation_function <- function() { + tf_marginaliser <- function(conditional_density_fun, + distribution, + other_args) { + # base this on mixture; + # 1. get weights from the distribution (use its log_pdf tf method on the + # values) + # 2. compute the conditional joint density for each value (passing in + # other_args) + # 3. compute a weighted sum with tf_reduce_log_sum_exp() + stop ("not yet implemented") } as_marginaliser(name = "discrete", - method = marginalisation_function, + tf_marginaliser = tf_marginaliser, parameters = list(values = values), distribution_check = discrete_check) diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index fb05930f..f5968662 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -6,14 +6,14 @@ \alias{discrete_marginalisation} \title{direct marginalisation of random variables} \usage{ -marginalise(fun, variable, method) +marginalise(fun, variable, method, ...) discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. -Must take a single argument - a greta array giving the value of the random -variable} +The first argument must be - a greta array giving the value of the random +variable, any subsequent arguments must passed in via the \dots argument.} \item{variable}{a variable greta array with a distribution, representing the random variable to marginalise} @@ -21,6 +21,8 @@ random variable to marginalise} \item{method}{a \code{marginaliser} object giving the method for carrying out the marginalisation} +\item{\dots}{named greta arrays to be passed to \code{fun}} + \item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} } @@ -65,9 +67,9 @@ sd <- lognormal(0, 1) # mu <- theta ^ n # distribution(y) <- normal(mu, sd) -# the function to marginalise over -# (only the variable to marginalise is passed in) -fun <- function (n) { +# the function to marginalise over - the variable to marginalise +# must be first, others must be passed to marginalise() +fun <- function (n, theta, mu, sd) { mu <- theta ^ n distribution(y) <- normal(mu, sd) } @@ -75,7 +77,10 @@ fun <- function (n) { # integrate the function values w.r.t. 
the poisson distribution
marginalise(fun,
            poisson(lambda),
            method = discrete_marginalisation(values = 0:10),
            theta = theta,
            mu = mu,
            sd = sd)

m <- model(lambda)
draws <- mcmc(m, hmc())
}
}

From 7e802cbce65a3a1df9d13501b2a83a801d38a459 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Tue, 8 Jan 2019 12:00:25 +1100
Subject: [PATCH 05/38] fill in last of guess at internals

---
 R/marginalise.R | 164 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 130 insertions(+), 34 deletions(-)

diff --git a/R/marginalise.R b/R/marginalise.R
index d018348c..977ba05f 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -55,13 +55,18 @@ NULL
 #' @export
 #'
 #' @param fun an R function to integrate with respect to the random variable.
-#' The first argument must be - a greta array giving the value of the random
-#' variable, any subsequent arguments must passed in via the \dots argument.
+#' The first argument must be a greta array for the random variable, any
+#' subsequent arguments must be passed in via the \dots argument.
 #' @param variable a variable greta array with a distribution, representing the
 #' random variable to marginalise
 #' @param method a \code{marginaliser} object giving the method for carrying out
 #' the marginalisation
 #' @param \dots named greta arrays to be passed to \code{fun}
+#'
+#' @details The code in \code{fun} must define at least one distribution over
+#' data (i.e. a model likelihood), and cannot create any new variables. Any
+#' variables must be created outside this function, and passed in via the \dots
+#' argument.
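To make the contract in the new @details concrete, here is a hedged editor's sketch (not part of the patch) of a function that satisfies it and one that does not; `y` is observed data, and `theta` and `sd` are variables created outside and passed in via \dots:

y <- rnorm(10)
theta <- normal(0, 1)
sd <- lognormal(0, 1)

# fine: uses only its arguments and the data, and defines a likelihood
ok <- function (n, theta, sd) {
  distribution(y) <- normal(theta ^ n, sd)
}

# not allowed: creates a new variable inside the function, so the
# "must not create any new variables" check will trip at model definition
bad <- function (n) {
  extra <- normal(0, 1)
  distribution(y) <- normal(extra ^ n, 1)
}

marginalise <- function(fun, variable, method, ...) 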
{ dots = dots ) - # excise the variable from the marginalisation distribution - distrib <- get_node(variable)$distribution - distrib$remove_target() - - # return nothing + # return nothing (users can't have distribution nodes on their own) invisible(NULL) } @@ -153,24 +153,22 @@ marginalisation_distribution <- R6Class( paste("input", i)) } - # add the distribution as a parameter - self$add_parameter(distribution, "distribution") + # make the distribution the target + self$remove_target() + self$add_target(distribution) }, tf_distrib = function(parameters, dag) { - # do the marginalisation - - # but it has no target!! - # should we handle distributions with no targets as a special case? - # or should this not be a distribution after all? - + # the marginal density implied by the function log_prob <- function(x) { + # x will be the target; a tf function for the distribution being + # marginalised self$tf_marginaliser(self$conditional_density_fun, - parameters$distribution, - parameters[names(parameters) != "distribution"]) + tf_distribution_log_pdf = x, + parameters) } @@ -212,17 +210,37 @@ discrete_marginalisation <- function(values) { } # define the marginalisation function - tf_marginaliser <- function(conditional_density_fun, - distribution, + tf_marginaliser <- function(tf_conditional_density_fun, + tf_distribution_log_pdf, other_args) { - # base this on mixture; - # 1. get weights from the distribution (use its log_pdf tf method on the - # values) + + values_list <- as.list(values) + + # 1. get weights from the distribution log pdf + + # assuming values is a list, get tensors for the weights + weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(weights_list, tf_sum) + + # convert to a vector and make them sum to 1 + weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- weights_vec / tf_sum(weights_vec) + log_weights_vec <- tf$log(weights_vec) + # 2. compute the conditional joint density for each value (passing in - # other_args) - # 3. compute a weighted sum with tf_reduce_log_sum_exp() + # other_args) + log_density_list <- lapply( + values_list, + tf_conditional_density_fun, + other_args + ) + + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # 3. compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights_vec + tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - stop ("not yet implemented") } as_marginaliser(name = "discrete", @@ -242,10 +260,10 @@ discrete_check <- function(distrib) { } # helper to contruct marginalisers -as_marginaliser <- function (name, method, parameters, distribution_check) { +as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { obj <- list(name = name, - method = method, + tf_marginaliser = tf_marginaliser, parameters = parameters, distribution_check = distribution_check) @@ -261,3 +279,81 @@ print.marginaliser <- function(x, ...) { msg <- paste(x$name, "marginaliser object") cat(msg) } + +# turn the greta function into a TF conditional density function; doing +# something very similar to as_tf_function(), but giving a function that +# returns a tensorflow scalar for the density +as_conditional_density <- function(r_fun, args) { + + # run the operation on isolated greta arrays, so nothing gets attached to the + # model. Real greta arrays in args + ga_dummies <- lapply(args, dummy_greta_array) + out <- do.call(r_fun, ga_dummies) + + # we don't want to rely on the function returning a greta array that depends + # on everything internally. 
instead, we want to grab any densities defined in + # the function. + + # this function will take in tensors corresponding to the things in args + function(...) { + + tensor_inputs <- list(...) + + # if any of these are shapeless, make them into greta scalars (3D) + tensor_inputs <- lapply(tensor_inputs, + function(x) { + if (identical(dim(x), list())) + x <- tf$reshape(x, shape(1, 1, 1)) + x + }) + + # transfer batch dimensions if needed + tensor_inputs <- match_batches(tensor_inputs) + + # create a sub-dag for everything connected to the inputs + sub_dag <- dag_class$new(ga_dummies) + + # check there are distributions + if (!any(sub_dag$node_types == "distribution")) { + stop ("'fun' must constain at least one distribution over data", + call. = FALSE) + } + + # check there are no variables + if (any(sub_dag$node_types == "variable")) { + stop ("'fun' must not create any new variables; ", + "variables can be passed in as arguments", + call. = FALSE) + } + + # use the default graph, so that it can be overwritten when this is called + # (using on_graph()) + sub_dag$tf_graph <- tf$get_default_graph() + sub_tfe <- sub_dag$tf_environment + + # set the input tensors as the values for the dummy greta arrays in the new + # tf_environment + node_dummies <- lapply(ga_dummies, get_node) + tf_names <- lapply(node_dummies, sub_dag$tf_name) + for (i in seq_along(tf_names)) + assign(tf_names[[i]], tensor_inputs[[i]], envir = sub_tfe) + + # have any new data defined as constants, to avoid placeholding issues in + # the wider model + greta_stash$data_as_constants <- TRUE + on.exit(greta_stash$data_as_constants <- NULL) + + # define all nodes (only data, operation and distribution) in the + # environment, and on the graph + sub_dag$on_graph(lapply(sub_dag$node_list, + function(x) x$define_tf(self))) + + # define and return the joint density tensor (no variables, so no need for + # log jacobian adjustment) + sub_dag$define_joint_density() + + sub_tfe$joint_density + + } + +} From ad98d72ea74d629c2e4cea7a99413b6b82660e8a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 14:46:04 +1100 Subject: [PATCH 06/38] be strict about float type in as_tf_function --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 9bfc0f6f..dc50d44e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1151,7 +1151,8 @@ as_tf_function <- function(r_fun, ...) { if (!is.list(ga_out)) ga_out <- list(ga_out) targets <- c(ga_out, ga_dummies) - sub_dag <- dag_class$new(targets) + sub_dag <- dag_class$new(targets, + tf_float = options()$greta_tf_float) # use the default graph, so that it can be overwritten when this is called? # alternatively fetch from above, or put it in greta_stash? 
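Patch 06 matters because TensorFlow does not silently promote between float precisions; if the sub-dag built inside `as_tf_function()` defaulted to a different float type than the enclosing model, combining their tensors would fail. A minimal sketch of that failure mode, using the tensorflow R package (an editor's illustration, not code from the patch):

library(tensorflow)

x32 <- tf$constant(1, dtype = tf$float32)
x64 <- tf$constant(1, dtype = tf$float64)

# mixing precisions is an error rather than an implicit promotion:
# tf$add(x32, x64)  # fails with an incompatible-dtype error

# an explicit cast is required instead
tf$add(tf$cast(x32, tf$float64), x64)

This is why the sub-dag now inherits the model-wide float type from options()$greta_tf_float.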
From 9219a42a97acc4e76c18cafc6794085b1e00bece Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 14:46:52 +1100 Subject: [PATCH 07/38] bugfixes to get marginalise apparently working --- R/dag_class.R | 30 ++++++++++++++++------------ R/marginalise.R | 45 +++++++++++++++++++++++------------------- man/marginalisation.Rd | 9 +++++++-- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index eb6fe48e..b05bae02 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -242,7 +242,7 @@ dag_class <- R6Class( }, # define tensor for overall log density and gradients - define_joint_density = function() { + define_joint_density = function(adjusted = TRUE) { tfe <- self$tf_environment @@ -293,22 +293,26 @@ dag_class <- R6Class( joint_density, envir = self$tf_environment) - # define adjusted joint density + if (adjusted) { - # get names of adjustment tensors for all variable nodes - adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") + # optionally define adjusted joint density - # get TF density tensors for all distribution - adj <- lapply(adj_names, get, envir = self$tf_environment) + # get names of adjustment tensors for all variable nodes + adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") - # remove their names and sum them together - names(adj) <- NULL - self$on_graph(total_adj <- tf$add_n(adj)) + # get TF density tensors for all distribution + adj <- lapply(adj_names, get, envir = self$tf_environment) - # assign overall density to environment - assign("joint_density_adj", - joint_density + total_adj, - envir = self$tf_environment) + # remove their names and sum them together + names(adj) <- NULL + self$on_graph(total_adj <- tf$add_n(adj)) + + # assign overall density to environment + assign("joint_density_adj", + joint_density + total_adj, + envir = self$tf_environment) + + } }, diff --git a/R/marginalise.R b/R/marginalise.R index 977ba05f..513000c3 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -70,7 +70,7 @@ NULL marginalise <- function(fun, variable, method, ...) { # get the distribution for the random variable - distrib <- get_node(variable)$distribution + distr <- get_node(variable)$distribution # check the inputs if (!is.function(fun)) { @@ -85,7 +85,7 @@ marginalise <- function(fun, variable, method, ...) { stop ("all arguments to 'fun' must be passed, named, to marginalise") } - if (!inherits(distrib, "distribution_node")) { + if (!inherits(distr, "distribution_node")) { stop ("'variable' must be a variable greta array with a distribution") } @@ -95,10 +95,10 @@ marginalise <- function(fun, variable, method, ...) { } # check the distribution is compatible with the method - method$distribution_check(distrib) + method$distribution_check(distr) # excise the variable from the distribution - distrib$remove_target() + distr$remove_target() # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that @@ -116,10 +116,10 @@ marginalise <- function(fun, variable, method, ...) 
{ # create the distribution vble <- distrib( - "marginalisation", + distribution = "marginalisation", marginaliser = method, conditional_density_fun = conditional_joint_density, - distribution = distrib, + target_distribution = distr, dots = dots ) @@ -138,7 +138,7 @@ marginalisation_distribution <- R6Class( initialize = function(marginaliser, conditional_density_fun, - distribution, + target_distribution, dots) { # initialize class, and add methods @@ -148,14 +148,14 @@ marginalisation_distribution <- R6Class( # add the dots (extra inputs to conditional_density_fun) as parameters dot_nodes <- lapply(dots, get_node) - for (i in seq_len(dot_nodes)) { + for (i in seq_along(dot_nodes)) { self$add_parameter(dot_nodes[[i]], paste("input", i)) } # make the distribution the target self$remove_target() - self$add_target(distribution) + self$add_target(target_distribution) }, @@ -213,11 +213,14 @@ discrete_marginalisation <- function(values) { tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, other_args) { - + # convert these into a list of constant tensors with the correct dimensions + # and float types values_list <- as.list(values) + values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, add_first_dim) + values_list <- lapply(values_list, fl) # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights weights_list <- lapply(values_list, tf_distribution_log_pdf) weights_list <- lapply(weights_list, tf_sum) @@ -229,12 +232,14 @@ discrete_marginalisation <- function(values) { # 2. compute the conditional joint density for each value (passing in # other_args) - log_density_list <- lapply( - values_list, - tf_conditional_density_fun, - other_args - ) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), other_args) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) log_density_vec <- tf$concat(log_density_list, axis = 1L) # 3. compute a weighted sum @@ -311,7 +316,7 @@ as_conditional_density <- function(r_fun, args) { tensor_inputs <- match_batches(tensor_inputs) # create a sub-dag for everything connected to the inputs - sub_dag <- dag_class$new(ga_dummies) + sub_dag <- dag_class$new(ga_dummies, tf_float = options()$greta_tf_float) # check there are distributions if (!any(sub_dag$node_types == "distribution")) { @@ -321,7 +326,7 @@ as_conditional_density <- function(r_fun, args) { # check there are no variables if (any(sub_dag$node_types == "variable")) { - stop ("'fun' must not create any new variables; ", + stop ("'fun' must not create any new variables, ", "variables can be passed in as arguments", call. 
= FALSE) } @@ -346,11 +351,11 @@ as_conditional_density <- function(r_fun, args) { # define all nodes (only data, operation and distribution) in the # environment, and on the graph sub_dag$on_graph(lapply(sub_dag$node_list, - function(x) x$define_tf(self))) + function(x) x$define_tf(sub_dag))) # define and return the joint density tensor (no variables, so no need for # log jacobian adjustment) - sub_dag$define_joint_density() + sub_dag$define_joint_density(adjusted = FALSE) sub_tfe$joint_density diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index f5968662..958a3ec6 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -12,8 +12,8 @@ discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. -The first argument must be - a greta array giving the value of the random -variable, any subsequent arguments must passed in via the \dots argument.} +The first argument must be a greta array for the random variable, any +subsequent arguments must passed in via the \dots argument.} \item{variable}{a variable greta array with a distribution, representing the random variable to marginalise} @@ -41,6 +41,11 @@ Inference on many statistical models requires marginalisation of marginalisation within a greta model, using one of the available marginalisation methods. } \details{ +The code in \code{fun} must define at least one distribution over + data (ie. a model likelihood), and cannot create any new variables. Any + variables must be created outide this function, and passed in via the \dots + argument. + \code{discrete_marginalisation} can only be used with discrete probability distributions, e.g. those defined with \code{poisson()} and \code{binomial()}. For discrete distributions with finite support (such as From 24fabd1a959308e784e178567cbf8a066d132bf6 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 8 Jan 2019 16:53:36 +1100 Subject: [PATCH 08/38] add marginalisation tests and realted fixes; fix marginalise() example --- R/marginalise.R | 11 ++-- man/marginalisation.Rd | 5 +- tests/testthat/test_marginalisation.R | 95 ++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 513000c3..6a4ff340 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -33,17 +33,16 @@ #' #' # the function to marginalise over - the variable to marginalise #' # must be first, others must be passed to marginalise() -#' fun <- function (n, theta, mu, sd) { +#' likelihood <- function (n, theta, sd) { #' mu <- theta ^ n #' distribution(y) <- normal(mu, sd) #' } #' #' # integrate the function values w.r.t. 
the poisson distribution -#' marginalise(fun, -#' poisson(lambda), +#' marginalise(fun = likelihood, +#' variable = poisson(lambda), #' method = discrete_marginalisation(values = 0:10), #' theta = theta, -#' mu = mu, #' sd = sd) #' #' m <- model(lambda) @@ -213,6 +212,7 @@ discrete_marginalisation <- function(values) { tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, other_args) { + # convert these into a list of constant tensors with the correct dimensions # and float types values_list <- as.list(values) @@ -225,8 +225,9 @@ discrete_marginalisation <- function(values) { weights_list <- lapply(values_list, tf_distribution_log_pdf) weights_list <- lapply(weights_list, tf_sum) - # convert to a vector and make them sum to 1 + # convert to a vector of discrete probabilities and make them sum to 1 weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- tf$exp(weights_vec) weights_vec <- weights_vec / tf_sum(weights_vec) log_weights_vec <- tf$log(weights_vec) diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index 958a3ec6..e3365ab7 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -74,17 +74,16 @@ sd <- lognormal(0, 1) # the function to marginalise over - the variable to marginalise # must be first, others must be passed to marginalise() -fun <- function (n, theta, mu, sd) { +likelihood <- function (n, theta, sd) { mu <- theta ^ n distribution(y) <- normal(mu, sd) } # integrate the function values w.r.t. the poisson distribution -marginalise(fun, +marginalise(likelihood, poisson(lambda), method = discrete_marginalisation(values = 0:10), theta = theta, - mu = mu, sd = sd) m <- model(lambda) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 1558dbb3..27ce7c23 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -7,13 +7,13 @@ test_that("marginalise errors nicely", { # not a function expect_error( - marginalise("apple", normal(0, 1), discrete_marginalisation()), + marginalise("apple", poisson(0.1), discrete_marginalisation(0:2)), "must be an R function" ) # not a greta array expect_error( - marginalise(I, 1:5, discrete_marginalisation()), + marginalise(I, 1:5, discrete_marginalisation(0:2)), "must be a variable greta array with a distribution" ) @@ -25,12 +25,33 @@ test_that("marginalise errors nicely", { # not a marginaliser expect_error( - marginalise(I, normal(0, 1), mean), + marginalise(I, poisson(0.1), mean), "'method' must be a valid marginalisation method" ) + # the following can only be assessed on final model definition + + # function adds variables + fun <- function (x) {x + normal(0, 1)} + lambda <- variable() + marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) + expect_error( + mcmc(model(lambda)), + "must not create any new variables" + ) + + # function has no distribution + fun <- function (x) {x * 2} + lambda <- variable() + marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) + expect_error( + mcmc(model(lambda)), + "must constain at least one distribution over data" + ) + }) + test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) @@ -55,3 +76,71 @@ test_that("discrete_marginalisation errors nicely", { ) }) + + +test_that("inference runs with discrete marginalisation", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # spike+slab poisson glm (in reality this is poorly identified) + n <- 100 + x <- runif(n, -5, 5) + y <- rpois(n, exp(1)) + p <- 
variable(lower = 0, upper = 1) + alpha <- variable() + beta <- variable() + likelihood <- function (z, alpha, beta) { + eta <- alpha + beta * x * z + lambda <- exp(eta) + distribution(y) <- poisson(lambda) + } + marginalise(likelihood, + bernoulli(p), + method = discrete_marginalisation(0:1), + alpha = alpha, + beta = beta) + m <- model(alpha, beta, p) + + expect_ok(o <- opt(m)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20)) + +}) + + +test_that("discrete marginalisation gives correct densities", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # no constraints to no adjustment to account for + lambda <- variable() + y <- rnorm(100) + likelihood <- function (z) { + distribution(y) <- normal(z, 1) + } + values <- 0:5 + marginalise(likelihood, + poisson(lambda), + method = discrete_marginalisation(values)) + m <- model(lambda) + o <- opt(m) + + lambda_val <- runif(1, 0, 5) + + # manually compute expected marginal likelihood given this value of lambda + wt <- dpois(values, lambda_val) + wt <- wt / sum(wt) + dens <- vapply(y, dnorm, values, 1, + FUN.VALUE = rep(1, length(values))) + dens <- apply(dens, 1, prod) + expected <- log(sum(wt * dens)) + + # compute with greta + m$dag$send_parameters(lambda_val) + observed <- m$dag$log_density() + + # compare results (within tolerance) + compare_op(expected, observed) + +}) From 7b9d845004d2b25e0af1213143e96b15689d0256 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 14:16:14 +1100 Subject: [PATCH 09/38] split marginalisers into separate helpfile --- DESCRIPTION | 1 + R/marginalise.R | 111 +------------------------------------- R/marginalisers.R | 117 +++++++++++++++++++++++++++++++++++++++++ man/marginalisation.Rd | 27 ++-------- man/marginalisers.Rd | 33 ++++++++++++ 5 files changed, 157 insertions(+), 132 deletions(-) create mode 100644 R/marginalisers.R create mode 100644 man/marginalisers.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 64ebfb03..ee6dbb9e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,6 +62,7 @@ Collate: 'calculate.R' 'callbacks.R' 'marginalise.R' + 'marginalisers.R' Imports: R6, tensorflow, diff --git a/R/marginalise.R b/R/marginalise.R index 6a4ff340..d706f1fc 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -10,10 +10,8 @@ #' within a larger model; e.g. for random variables where marginalisation by #' MCMC is inefficient, impossible, or when an approximate marginalisation is #' substantially more efficient. \code{marginalise()} performs direct -#' marginalisation within a greta model, using one of the available marginalisation methods. -#' -#' @return \code{discrete_marginalisation} - a \code{marginaliser} object that -#' can be passed to \code{marginalise}. +#' marginalisation within a greta model, using one of the available +#' marginalisation methods, detailed in \link{marginalisers}. #' #' @examples #' \dontrun{ @@ -181,111 +179,6 @@ marginalisation_distribution <- R6Class( ) ) -#' @rdname marginalisation -#' @export -#' -#' @param values an R vector giving values at which to evaluate the function for a -#' discrete marginalisation -#' -#' @details \code{discrete_marginalisation} can only be used with discrete -#' probability distributions, e.g. those defined with \code{poisson()} and -#' \code{binomial()}. For discrete distributions with finite support (such as -#' \code{bernoulli()}) the marginalisation will be exact, so long as -#' \code{values} includes all possible values of the variable. 
For discrete -#' distributions with non-finite support (such as \code{poisson()}, which has -#' no upper bound), the marginalisation can only ever be approximate. However -#' if \code{values} cover a range of values with sufficiently high support in -#' the distribution, that approximation error will be minimal. -discrete_marginalisation <- function(values) { - - if (!is.vector(values) | !is.numeric(values)) { - msg <- "'values' must be an R numeric vector" - - if (inherits(values, "greta_array")) { - msg <- paste0(msg, ", not a greta array") - } - - stop (msg) - } - - # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, - other_args) { - - # convert these into a list of constant tensors with the correct dimensions - # and float types - values_list <- as.list(values) - values_list <- lapply(values_list, as_2D_array) - values_list <- lapply(values_list, add_first_dim) - values_list <- lapply(values_list, fl) - - # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tf_distribution_log_pdf) - weights_list <- lapply(weights_list, tf_sum) - - # convert to a vector of discrete probabilities and make them sum to 1 - weights_vec <- tf$concat(weights_list, axis = 1L) - weights_vec <- tf$exp(weights_vec) - weights_vec <- weights_vec / tf_sum(weights_vec) - log_weights_vec <- tf$log(weights_vec) - - # 2. compute the conditional joint density for each value (passing in - # other_args) - log_density_list <- list() - for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args) - log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) - } - - log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) - log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) - log_density_vec <- tf$concat(log_density_list, axis = 1L) - - # 3. compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights_vec - tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - - } - - as_marginaliser(name = "discrete", - tf_marginaliser = tf_marginaliser, - parameters = list(values = values), - distribution_check = discrete_check) - -} - -# check that the distribution is discrete -discrete_check <- function(distrib) { - if (!distrib$discrete) { - stop ("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) - } -} - -# helper to contruct marginalisers -as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { - - obj <- list(name = name, - tf_marginaliser = tf_marginaliser, - parameters = parameters, - distribution_check = distribution_check) - - class_name <- paste0(name, "_marginaliser") - class(obj) <- c(class_name, "marginaliser") - obj - -} - -#' @noRd -#' @export -print.marginaliser <- function(x, ...) 
{ - msg <- paste(x$name, "marginaliser object") - cat(msg) -} - # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density diff --git a/R/marginalisers.R b/R/marginalisers.R new file mode 100644 index 00000000..4612cafc --- /dev/null +++ b/R/marginalisers.R @@ -0,0 +1,117 @@ +# marginaliser constructors + +#' @name marginalisers +#' +#' @title marginalisation methods +#' @description Functions to set up marginalisers (which explicitly integrate +#' out random variables) and modify their behaviour, for use in +#' \code{\link{marginalise}()}. +#' +#' @return a \code{marginaliser} object that can be passed to +#' \code{marginalise}. +NULL + +#' @rdname marginalisers +#' @export +#' +#' @param values an R vector giving values at which to evaluate the function for a +#' discrete marginalisation +#' +#' @details \code{discrete_marginalisation} can only be used with discrete +#' probability distributions, e.g. those defined with \code{poisson()} and +#' \code{binomial()}. For discrete distributions with finite support (such as +#' \code{bernoulli()}) the marginalisation will be exact, so long as +#' \code{values} includes all possible values of the variable. For discrete +#' distributions with non-finite support (such as \code{poisson()}, which has +#' no upper bound), the marginalisation can only ever be approximate. However +#' if \code{values} cover a range of values with sufficiently high support in +#' the distribution, that approximation error will be minimal. +discrete_marginalisation <- function(values) { + + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop (msg) + } + + # define the marginalisation function + tf_marginaliser <- function(tf_conditional_density_fun, + tf_distribution_log_pdf, + other_args) { + + # convert these into a list of constant tensors with the correct dimensions + # and float types + values_list <- as.list(values) + values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, add_first_dim) + values_list <- lapply(values_list, fl) + + # 1. get weights from the distribution log pdf + # assuming values is a list, get tensors for the weights + weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1 + weights_vec <- tf$concat(weights_list, axis = 1L) + weights_vec <- tf$exp(weights_vec) + weights_vec <- weights_vec / tf_sum(weights_vec) + log_weights_vec <- tf$log(weights_vec) + + # 2. compute the conditional joint density for each value (passing in + # other_args) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), other_args) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # 3. 
compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights_vec + tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + + } + + as_marginaliser(name = "discrete", + tf_marginaliser = tf_marginaliser, + parameters = list(values = values), + distribution_check = discrete_check) + +} + +# check that the distribution is discrete +discrete_check <- function(distrib) { + if (!distrib$discrete) { + stop ("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } +} + +# helper to contruct marginalisers +as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { + + obj <- list(name = name, + tf_marginaliser = tf_marginaliser, + parameters = parameters, + distribution_check = distribution_check) + + class_name <- paste0(name, "_marginaliser") + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @noRd +#' @export +print.marginaliser <- function(x, ...) { + msg <- paste(x$name, "marginaliser object") + cat(msg) +} diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd index e3365ab7..10eb2a0e 100644 --- a/man/marginalisation.Rd +++ b/man/marginalisation.Rd @@ -3,12 +3,9 @@ \name{marginalisation} \alias{marginalisation} \alias{marginalise} -\alias{discrete_marginalisation} \title{direct marginalisation of random variables} \usage{ marginalise(fun, variable, method, ...) - -discrete_marginalisation(values) } \arguments{ \item{fun}{an R function to integrate with respect to the random variable. @@ -22,13 +19,6 @@ random variable to marginalise} the marginalisation} \item{\dots}{named greta arrays to be passed to \code{fun}} - -\item{values}{an R vector giving values at which to evaluate the function for a -discrete marginalisation} -} -\value{ -\code{discrete_marginalisation} - a \code{marginaliser} object that - can be passed to \code{marginalise}. } \description{ Inference on many statistical models requires marginalisation of @@ -38,23 +28,14 @@ Inference on many statistical models requires marginalisation of within a larger model; e.g. for random variables where marginalisation by MCMC is inefficient, impossible, or when an approximate marginalisation is substantially more efficient. \code{marginalise()} performs direct - marginalisation within a greta model, using one of the available marginalisation methods. + marginalisation within a greta model, using one of the available + marginalisation methods, detailed in \link{marginalisers}. } \details{ The code in \code{fun} must define at least one distribution over data (ie. a model likelihood), and cannot create any new variables. Any variables must be created outide this function, and passed in via the \dots argument. - -\code{discrete_marginalisation} can only be used with discrete - probability distributions, e.g. those defined with \code{poisson()} and - \code{binomial()}. For discrete distributions with finite support (such as - \code{bernoulli()}) the marginalisation will be exact, so long as - \code{values} includes all possible values of the variable. For discrete - distributions with non-finite support (such as \code{poisson()}, which has - no upper bound), the marginalisation can only ever be approximate. However - if \code{values} cover a range of values with sufficiently high support in - the distribution, that approximation error will be minimal. } \examples{ \dontrun{ @@ -80,8 +61,8 @@ likelihood <- function (n, theta, sd) { } # integrate the function values w.r.t. 
the poisson distribution -marginalise(likelihood, - poisson(lambda), +marginalise(fun = likelihood, + variable = poisson(lambda), method = discrete_marginalisation(values = 0:10), theta = theta, sd = sd) diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd new file mode 100644 index 00000000..aea7562e --- /dev/null +++ b/man/marginalisers.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalisers.R +\name{marginalisers} +\alias{marginalisers} +\alias{discrete_marginalisation} +\title{marginalisation methods} +\usage{ +discrete_marginalisation(values) +} +\arguments{ +\item{values}{an R vector giving values at which to evaluate the function for a +discrete marginalisation} +} +\value{ +a \code{marginaliser} object that can be passed to + \code{marginalise}. +} +\description{ +Functions to set up marginalisers (which explicitly integrate + out random variables) and modify their behaviour, for use in + \code{\link{marginalise}()}. +} +\details{ +\code{discrete_marginalisation} can only be used with discrete + probability distributions, e.g. those defined with \code{poisson()} and + \code{binomial()}. For discrete distributions with finite support (such as + \code{bernoulli()}) the marginalisation will be exact, so long as + \code{values} includes all possible values of the variable. For discrete + distributions with non-finite support (such as \code{poisson()}, which has + no upper bound), the marginalisation can only ever be approximate. However + if \code{values} cover a range of values with sufficiently high support in + the distribution, that approximation error will be minimal. +} From 86894816407045b74031c00e9403b7ffd8012361 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 15:39:31 +1100 Subject: [PATCH 10/38] flesh out laplace_approximation marginaliser --- NAMESPACE | 1 + R/marginalisers.R | 225 +++++++++++++++++++++++++++++++++++++++++++ man/marginalisers.Rd | 35 +++++++ 3 files changed, 261 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 24e7e5c3..9f8bb8f2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -167,6 +167,7 @@ export(iprobit) export(joint) export(l_bfgs_b) export(laplace) +export(laplace_approximation) export(lkj_correlation) export(log1pe) export(logistic) diff --git a/R/marginalisers.R b/R/marginalisers.R index 4612cafc..3274a4f3 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -7,6 +7,14 @@ #' out random variables) and modify their behaviour, for use in #' \code{\link{marginalise}()}. #' +#' \code{discrete_marginalisation} marginalises discrete random variables via +#' a Riemann sum over a range of plausible values. +#' +#' \code{laplace_approximation} marginalises random variables (with +#' multivariate normal priors) approximately, using by Newton-Raphson +#' optimisation and assuming the posterior follows a multivariate normal +#' distribution. +#' #' @return a \code{marginaliser} object that can be passed to #' \code{marginalise}. 
NULL
 
@@ -86,6 +94,214 @@ discrete_marginalisation <- function(values) {
 
 }
 
+#' @rdname marginalisers
+#' @export
+#'
+#' @param stepsize the (positive) size of steps in the Newton-Raphson
+#'   optimisation algorithm used to find the approximate conditional posterior
+#'   mode of the random variable
+#'
+#' @param stepsize the (positive) numerical convergence tolerance (in units of
+#'   the log conditional posterior density) of the Newton-Raphson optimisation
+#'   algorithm
+#'
+#' @param max_iterations the (positive) integer-like maximum number of iterations of
+#'   the Newton-Raphson optimisation algorithm
+#'
+#' @param warm_start whether to start the Newton-Raphson optimisation algorithm
+#'   at the optimal value from the last iteration of model inference (if
+#'   \code{TRUE}), or to randomly re-initialise it.
+#'
+#' @details \code{laplace_approximation} can only be used to marginalise
+#'   variables following a multivariate normal distribution. In addition, the
+#'   function to be marginalised must \emph{factorise}; ie. it must return a
+#'   vector-valued density with as many elements as the vector variable being
+#'   marginalised, and each element of the density must depend only on the
+#'   corresponding element of the variable vector. This is the responsibility of
+#'   the user, and is not checked.
+laplace_approximation <- function(stepsize = 0.1,
+                                  tolerance = 1e-6,
+                                  max_iterations = 50,
+                                  warm_start = TRUE) {
+
+  # to do:
+  #  - get tensors for parameters of the distribution
+
+  # in future:
+  #  - enable a non-factorising version (have the user say whether it is
+  #    factorising)
+  #  - handle an iid normal distribution too.
+
+  # check arguments, Newton-Raphson iterations etc.
+  if (!(is.numeric(stepsize) &&
+        is.vector(stepsize) &&
+        length(stepsize) == 1 &&
+        stepsize > 0)) {
+    stop ("'stepsize' must be a positive, scalar numeric value")
+  }
+
+  if (!(is.numeric(tolerance) &&
+        is.vector(tolerance) &&
+        length(tolerance) == 1 &&
+        tolerance > 0)) {
+    stop ("'tolerance' must be a positive, scalar numeric value")
+  }
+
+  max_iterations <- as.integer(max_iterations)
+  if (!(is.vector(max_iterations) &&
+        length(max_iterations) == 1 &&
+        max_iterations > 0)) {
+    stop ("'max_iterations' must be a positive, scalar integer value")
+  }
+
+  if (!(is.logical(warm_start) && warm_start %in% c(TRUE, FALSE))) {
+    stop ("'warm_start' must be either TRUE or FALSE")
+  }
+
+  # define the marginalisation function
+  tf_marginaliser <- function(tf_conditional_density_fun,
+                              tf_distribution_log_pdf,
+                              other_args) {
+
+    # get the first and second derivatives of the conditional density function,
+    # w.r.t. the variable being marginalised
+    derivs <- function (z) {
+      y <- tf_conditional_density_fun(z)
+      d1 <- tf$gradients(y, z)[[1]]
+      d2 <- tf$gradients(d1, z)[[1]]
+      list(d1, d2)
+    }
+
+    # negative log-posterior for the current value of z under MVN assumption
+    psi <- function (a, z, mu, d0) {
+      fl(0.5) * t(a) %*% (z - mu) - tf_sum(tf_conditional_density_fun(z))
+    }
+
+    # # compute psi for different values of the stepsize (s)
+    # # to be added later, if I can work out how to nest/recode this linesearch optimisation
+    # psiline <- function (s, adiff, a, K, mu, d0) {
+    #   a <- a + s * as.vector(adiff)
+    #   z <- K %*% a + mu
+    #   psi(a, z, mu, d0)
+    # }
+
+    # dimension of the MVN distribution - how to pass in this and the parameters?
+    n <- ?
+
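For intuition, psi() above is the negative log of the unnormalised conditional posterior, with z tied to a through the prior covariance. A minimal base-R analogue, assuming an iid unit-variance normal likelihood and made-up values:

    # psi(a) = 0.5 * a' (z - mu) - log p(y | z), with z = Sigma %*% a + mu
    n <- 4
    Sigma <- diag(n); mu <- rep(0, n); y <- rnorm(n)
    loglik <- function(z) sum(dnorm(y, z, 1, log = TRUE))
    psi_r <- function(a) {
      z <- as.vector(Sigma %*% a) + mu
      0.5 * sum(a * (z - mu)) - loglik(z)
    }
    psi_r(rep(0, n))  # objective at the prior mean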
+    # if the user wants hot starts, stash the last estimate of z, so we can
+    # start there in the next iteration of the outer inference algorithm.
+    z <- NULL
+    if (warm_start) {
+      z <- greta_stash$laplace_z_est
+    }
+
+    # otherwise randomly initialise
+    if (is.null(z)) {
+      z_value <- add_first_dim(as_2D_array(rnorm(n)))
+      z <- tf$constant(z_value, dtype = tf_float())
+    }
+
+    # get parameters of MVN distribution
+    mu <- ?
+    Sigma <- ?
+
+    # Newton-Raphson parameters
+    tolerance <- add_first_dim(as_2D_array(tolerance))
+    tol <- tf$constant(tolerance, tf_float())
+    obj_old <- tf$constant(-Inf, tf_float(), shape(1, 1, 1))
+    obj <- -tf_sum(tf_conditional_density_fun(z))
+    iter <- tf$constant(0L)
+    maxiter <- tf$constant(as.integer(max_iterations))
+
+    # other objects
+    a_value <- add_first_dim(as_2D_array(rep(0, n)))
+    a <- tf$constant(a_value, dtype = tf_float())
+    U_value <- add_first_dim(diag(n))
+    U <- tf$constant(U_value, tf_float())
+    eye <- tf$constant(add_first_dim(diag(n)),
+                       dtype = tf_float())
+
+
+    # need tensorflow while loop to do Newton-Raphson iterations
+
+    body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
+
+      obj.old <- obj
+
+      deriv <- derivs(z)
+      d1 <- deriv[[2]]
+      d2 <- deriv[[3]]
+
+      # curvature of the likelihood
+      W <- -d2
+      rW <- sqrt(W)
+
+      # decentred values of z
+      cf <- z - mu
+
+      # approximate posterior covariance & cholesky factor
+      mat1 <- rW %*% t(rW) * Sigma + eye
+      U <- tf$cholesky(mat1)
+      L <- tf_transpose(U)
+
+      # compute Newton-Raphson update direction
+      b <- W * cf + d1
+      mat2 <- rW * (Sigma %*% b)
+      mat3 <- tf$matrix_triangular_solve(U, mat2)
+      adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a
+
+      # # find the optimal stepsize in that dimension (to be added later)
+      # res <- optimise(psiline, c(0, 2), adiff, a, Sigma, mu)
+      # stepsize <- res$minimum
+
+      # do the update and compute new z and objective
+      a_new <- a + stepsize * adiff
+      z_new <- Sigma %*% a_new + mu
+      obj <- psi(a_new, z_new, mu)
+
+      list(z_new, a_new, U, obj_old, obj, tol, iter + 1, maxiter)
+
+    }
+
+    cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
+      tf$less(iter, maxiter) & tf$greater(obj_old - obj, tol)
+    }
+
+    values <- list(z,
+                   a,
+                   U,
+                   obj_old,
+                   obj,
+                   tol,
+                   iter,
+                   maxiter)
+
+    # run the Newton-Raphson optimisation to find the posterior mode of z
+    out <- tf$while_loop(cond, body, values)
+
+    z <- out$z
+    a <- out$a
+    U <- out$U
+
+    # store the estimate in the stash as an R vector (this will break with nested laplace
+    # approximations!)
+    greta_stash$laplace_z_est <- z
+
+    # the approximate marginal conditional posterior
+    lp <- tf_sum(tf_conditional_density_fun(z))
+    mnll <- (a %*% (z - mu)) / fl(2) - lp + tf_sum(tf$log(tf$matrix_diag_part(U)))
+
+    -mnll
+
+  }
+
+  as_marginaliser(name = "laplace",
+                  tf_marginaliser = tf_marginaliser,
+                  parameters = list(),
+                  distribution_check = multivariate_normal_check)
+
+}
+
 # check that the distribution is discrete
 discrete_check <- function(distrib) {
   if (!distrib$discrete) {
@@ -95,6 +311,15 @@ discrete_check <- function(distrib) {
   }
 }
 
+# check that the distribution is multivariate normal
+multivariate_normal_check <- function (distrib) {
+  if (distrib$distribution_name != "multivariate_normal") {
+    stop ("the Laplace approximation can only be used ",
+          "with a multivariate normal distribution",
+          call.
= FALSE)
+  }
+}
+
 # helper to construct marginalisers
 as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) {
 
diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd
index aea7562e..b2c19b24 100644
--- a/man/marginalisers.Rd
+++ b/man/marginalisers.Rd
@@ -3,13 +3,32 @@
 \name{marginalisers}
 \alias{marginalisers}
 \alias{discrete_marginalisation}
+\alias{laplace_approximation}
 \title{marginalisation methods}
 \usage{
 discrete_marginalisation(values)
+
+laplace_approximation(stepsize = 0.1, tolerance = 1e-06,
+  max_iterations = 50, warm_start = TRUE)
 }
 \arguments{
 \item{values}{an R vector giving values at which to evaluate the function for a
 discrete marginalisation}
+
+\item{stepsize}{the (positive) size of steps in the Newton-Raphson
+optimisation algorithm used to find the approximate conditional posterior
+mode of the random variable}
+
+\item{max_iterations}{the (positive) integer-like maximum number of iterations of
+the Newton-Raphson optimisation algorithm}
+
+\item{warm_start}{whether to start the Newton-Raphson optimisation algorithm
+at the optimal value from the last iteration of model inference (if
+\code{TRUE}), or to randomly re-initialise it.}
+
+\item{stepsize}{the (positive) numerical convergence tolerance (in units of
+the log conditional posterior density) of the Newton-Raphson optimisation
+algorithm}
 }
 \value{
 a \code{marginaliser} object that can be passed to
@@ -19,6 +38,14 @@ a \code{marginaliser} object that can be passed to
 Functions to set up marginalisers (which explicitly integrate
   out random variables) and modify their behaviour, for use in
   \code{\link{marginalise}()}.
+
+  \code{discrete_marginalisation} marginalises discrete random variables via
+  a Riemann sum over a range of plausible values.
+
+  \code{laplace_approximation} marginalises random variables (with
+  multivariate normal priors) approximately, using Newton-Raphson
+  optimisation and assuming the posterior follows a multivariate normal
+  distribution.
 }
 \details{
 \code{discrete_marginalisation} can only be used with discrete
@@ -30,4 +57,12 @@ Functions to set up marginalisers (which explicitly integrate
 no upper bound), the marginalisation can only ever be approximate. However
 if \code{values} cover a range of values with sufficiently high support in
 the distribution, that approximation error will be minimal.
+
+\code{laplace_approximation} can only be used to marginalise
+  variables following a multivariate normal distribution. In addition, the
+  function to be marginalised must \emph{factorise}; ie. it must return a
+  vector-valued density with as many elements as the vector variable being
+  marginalised, and each element of the density must depend only on the
+  corresponding element of the variable vector. This is the responsibility of
+  the user, and is not checked.
} From 125a989bef4742d6efbae9a735281676afe9da1f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 9 Jan 2019 18:10:18 +1100 Subject: [PATCH 11/38] get laplace approximation (almost) working --- R/dag_class.R | 5 ++ R/marginalise.R | 24 +++++++-- R/marginalisers.R | 123 ++++++++++++++++++++++--------------------- man/marginalisers.Rd | 8 +-- 4 files changed, 90 insertions(+), 70 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index b05bae02..ccf8f20a 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -281,6 +281,11 @@ dag_class <- R6Class( MoreArgs = list(envir = tfe), SIMPLIFY = FALSE) + # assign the un-reduced densities, for use in marginalisation + assign("component_densities", + densities, + envir = self$tf_environment) + # reduce_sum them self$on_graph(summed_densities <- lapply(densities, tf_sum, drop = TRUE)) diff --git a/R/marginalise.R b/R/marginalise.R index d706f1fc..f742dcb4 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -162,10 +162,13 @@ marginalisation_distribution <- R6Class( log_prob <- function(x) { # x will be the target; a tf function for the distribution being - # marginalised + # marginalised. Pass in the distribution node and the dag too, in case + # the marginalisers need them. self$tf_marginaliser(self$conditional_density_fun, tf_distribution_log_pdf = x, - parameters) + other_args = parameters, + dag = dag, + distribution_node = self$target) } @@ -194,7 +197,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(...) { + function(..., reduce = TRUE) { tensor_inputs <- list(...) @@ -251,7 +254,20 @@ as_conditional_density <- function(r_fun, args) { # log jacobian adjustment) sub_dag$define_joint_density(adjusted = FALSE) - sub_tfe$joint_density + # whether to reduce_sum the densities, or sum them elementwise + if (reduce) { + result <- sub_tfe$joint_density + } else { + densities <- sub_tfe$component_densities + if (length(densities) > 1) { + names(densities) <- NULL + result <- tf$add_n(densities) + } else { + result <- densities[[1]] + } + } + + result } diff --git a/R/marginalisers.R b/R/marginalisers.R index 3274a4f3..1908066b 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -49,7 +49,9 @@ discrete_marginalisation <- function(values) { # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, - other_args) { + other_args, + dag, + distribution_node) { # convert these into a list of constant tensors with the correct dimensions # and float types @@ -108,10 +110,6 @@ discrete_marginalisation <- function(values) { #' @param max_iterations the (positive) integer-like maximum number of iterations of #' the Newton-Raphson optimisation algorithm #' -#' @param warm_start whether to start the Newton-Raphson optimisation algorithm -#' at the optimal value from the last iteration of model inference (if -#' \code{TRUE}), or to randomly re-initialise it. -#' #' @details \code{laplace_approximation} can only be used to marginalise #' variables following a multivariate normal distribution. In addition, the #' function to be marginalised must \emph{factorise}; ie. it must return a @@ -119,15 +117,13 @@ discrete_marginalisation <- function(values) { #' marginalised, and each of element of the density must depend only on the #' corresponding element of the variable vector. This is the responsibility of #' the user, and is not checked. 
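To make the factorisation requirement concrete, a sketch in greta-style code (assuming a length-n data vector y defined outside the functions, and theta a row-vector greta array as in the tests below):

    # factorises: element i of the density depends only on theta[i]
    lik_ok <- function(theta) {
      distribution(y) <- normal(t(theta), 1)
    }

    # does not factorise: element i depends on theta[1:i], so the elementwise
    # curvature assumed by the approximation would be wrong
    lik_bad <- function(theta) {
      distribution(y) <- normal(cumsum(t(theta)), 1)
    }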
-laplace_approximation <- function(stepsize = 0.1, +laplace_approximation <- function(stepsize = 0.05, tolerance = 1e-6, - max_iterations = 50, - warm_start = TRUE) { - - # to do: - # - get tensors for parameters of the distribution + max_iterations = 50) { # in future: + # - enable warm starts for subsequent steps of the outer inference algorithm + # do linesearch in the Newton-Raphson iterations # - enable a non-factorising version (have the user say whether it is # factorising) # - handle an iid normal distribution too. @@ -154,27 +150,31 @@ laplace_approximation <- function(stepsize = 0.1, stop ("'max_iterations' must be a positive, scalar integer value") } - if (!(is.logical(warm_start) && warm_start %in% c(TRUE, FALSE))) { - stop ("'warm_start' must be either TRUE or FALSE") - } - # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, tf_distribution_log_pdf, - other_args) { + other_args, + dag, + distribution_node) { + + # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities + d0 <- function (z, reduce = TRUE) { + args <- c(list(z), other_args, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } - # get the first and second derivatives of the conditional density function, + # get the vectors of first and second derivatives of the conditional density function, # w.r.t. the variable being marginalised derivs <- function (z) { - y <- tf_conditional_density_fun(z) + y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] d2 <- tf$gradients(d1, z)[[1]] list(d1, d2) } # negative log-posterior for the current value of z under MVN assumption - psi <- function (a, z, mu, d0) { - fl(0.5) * t(a) %*% (z - mu) - tf_sum(tf_conditional_density_fun(z)) + psi <- function (a, z, mu) { + fl(0.5) * tf$matmul(tf_transpose(a), (z - mu)) - d0(z) } # # compute psi for different values of the stepsize (s) @@ -185,33 +185,26 @@ laplace_approximation <- function(stepsize = 0.1, # psi(a, z, mu, d0) # } - # dimension of the MVN distribution - how to pass in this and the parameters? - n <- ? + # tensors for parameters of MVN distribution (make mu column vector now) + mu_node <- distribution_node$parameters$mean + mu <- dag$tf_environment[[dag$tf_name(mu_node)]] + mu <- tf_transpose(mu) + Sigma_node <- distribution_node$parameters$Sigma + Sigma <- dag$tf_environment[[dag$tf_name(Sigma_node)]] - # if the user wants hot starts, stash the last estimate of z, so we can - # start there in the next iteration of the outer inference algorithm. - z <- NULL - if (warm_start) { - z <- greta_stash$laplace_z_est - } - - # otherwise randomly initialise - if (is.null(z)) { - z_value <- add_first_dim(as_2D_array(rnorm(n))) - z <- tf$constant(z_value, dtype = tf_float()) - } + # dimension of the MVN distribution + n <- dim(mu)[[2]] - # get parameters of MVN distribution - mu <- ? - Sigma <- ? 
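For reference, the mode search these mu and Sigma tensors feed follows Algorithm 3.1 of Rasmussen & Williams (2006); one un-batched Newton-Raphson step in base R, assuming a unit-variance Gaussian likelihood for the derivatives (illustrative values only):

    n <- 4
    Sigma <- diag(n) * 2; mu <- rep(0, n); y <- rnorm(n)
    z <- rnorm(n); a <- rep(0, n)
    d1 <- y - z                    # first derivative of log p(y | z)
    W  <- rep(1, n)                # negative second derivative (curvature)
    rW <- sqrt(W)
    B  <- outer(rW, rW) * Sigma + diag(n)
    U  <- chol(B)                  # upper Cholesky factor, as in the TF code
    b  <- W * (z - mu) + d1
    adiff <- b - rW * backsolve(U, forwardsolve(t(U), rW * (Sigma %*% b))) - a
    a <- a + adiff                 # full step; the TF version scales this
    z <- as.vector(Sigma %*% a) + mu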
+ # randomly initialise z, and expand to batch (warm starts will need some TF + # trickery) + z_value <- add_first_dim(as_2D_array(rnorm(n))) + z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters - tolerance <- add_first_dim(as_2D_array(tolerance)) - tol <- tf$constant(tolerance, tf_float()) - obj_old <- tf$constant(-Inf, tf_float(), shape(1, 1, 1)) - obj <- -tf_sum(tf_conditional_density_fun(z)) + tol <- tf$constant(tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(-Inf, tf_float(), shape(1)) iter <- tf$constant(0L) - maxiter <- tf$constant(as.integer(max_iterations)) + maxiter <- tf$constant(max_iterations) # other objects a_value <- add_first_dim(as_2D_array(rep(0, n))) @@ -220,17 +213,26 @@ laplace_approximation <- function(stepsize = 0.1, U <- tf$constant(U_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) + stepsize <- fl(stepsize) + # match batches on everything going into the loop that will have a batch + # dimension + z <- expand_to_batch(z, mu) + a <- expand_to_batch(a, mu) + U <- expand_to_batch(U, mu) + obj_old <- expand_to_batch(obj_old, mu) + tol <- expand_to_batch(tol, mu) - # need tensorflow while loop to do Newton-Raphson iterations + obj <- -d0(z) + # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { obj.old <- obj deriv <- derivs(z) - d1 <- deriv[[2]] - d2 <- deriv[[3]] + d1 <- deriv[[1]] + d2 <- deriv[[2]] # curvature of the likelihood W <- -d2 @@ -240,13 +242,13 @@ laplace_approximation <- function(stepsize = 0.1, cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- rW %*% t(rW) * Sigma + eye + mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye U <- tf$cholesky(mat1) L <- tf_transpose(U) # compute Newton-Raphson update direction b <- W * cf + d1 - mat2 <- rW * (Sigma %*% b) + mat2 <- rW * tf$matmul(Sigma, b) mat3 <- tf$matrix_triangular_solve(U, mat2) adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a @@ -256,15 +258,18 @@ laplace_approximation <- function(stepsize = 0.1, # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- Sigma %*% a_new + mu + z_new <- tf$matmul(Sigma, a_new) + mu obj <- psi(a_new, z_new, mu) + obj <- tf$squeeze(obj, 1:2) - list(z_new, a_new, U, obj_old, obj, tol, iter + 1, maxiter) + list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter) } cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { - tf$less(iter, maxiter) & tf$greater(obj_old - obj, tol) + none_converged <- tf$reduce_any(tf$greater(obj_old - obj, tol)) + in_time <- tf$less(iter, maxiter) + in_time & none_converged } values <- list(z, @@ -279,19 +284,17 @@ laplace_approximation <- function(stepsize = 0.1, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out$z - a <- out$a - U <- out$U - - # store the estimate in the stash as an R vector (this will break with nested laplace - # approximations!) 
- greta_stash$laplace_z_est <- z + z <- out[[1]] + a <- out[[2]] + U <- out[[3]] # the approximate marginal conditional posterior - lp <- tf_sum(tf_conditional_density_fun(z)) - mnll <- (a %*% (z - mu)) / fl(2) - lp + tf_sum(tf$log(tf$matrix_diag_part(U))) + lp <- d0(z) + p1 <- tf$matmul(tf_transpose(a), z - mu) / fl(2) + p2 <- tf_sum(tf$log(tf$matrix_diag_part(U))) + nmcp <- tf$squeeze(p1, 1:2) - lp + tf$squeeze(p2, 1) - -mnll + -nmcp } diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index b2c19b24..cb65a0ae 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,8 +8,8 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(stepsize = 0.1, tolerance = 1e-06, - max_iterations = 50, warm_start = TRUE) +laplace_approximation(stepsize = 0.05, tolerance = 1e-06, + max_iterations = 50) } \arguments{ \item{values}{an R vector giving values at which to evaluate the function for a @@ -22,10 +22,6 @@ mode of the random variable} \item{max_iterations}{the (positive) integer-like maximum number of iterations of the Newton-Raphson optimisation algorithm} -\item{warm_start}{whether to start the Newton-Raphson optimisation algorithm -at the optimal value from the last iteration of model inference (if -\code{TRUE}), or to randomly re-initialise it.} - \item{stepsize}{the (positive) numerical convergence tolerance (in units of the log conditional posterior density) of the Newton-Raphson optimisation algorithm} From 990e4970d7e54401c5c06ab573bcbceb0de68b56 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Thu, 10 Jan 2019 14:16:58 +1100 Subject: [PATCH 12/38] finish making laplace approximation run --- R/dag_class.R | 1 + R/marginalisers.R | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index ccf8f20a..5e01afa4 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -281,6 +281,7 @@ dag_class <- R6Class( MoreArgs = list(envir = tfe), SIMPLIFY = FALSE) + names(densities) <- NULL # assign the un-reduced densities, for use in marginalisation assign("component_densities", densities, diff --git a/R/marginalisers.R b/R/marginalisers.R index 1908066b..c0f1c964 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -216,15 +216,20 @@ laplace_approximation <- function(stepsize = 0.05, stepsize <- fl(stepsize) # match batches on everything going into the loop that will have a batch - # dimension - z <- expand_to_batch(z, mu) - a <- expand_to_batch(a, mu) - U <- expand_to_batch(U, mu) - obj_old <- expand_to_batch(obj_old, mu) - tol <- expand_to_batch(tol, mu) + # dimension later + objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- greta:::match_batches(objects) + mu <- objects[[1]] + Sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + U <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] obj <- -d0(z) + # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { @@ -284,9 +289,13 @@ laplace_approximation <- function(stepsize = 0.05, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out[[1]] - a <- out[[2]] - U <- out[[3]] + # get the two components, preventing the gradient from being computed through the optimisation + a <- tf$stop_gradient(out[[2]]) + U <- tf$stop_gradient(out[[3]]) + + # recompute z (gradient of z w.r.t. 
the free state should include this op, + # but not optimisation) + z <- tf$matmul(Sigma, a) + mu # the approximate marginal conditional posterior lp <- d0(z) From 1a8901cc78ff0576b3154e49c69c5f2a93fe6e73 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 14:43:29 +1100 Subject: [PATCH 13/38] add golden section search and get laplace approximation working --- R/marginalisers.R | 67 ++++++++++++++++-------------- R/utils.R | 84 +++++++++++++++++++++++++++++++++++++- tests/testthat/test_misc.R | 22 ++++++++++ 3 files changed, 141 insertions(+), 32 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c0f1c964..c63bb7dc 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -159,7 +159,9 @@ laplace_approximation <- function(stepsize = 0.05, # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities d0 <- function (z, reduce = TRUE) { - args <- c(list(z), other_args, list(reduce = reduce)) + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), other_args, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } @@ -174,17 +176,10 @@ laplace_approximation <- function(stepsize = 0.05, # negative log-posterior for the current value of z under MVN assumption psi <- function (a, z, mu) { - fl(0.5) * tf$matmul(tf_transpose(a), (z - mu)) - d0(z) + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } - # # compute psi for different values of the stepsize (s) - # # to be added later, if I can work out how to nest/recode this linesearch optimisation - # psiline <- function (s, adiff, a, K, mu, d0) { - # a <- a + s * as.vector(adiff) - # z <- K %*% a + mu - # psi(a, z, mu, d0) - # } - # tensors for parameters of MVN distribution (make mu column vector now) mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] @@ -197,12 +192,15 @@ laplace_approximation <- function(stepsize = 0.05, # randomly initialise z, and expand to batch (warm starts will need some TF # trickery) + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) z_value <- add_first_dim(as_2D_array(rnorm(n))) z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters tol <- tf$constant(tolerance, tf_float(), shape(1)) - obj_old <- tf$constant(-Inf, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) iter <- tf$constant(0L) maxiter <- tf$constant(max_iterations) @@ -213,11 +211,12 @@ laplace_approximation <- function(stepsize = 0.05, U <- tf$constant(U_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) - stepsize <- fl(stepsize) + + stepsize <- fl(add_first_dim(as_2D_array(stepsize))) # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- list(mu, Sigma, z, a, U, obj_old, tol, stepsize) objects <- greta:::match_batches(objects) mu <- objects[[1]] Sigma <- objects[[2]] @@ -226,14 +225,18 @@ laplace_approximation <- function(stepsize = 0.05, U <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] + stepsize <- objects[[8]] obj <- -d0(z) + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations body <- function(z, a, U, obj_old, 
obj, tol, iter, maxiter) {
 
       obj_old <- obj
 
       deriv <- derivs(z)
@@ -257,24 +260,32 @@ laplace_approximation <- function(stepsize = 0.05,
       mat3 <- tf$matrix_triangular_solve(U, mat2)
       adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a
 
-      # # find the optimal stepsize in that dimension (to be added later)
-      # res <- optimise(psiline, c(0, 2), adiff, a, Sigma, mu)
-      # stepsize <- res$minimum
+      # use golden section search to find the optimum distance to step in this
+      # direction, for each batch simultaneously
+      psiline <- function (s) {
+        a_new <- a + s * adiff
+        z_new <- tf$matmul(Sigma, a) + mu
+        psi(a_new, z_new, mu)
+      }
+
+      ls_results <- gss(psiline, batch_dim)
+      stepsize <- ls_results$minimum
+      stepsize <- tf$expand_dims(stepsize, 1L)
+      stepsize <- tf$expand_dims(stepsize, 2L)
 
       # do the update and compute new z and objective
       a_new <- a + stepsize * adiff
       z_new <- tf$matmul(Sigma, a_new) + mu
       obj <- psi(a_new, z_new, mu)
-      obj <- tf$squeeze(obj, 1:2)
 
       list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter)
 
     }
 
     cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) {
-      none_converged <- tf$reduce_any(tf$greater(obj_old - obj, tol))
+      not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj))
       in_time <- tf$less(iter, maxiter)
-      in_time & none_converged
+      in_time & not_all_converged
     }
 
     values <- list(z,
@@ -289,19 +300,13 @@ laplace_approximation <- function(stepsize = 0.05,
     # run the Newton-Raphson optimisation to find the posterior mode of z
     out <- tf$while_loop(cond, body, values)
 
-    # get the two components, preventing the gradient from being computed through the optimisation
-    a <- tf$stop_gradient(out[[2]])
-    U <- tf$stop_gradient(out[[3]])
-
-    # recompute z (gradient of z w.r.t. the free state should include this op,
-    # but not optimisation)
-    z <- tf$matmul(Sigma, a) + mu
+    z <- out[[1]]
+    a <- out[[2]]
+    U <- out[[3]]
 
     # the approximate marginal conditional posterior
-    lp <- d0(z)
-    p1 <- tf$matmul(tf_transpose(a), z - mu) / fl(2)
-    p2 <- tf_sum(tf$log(tf$matrix_diag_part(U)))
-    nmcp <- tf$squeeze(p1, 1:2) - lp + tf$squeeze(p2, 1)
+    logdet <- tf_sum(tf$log(tf$matrix_diag_part(U)))
+    nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1)
 
     -nmcp
 
diff --git a/R/utils.R b/R/utils.R
index dc50d44e..e047c30c 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -398,6 +398,87 @@ rhex <- function()
   paste(as.raw(sample.int(256L, 4, TRUE) - 1L), collapse = "")
 
+# vectorised golden section search algorithm for linesearch (1D constrained
+# minimisation). The function must accept a 1D tensor of values and return a
+# tensor of the same shape containing objectives; this can therefore
+# run linesearch for multiple chains simultaneously. The batch dimension must be
+# known in advance however.
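For intuition, before the TensorFlow version below, the same search in un-batched base R (a sketch assuming a unimodal objective on [lower, upper]; it mirrors the update rule used here, re-evaluating both interior points each iteration):

    gss_r <- function(f, lower = 0, upper = 1, tol = 1e-8) {
      phi <- (1 + sqrt(5)) / 2
      while (upper - lower > tol) {
        d <- (phi - 1) * (upper - lower)
        a <- lower + d   # right-hand interior point (phi - 1 > 0.5)
        b <- upper - d   # left-hand interior point
        if (f(b) < f(a)) upper <- a else lower <- b
      }
      (lower + upper) / 2
    }
    gss_r(function(x) (x - 0.3) ^ 2)  # ~0.3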
+gss <- function(func, + batch_dim = 1, + lower = 0, + upper = 1, + max_iterations = 50, + tolerance = 1e-32) { + + lower <- tf$constant(lower, tf_float(), shape(1)) + upper <- tf$constant(upper, tf_float(), shape(1)) + maxiter <- tf$constant(as.integer(max_iterations), tf$int32) + tol <- fl(tolerance) + iter <- tf$constant(1L, tf$int32) + + phi <- (1 + sqrt(5)) / 2 + phim1 <- fl(phi - 1) + twomphi <- fl(2.0 - phi) + + # expand out to batch dimensions + if (!inherits(batch_dim, "tensorflow.tensor")) { + batch_dim <- as.integer(batch_dim) + batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) + } + + left <- tf$tile(lower, batch_dim) + right <- tf$tile(upper, batch_dim) + width <- right - left + optimum <- tf$zeros(batch_dim, tf_float()) + + # status of the multiple consecutive searches; three columns, for left, right + # and optimum + status <- tf$stack(list(left, right, optimum), axis = 1L) + + # start loop + body <- function (left, right, optimum, width, iter) { + + d <- phim1 * width + + # find evaluation points + a <- left + d + b <- right - d + + # prep reordered matrices for whether steps are above or below + # if func(a) < func(b) + above <- tf$stack(list(left, b, a), axis = 1L) + # else + not_above <- tf$stack(list(a, right, b), axis = 1L) + + status <- tf$where(tf$less(func(b), func(a)), above, not_above) + + left <- status[, 0] + right <- status[, 1] + optimum <- status[, 2] + + width <- right - left + + list(left, right, optimum, width, iter + 1L) + } + + cond <- function (left, right, optimum, width, iter) { + err <- twomphi * tf$abs(width / optimum) + not_converged <- tf$less(tol, err) + not_all_converged <- tf$reduce_any(not_converged) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(left, right, optimum, width, iter) + + out <- tf$while_loop(cond, body, values) + + result <- out[3:5] + names(result) <- c("minimum", "width", "iterations") + result + +} + misc_module <- module(module, check_tf_version, member, @@ -422,7 +503,8 @@ misc_module <- module(module, match_batches, split_chains, hessian_dims, - rhex) + rhex, + gss) # check dimensions of arguments to ops, and return the maximum dimension check_dims <- function(..., target_dim = NULL) { diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 49f38073..ccbeebdd 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -285,3 +285,25 @@ test_that("module works", { expect_identical(names(mod$functions), c("exp", "log", "sum")) }) + +test_that("golden section search works", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + n_batch <- 4 + + # simple quadratic function with different optima for each batch + func <- function (x) { + (x - locs) ^ 2 + } + + # random optima + locs <- runif(n_batch, 0.2, 0.8) + + # run the minimisation + ss <- gss(func, n_batch) + result <- tf$Session()$run(ss) + compare_op(result$minimum, locs, tolerance = 0.2) + +}) From f08650745395d193dfc79911ea0f094763cfad9b Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 17:04:47 +1100 Subject: [PATCH 14/38] remove stepsize from laplace_approximation options (we optimise it now) --- R/marginalisers.R | 23 +++-------------------- man/marginalisers.Rd | 13 ++++--------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c63bb7dc..b8804185 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -99,11 +99,7 @@ discrete_marginalisation <- function(values) { #' @rdname marginalisers #' @export 
#' -#' @param stepsize the (positive) size of steps in the Newton-Raphson -#' optimisation algorithm used to find the approximate conditional posterior -#' mode of the random variable -#' -#' @param stepsize the (positive) numerical convergence tolerance (in units of +#' @param tolerance the (positive) numerical convergence tolerance (in units of #' the log conditional posterior density) of the Newton-Raphson optimisation #' algorithm #' @@ -117,25 +113,15 @@ discrete_marginalisation <- function(values) { #' marginalised, and each of element of the density must depend only on the #' corresponding element of the variable vector. This is the responsibility of #' the user, and is not checked. -laplace_approximation <- function(stepsize = 0.05, - tolerance = 1e-6, +laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50) { # in future: # - enable warm starts for subsequent steps of the outer inference algorithm - # do linesearch in the Newton-Raphson iterations # - enable a non-factorising version (have the user say whether it is # factorising) # - handle an iid normal distribution too. - # check arguments, Newton-Raphson iterations etc. - if (!(is.numeric(stepsize) && - is.vector(stepsize) && - length(stepsize) == 1 && - stepsize > 0)) { - stop ("'stepsize' must be a positive, scalar numeric value") - } - if (!(is.numeric(tolerance) && is.vector(tolerance) && length(tolerance) == 1 && @@ -212,11 +198,9 @@ laplace_approximation <- function(stepsize = 0.05, eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) - stepsize <- fl(add_first_dim(as_2D_array(stepsize))) - # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol, stepsize) + objects <- list(mu, Sigma, z, a, U, obj_old, tol) objects <- greta:::match_batches(objects) mu <- objects[[1]] Sigma <- objects[[2]] @@ -225,7 +209,6 @@ laplace_approximation <- function(stepsize = 0.05, U <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] - stepsize <- objects[[8]] obj <- -d0(z) diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index cb65a0ae..b5a8fdfc 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,23 +8,18 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(stepsize = 0.05, tolerance = 1e-06, - max_iterations = 50) +laplace_approximation(tolerance = 1e-06, max_iterations = 50) } \arguments{ \item{values}{an R vector giving values at which to evaluate the function for a discrete marginalisation} -\item{stepsize}{the (positive) size of steps in the Newton-Raphson -optimisation algorithm used to find the approximate conditional posterior -mode of the random variable} +\item{tolerance}{the (positive) numerical convergence tolerance (in units of +the log conditional posterior density) of the Newton-Raphson optimisation +algorithm} \item{max_iterations}{the (positive) integer-like maximum number of iterations of the Newton-Raphson optimisation algorithm} - -\item{stepsize}{the (positive) numerical convergence tolerance (in units of -the log conditional posterior density) of the Newton-Raphson optimisation -algorithm} } \value{ a \code{marginaliser} object that can be passed to From 0744608609930fd7c79b43a3608605b46524cda8 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 11 Jan 2019 17:05:18 +1100 Subject: [PATCH 15/38] bugfixes and tests to make sure laplace approximation works --- R/marginalisers.R | 24 ++++++++++- tests/testthat/test_marginalisation.R | 62 +++++++++++++++++++++++++++ 2 
files changed, 84 insertions(+), 2 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index b8804185..bd9a6f49 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -246,6 +246,8 @@ laplace_approximation <- function(tolerance = 1e-6, # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously psiline <- function (s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff z_new <- tf$matmul(Sigma, a) + mu psi(a_new, z_new, mu) @@ -283,9 +285,27 @@ laplace_approximation <- function(tolerance = 1e-6, # run the Newton-Raphson optimisation to find the posterior mode of z out <- tf$while_loop(cond, body, values) - z <- out[[1]] a <- out[[2]] - U <- out[[3]] + + # apparently we need to redefine z and U here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! + z <- tf$matmul(Sigma, a) + mu + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + W <- -d2 + rW <- sqrt(W) + + # decentred values of z + cf <- z - mu + + # approximate posterior covariance & cholesky factor + mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye + U <- tf$cholesky(mat1) # the approximate marginal conditional posterior logdet <- tf_sum(tf$log(tf$matrix_diag_part(U))) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 27ce7c23..594124fc 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -144,3 +144,65 @@ test_that("discrete marginalisation gives correct densities", { compare_op(expected, observed) }) + + +test_that("laplace_approximation errors nicely", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # bad tolerance + expect_error( + laplace_approximation(tolerance = -1), + "must be a positive, scalar numeric value" + ) + + # bad max iterations + expect_error( + laplace_approximation(max_iterations = 0), + "must be a positive, scalar integer value" + ) + + # mismatch with distribution + expect_error( + marginalise(I, normal(0, 1), laplace_approximation()), + "can only be used with a multivariate normal distribution" + ) + +}) + + +test_that("inference runs with laplace approximation", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # the eight schools model + N <- letters[1:8] + treatment_effects <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + int <- variable() + sd <- variable(lower = 0) + lik <- function (theta) { + distribution(treatment_effects) <- normal(t(theta), treatment_stddevs) + } + + # mock up as a multivariate normal distribution + Sigma <- diag(8) * sd ^ 2 + mu <- ones(1, 8) * int + marginalise(lik, + multivariate_normal(mu, Sigma), + laplace_approximation()) + + m <- model(int, sd) + + expect_ok(o <- opt(m)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, + verbose = FALSE)) + + # the optimisation result should be similar to a very long run of + +}) + + From d65ad2599bfce3b22b5af5e3cc3e2b9b8a3aff66 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 14 Feb 2020 17:04:21 +1100 Subject: [PATCH 16/38] fix lints --- R/marginalise.R | 20 ++--- R/marginalisers.R | 106 +++++++++++++------------- R/utils.R | 4 +- tests/testthat/test_marginalisation.R | 21 ++--- tests/testthat/test_misc.R | 2 +- 5 files changed, 79 insertions(+), 74 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 
f742dcb4..641af894 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -71,7 +71,7 @@ marginalise <- function(fun, variable, method, ...) {
 
   # check the inputs
   if (!is.function(fun)) {
-    stop ("'fun' must be an R function")
+    stop("'fun' must be an R function")
   }
 
   # check the additional arguments to fun are given in dots
@@ -79,16 +79,16 @@ marginalise <- function(fun, variable, method, ...) {
   dot_names <- names(dots)
   expected_names <- names(formals(fun)[-1])
   if (!all(expected_names %in% dot_names)) {
-    stop ("all arguments to 'fun' must be passed, named, to marginalise")
+    stop("all arguments to 'fun' must be passed, named, to marginalise")
   }
 
   if (!inherits(distr, "distribution_node")) {
-    stop ("'variable' must be a variable greta array with a distribution")
+    stop("'variable' must be a variable greta array with a distribution")
   }
 
   if (!inherits(method, "marginaliser")) {
-    stop ("'method' must be a valid marginalisation method. ",
-          "See ?marginalise for options")
+    stop("'method' must be a valid marginalisation method. ",
+         "See ?marginalise for options")
   }
 
   # check the distribution is compatible with the method
@@ -217,15 +217,15 @@ as_conditional_density <- function(r_fun, args) {
 
   # check there are distributions
   if (!any(sub_dag$node_types == "distribution")) {
-    stop ("'fun' must constain at least one distribution over data",
-          call. = FALSE)
+    stop("'fun' must contain at least one distribution over data",
+         call. = FALSE)
   }
 
   # check there are no variables
   if (any(sub_dag$node_types == "variable")) {
-    stop ("'fun' must not create any new variables, ",
-          "variables can be passed in as arguments",
-          call. = FALSE)
+    stop("'fun' must not create any new variables, ",
+         "variables can be passed in as arguments",
+         call. = FALSE)
   }
 
   # use the default graph, so that it can be overwritten when this is called
diff --git a/R/marginalisers.R b/R/marginalisers.R
index bd9a6f49..54d9fa15 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -22,8 +22,8 @@ NULL
 #' @rdname marginalisers
 #' @export
 #'
-#' @param values an R vector giving values at which to evaluate the function for a
-#'   discrete marginalisation
+#' @param values an R vector giving values at which to evaluate the function for
+#'   a discrete marginalisation
 #'
 #' @details \code{discrete_marginalisation} can only be used with discrete
 #'   probability distributions, e.g. those defined with \code{poisson()} and
@@ -43,7 +43,7 @@ discrete_marginalisation <- function(values) {
     msg <- paste0(msg, ", not a greta array")
   }
 
-  stop (msg)
+  stop(msg)
 }
 
 # define the marginalisation function
@@ -103,8 +103,8 @@ discrete_marginalisation <- function(values) {
 #'   the log conditional posterior density) of the Newton-Raphson optimisation
 #'   algorithm
 #'
-#' @param max_iterations the (positive) integer-like maximum number of iterations of
-#'   the Newton-Raphson optimisation algorithm
+#' @param max_iterations the (positive) integer-like maximum number of
+#'   iterations of the Newton-Raphson optimisation algorithm
 #'
 #' @details \code{laplace_approximation} can only be used to marginalise
 #'   variables following a multivariate normal distribution.
In addition, the @@ -126,14 +126,14 @@ laplace_approximation <- function(tolerance = 1e-6, is.vector(tolerance) && length(tolerance) == 1 && tolerance > 0)) { - stop ("'tolerance' must be a positive, scalar numeric value") + stop("'tolerance' must be a positive, scalar numeric value") } max_iterations <- as.integer(max_iterations) if (!(is.vector(max_iterations) && length(max_iterations) == 1 && max_iterations > 0)) { - stop ("'max_iterations' must be a positive, scalar integer value") + stop("'max_iterations' must be a positive, scalar integer value") } # define the marginalisation function @@ -143,17 +143,18 @@ laplace_approximation <- function(tolerance = 1e-6, dag, distribution_node) { - # simpler interface to conditional density. If reduce = TRUE, it does reduce_sum on component densities - d0 <- function (z, reduce = TRUE) { + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) args <- c(list(t_z), other_args, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } - # get the vectors of first and second derivatives of the conditional density function, - # w.r.t. the variable being marginalised - derivs <- function (z) { + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] d2 <- tf$gradients(d1, z)[[1]] @@ -161,7 +162,7 @@ laplace_approximation <- function(tolerance = 1e-6, } # negative log-posterior for the current value of z under MVN assumption - psi <- function (a, z, mu) { + psi <- function(a, z, mu) { p1 <- tf$matmul(tf_transpose(a), z - mu) fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } @@ -170,8 +171,8 @@ laplace_approximation <- function(tolerance = 1e-6, mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - Sigma_node <- distribution_node$parameters$Sigma - Sigma <- dag$tf_environment[[dag$tf_name(Sigma_node)]] + sigma_node <- distribution_node$parameters$Sigma + sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution n <- dim(mu)[[2]] @@ -193,20 +194,20 @@ laplace_approximation <- function(tolerance = 1e-6, # other objects a_value <- add_first_dim(as_2D_array(rep(0, n))) a <- tf$constant(a_value, dtype = tf_float()) - U_value <- add_first_dim(diag(n)) - U <- tf$constant(U_value, tf_float()) + u_value <- add_first_dim(diag(n)) + u <- tf$constant(u_value, tf_float()) eye <- tf$constant(add_first_dim(diag(n)), dtype = tf_float()) # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, Sigma, z, a, U, obj_old, tol) + objects <- list(mu, sigma, z, a, u, obj_old, tol) objects <- greta:::match_batches(objects) mu <- objects[[1]] - Sigma <- objects[[2]] + sigma <- objects[[2]] z <- objects[[3]] a <- objects[[4]] - U <- objects[[5]] + u <- objects[[5]] obj_old <- objects[[6]] tol <- objects[[7]] @@ -217,7 +218,7 @@ laplace_approximation <- function(tolerance = 1e-6, batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations - body <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { obj_old <- obj @@ -226,30 +227,30 @@ laplace_approximation <- function(tolerance = 1e-6, d2 <- deriv[[2]] # 
curvature of the likelihood - W <- -d2 - rW <- sqrt(W) + w <- -d2 + rw <- sqrt(w) # decentred values of z cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye - U <- tf$cholesky(mat1) - L <- tf_transpose(U) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + L <- tf_transpose(u) # compute Newton-Raphson update direction - b <- W * cf + d1 - mat2 <- rW * tf$matmul(Sigma, b) - mat3 <- tf$matrix_triangular_solve(U, mat2) - adiff <- b - rW * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously - psiline <- function (s) { + psiline <- function(s) { s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(Sigma, a) + mu + z_new <- tf$matmul(sigma, a) + mu psi(a_new, z_new, mu) } @@ -260,14 +261,14 @@ laplace_approximation <- function(tolerance = 1e-6, # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- tf$matmul(Sigma, a_new) + mu + z_new <- tf$matmul(sigma, a_new) + mu obj <- psi(a_new, z_new, mu) - list(z_new, a_new, U, obj_old, obj, tol, iter + 1L, maxiter) + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) } - cond <- function(z, a, U, obj_old, obj, tol, iter, maxiter) { + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) in_time <- tf$less(iter, maxiter) in_time & not_all_converged @@ -275,7 +276,7 @@ laplace_approximation <- function(tolerance = 1e-6, values <- list(z, a, - U, + u, obj_old, obj, tol, @@ -287,28 +288,28 @@ laplace_approximation <- function(tolerance = 1e-6, a <- out[[2]] - # apparently we need to redefine z and U here, or the backprop errors + # apparently we need to redefine z and u here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! - z <- tf$matmul(Sigma, a) + mu + z <- tf$matmul(sigma, a) + mu deriv <- derivs(z) d1 <- deriv[[1]] d2 <- deriv[[2]] # curvature of the likelihood - W <- -d2 - rW <- sqrt(W) + w <- -d2 + rw <- sqrt(w) # decentred values of z cf <- z - mu # approximate posterior covariance & cholesky factor - mat1 <- tf$matmul(rW, tf_transpose(rW)) * Sigma + eye - U <- tf$cholesky(mat1) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) # the approximate marginal conditional posterior - logdet <- tf_sum(tf$log(tf$matrix_diag_part(U))) + logdet <- tf_sum(tf$log(tf$matrix_diag_part(u))) nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp @@ -325,23 +326,26 @@ laplace_approximation <- function(tolerance = 1e-6, # check that the distribution is discrete discrete_check <- function(distrib) { if (!distrib$discrete) { - stop ("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. 
= FALSE) } } # check that the distribution is multiariate normal -multivariate_normal_check <- function (distrib) { +multivariate_normal_check <- function(distrib) { if (distrib$distribution_name != "multivariate_normal") { - stop ("the Laplace approximation can only be used ", - "with a multivariate normal distribution", - call. = FALSE) + stop("the Laplace approximation can only be used ", + "with a multivariate normal distribution", + call. = FALSE) } } # helper to contruct marginalisers -as_marginaliser <- function (name, tf_marginaliser, parameters, distribution_check) { +as_marginaliser <- function(name, + tf_marginaliser, + parameters, + distribution_check) { obj <- list(name = name, tf_marginaliser = tf_marginaliser, diff --git a/R/utils.R b/R/utils.R index d8194d63..908cc31b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -421,7 +421,7 @@ gss <- function(func, status <- tf$stack(list(left, right, optimum), axis = 1L) # start loop - body <- function (left, right, optimum, width, iter) { + body <- function(left, right, optimum, width, iter) { d <- phim1 * width @@ -446,7 +446,7 @@ gss <- function(func, list(left, right, optimum, width, iter + 1L) } - cond <- function (left, right, optimum, width, iter) { + cond <- function(left, right, optimum, width, iter) { err <- twomphi * tf$abs(width / optimum) not_converged <- tf$less(tol, err) not_all_converged <- tf$reduce_any(not_converged) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 594124fc..f1cdad3a 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -32,7 +32,9 @@ test_that("marginalise errors nicely", { # the following can only be assessed on final model definition # function adds variables - fun <- function (x) {x + normal(0, 1)} + fun <- function(x) { + x + normal(0, 1) + } lambda <- variable() marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) expect_error( @@ -41,7 +43,9 @@ test_that("marginalise errors nicely", { ) # function has no distribution - fun <- function (x) {x * 2} + fun <- function(x) { + x * 2 + } lambda <- variable() marginalise(fun, poisson(lambda), discrete_marginalisation(0:2)) expect_error( @@ -90,7 +94,7 @@ test_that("inference runs with discrete marginalisation", { p <- variable(lower = 0, upper = 1) alpha <- variable() beta <- variable() - likelihood <- function (z, alpha, beta) { + likelihood <- function(z, alpha, beta) { eta <- alpha + beta * x * z lambda <- exp(eta) distribution(y) <- poisson(lambda) @@ -116,7 +120,7 @@ test_that("discrete marginalisation gives correct densities", { # no constraints to no adjustment to account for lambda <- variable() y <- rnorm(100) - likelihood <- function (z) { + likelihood <- function(z) { distribution(y) <- normal(z, 1) } values <- 0:5 @@ -178,21 +182,20 @@ test_that("inference runs with laplace approximation", { source("helpers.R") # the eight schools model - N <- letters[1:8] treatment_effects <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) int <- variable() sd <- variable(lower = 0) - lik <- function (theta) { + lik <- function(theta) { distribution(treatment_effects) <- normal(t(theta), treatment_stddevs) } # mock up as a multivariate normal distribution - Sigma <- diag(8) * sd ^ 2 + sigma <- diag(8) * sd ^ 2 mu <- ones(1, 8) * int marginalise(lik, - multivariate_normal(mu, Sigma), + multivariate_normal(mu, sigma), laplace_approximation()) m <- model(int, sd) @@ -204,5 +207,3 @@ 
test_that("inference runs with laplace approximation", { # the optimisation result should be similar to a very long run of }) - - diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 259ba8f2..d4a7590b 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -294,7 +294,7 @@ test_that("golden section search works", { n_batch <- 4 # simple quadratic function with different optima for each batch - func <- function (x) { + func <- function(x) { (x - locs) ^ 2 } From f8c88dcdd176d3eae06bcfb80fa109ed16feb68a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 14 Feb 2020 17:13:20 +1100 Subject: [PATCH 17/38] fix lints --- R/marginalisers.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index 54d9fa15..5cb80479 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -56,7 +56,7 @@ discrete_marginalisation <- function(values) { # convert these into a list of constant tensors with the correct dimensions # and float types values_list <- as.list(values) - values_list <- lapply(values_list, as_2D_array) + values_list <- lapply(values_list, as_2d_array) values_list <- lapply(values_list, add_first_dim) values_list <- lapply(values_list, fl) @@ -182,7 +182,7 @@ laplace_approximation <- function(tolerance = 1e-6, # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z_value <- add_first_dim(as_2D_array(rnorm(n))) + z_value <- add_first_dim(as_2d_array(rnorm(n))) z <- tf$constant(z_value, dtype = tf_float()) # Newton-Raphson parameters @@ -192,7 +192,7 @@ laplace_approximation <- function(tolerance = 1e-6, maxiter <- tf$constant(max_iterations) # other objects - a_value <- add_first_dim(as_2D_array(rep(0, n))) + a_value <- add_first_dim(as_2d_array(rep(0, n))) a <- tf$constant(a_value, dtype = tf_float()) u_value <- add_first_dim(diag(n)) u <- tf$constant(u_value, tf_float()) @@ -236,13 +236,13 @@ laplace_approximation <- function(tolerance = 1e-6, # approximate posterior covariance & cholesky factor mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) - L <- tf_transpose(u) + l <- tf_transpose(u) # compute Newton-Raphson update direction b <- w * cf + d1 mat2 <- rw * tf$matmul(sigma, b) mat3 <- tf$matrix_triangular_solve(u, mat2) - adiff <- b - rw * tf$matrix_triangular_solve(L, mat3, lower = FALSE) - a + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a # use golden section search to find the optimum distance to step in this # direction, for each batch simultaneously From faa9cadfa4ea5f2a483144bdd0213f26c5e3e32d Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Tue, 18 Feb 2020 17:23:17 +1100 Subject: [PATCH 18/38] bring marginalisation branch up to date with greta --- R/dag_class.R | 3 +++ R/marginalise.R | 24 ++++++++++++++++-------- R/marginalisers.R | 14 +++++++------- tests/testthat/test_marginalisation.R | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index bf3f4c6e..3bce0502 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -530,6 +530,9 @@ dag_class <- R6Class( for (name in data_names) tfe[[name]] <- tfe_old[[name]] + # copy the batch size over + tfe$batch_size <- tfe_old$batch_size + # put the free state in the environment, and build out the tf graph tfe$free_state <- free_state self$define_tf_body() diff --git a/R/marginalise.R b/R/marginalise.R index 641af894..3d80f1ca 100644 --- 
a/R/marginalise.R +++ b/R/marginalise.R @@ -158,26 +158,31 @@ marginalisation_distribution <- R6Class( tf_distrib = function(parameters, dag) { + self$target + + # build the tfp distribution for the marginalisation distribution + parameter_nodes <- self$target$parameters + distrib_constructor <- dag$get_tf_object(self$target) + tf_parameter_list <- lapply(parameter_nodes, dag$get_tf_object) + tfp_distribution <- distrib_constructor(tf_parameter_list, dag = dag) + # the marginal density implied by the function log_prob <- function(x) { - # x will be the target; a tf function for the distribution being + # x will be the target; a tf constructor function for the distribution being # marginalised. Pass in the distribution node and the dag too, in case # the marginalisers need them. self$tf_marginaliser(self$conditional_density_fun, - tf_distribution_log_pdf = x, + tfp_distribution = tfp_distribution, other_args = parameters, dag = dag, distribution_node = self$target) } - list(log_prob = log_prob, cdf = NULL, log_cdf = NULL) - - }, + list(log_prob = log_prob) - tf_cdf_function = NULL, - tf_log_cdf_function = NULL + } ) ) @@ -197,7 +202,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(..., reduce = TRUE) { + function(..., reduce = TRUE, dag = NULL) { tensor_inputs <- list(...) @@ -233,6 +238,9 @@ as_conditional_density <- function(r_fun, args) { sub_dag$tf_graph <- tf$get_default_graph() sub_tfe <- sub_dag$tf_environment + # pass on the batch size, used when defining data + sub_tfe$batch_size <- dag$tf_environment$batch_size + # set the input tensors as the values for the dummy greta arrays in the new # tf_environment node_dummies <- lapply(ga_dummies, get_node) diff --git a/R/marginalisers.R b/R/marginalisers.R index 5cb80479..b2ea8538 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -48,7 +48,7 @@ discrete_marginalisation <- function(values) { # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, + tfp_distribution, other_args, dag, distribution_node) { @@ -62,7 +62,7 @@ discrete_marginalisation <- function(values) { # 1. get weights from the distribution log pdf # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tf_distribution_log_pdf) + weights_list <- lapply(values_list, tfp_distribution$log_prob) weights_list <- lapply(weights_list, tf_sum) # convert to a vector of discrete probabilities and make them sum to 1 @@ -72,10 +72,10 @@ discrete_marginalisation <- function(values) { log_weights_vec <- tf$log(weights_vec) # 2. 
compute the conditional joint density for each value (passing in - # other_args) + # other_args and the outer dag) log_density_list <- list() for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args) + args <- c(list(values_list[[i]]), other_args, dag = dag) log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) } @@ -138,7 +138,7 @@ laplace_approximation <- function(tolerance = 1e-6, # define the marginalisation function tf_marginaliser <- function(tf_conditional_density_fun, - tf_distribution_log_pdf, + tfp_distribution, other_args, dag, distribution_node) { @@ -148,7 +148,7 @@ laplace_approximation <- function(tolerance = 1e-6, d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) - args <- c(list(t_z), other_args, list(reduce = reduce)) + args <- c(list(t_z), other_args, list(reduce = reduce, dag = dag)) do.call(tf_conditional_density_fun, args) } @@ -171,7 +171,7 @@ laplace_approximation <- function(tolerance = 1e-6, mu_node <- distribution_node$parameters$mean mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - sigma_node <- distribution_node$parameters$Sigma + sigma_node <- distribution_node$parameters$sigma sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index f1cdad3a..c8e5deb6 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -107,7 +107,7 @@ test_that("inference runs with discrete marginalisation", { m <- model(alpha, beta, p) expect_ok(o <- opt(m)) - expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20)) + expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE)) }) From ff341719f1b9ed9733892719c7a9152c15f2bf33 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 08:25:40 +1100 Subject: [PATCH 19/38] more stable computation of discrete marginalisation weights --- R/marginalisers.R | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index b2ea8538..afdaf174 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -62,14 +62,20 @@ discrete_marginalisation <- function(values) { # 1. get weights from the distribution log pdf # assuming values is a list, get tensors for the weights - weights_list <- lapply(values_list, tfp_distribution$log_prob) - weights_list <- lapply(weights_list, tf_sum) + log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) - # convert to a vector of discrete probabilities and make them sum to 1 - weights_vec <- tf$concat(weights_list, axis = 1L) - weights_vec <- tf$exp(weights_vec) - weights_vec <- weights_vec / tf_sum(weights_vec) - log_weights_vec <- tf$log(weights_vec) + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + log_weights <- log_weights - log_weights_sum # 2. compute the conditional joint density for each value (passing in # other_args and the outer dag) @@ -84,7 +90,7 @@ discrete_marginalisation <- function(values) { log_density_vec <- tf$concat(log_density_list, axis = 1L) # 3. 
compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights_vec + log_density_weighted_vec <- log_density_vec + log_weights tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) } From 2c6ff3996b422a635059f3f3acd3a1e8d6e9e60f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:34:10 +1100 Subject: [PATCH 20/38] add posterior conjugate normal test; run short posterior checks every time --- tests/testthat/helpers.R | 20 +++++----- tests/testthat/test_posteriors.R | 65 ++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index 0e5a47ed..e593b532 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) { new_samples <- function(draws, target_samples = 5000) { neff <- min(coda::effectiveSize(draws)) efficiency <- neff / coda::niter(draws) - 1.2 * (target_samples - neff) / efficiency + 1.5 * (target_samples - neff) / efficiency } not_timed_out <- function(start_time, time_limit = 300) { @@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) { } get_enough_draws <- function(model, - sampler = sampler, n_effective = 5000, + sampler = hmc(), time_limit = 300, - verbose = TRUE, + chains = 50, + verbose = FALSE, one_by_one = FALSE) { start_time <- Sys.time() draws <- mcmc(model, sampler = sampler, + chains = chains, verbose = verbose, one_by_one = one_by_one) @@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE) # get MCMC samples for statistics of the samples (value, variance and @@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { # get absolute errors between posterior means and true values, and scale them # by time-series Monte Carlo standard errors (the expected amount of # uncertainty in the MCMC estimate), to give the number of standard errors - # away from truth. There's a 1/100 chance of any one of these scaled errors - # being greater than qnorm(0.99) if the sampler is correct + # away from truth. 
There's a 1/200 chance of any one of these scaled errors + # being greater than qnorm(1 - 0.005) if the sampler is correct errors <- scaled_error(stat_draws, stat_truth) - expect_lte(max(errors), qnorm(0.99)) + expect_lte(max(errors), qnorm(1 - 0.005)) } @@ -958,8 +960,8 @@ check_samples <- function(x, m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE, one_by_one = one_by_one) @@ -979,6 +981,6 @@ check_samples <- function(x, # do a formal hypothesis test suppressWarnings(stat <- ks.test(mcmc_samples, iid_samples)) - testthat::expect_gte(stat$p.value, 0.01) + testthat::expect_gte(stat$p.value, 0.005) } diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R index 68bae814..71e7bbdc 100644 --- a/tests/testthat/test_posteriors.R +++ b/tests/testthat/test_posteriors.R @@ -4,7 +4,6 @@ test_that("posterior is correct (binomial)", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() # analytic solution to the posterior of the parameter of a binomial # distribution, with uniform prior @@ -15,7 +14,7 @@ distribution(pos) <- binomial(n, theta) m <- model(theta) - draws <- get_enough_draws(m, hmc(), 2000, verbose = FALSE) + draws <- get_enough_draws(m, n_effective = 5000) samples <- as.matrix(draws) @@ -33,7 +32,61 @@ n_draws <- round(coda::effectiveSize(draws)) comparison <- rbeta(n_draws, shape1, shape2) suppressWarnings(test <- ks.test(samples, comparison)) - expect_gte(test$p.value, 0.01) + expect_gte(test$p.value, 0.05) + +}) + +test_that("posterior is correct (normal)", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta_i ~ N(mu, sd ^ 2) + + # eight schools data + y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # Bayes theorem gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # which is equivalent to: + # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) + # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) + # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i + # conjugate prior, see Wikipedia conjugate prior table + + obs_prec <- 1 / (obs_sd ^ 2) + prec <- 1 / (sd ^ 2) + theta_var <- 1 / (obs_prec + prec) + theta_mu <- (y * obs_prec + mu * prec) * theta_var + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + + # mcmc solution: + theta <- normal(mu, sd, dim = 8) + distribution(y) <- normal(theta, obs_sd) + m <- model(theta) + + # draw at least 5000 effective samples of each parameter from the posterior + draws <- get_enough_draws(m, n_effective = 5000) + + mcmc_stats <- summary(draws)$statistics + mcmc <- mcmc_stats[, 1:2] + error <- abs(mcmc - analytic) + mcmc_se <- mcmc_stats[, 4] + + # ratio of error to that expected due to Monte Carlo noise + error_multiples <- error / cbind(mcmc_se, mcmc_se) + + within_error_margin <- error_multiples < qnorm(1 - 0.005) + expect_true(all(within_error_margin)) }) @@ -41,7 +94,6 @@ test_that("samplers are unbiased for bivariate normals", { skip_if_not(check_tf_version())
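# editor's aside on the binomial test above: with a uniform (i.e. Beta(1, 1))
# prior on theta and pos successes from n trials, conjugacy gives the
# posterior theta | pos ~ Beta(1 + pos, 1 + n - pos), which is presumably how
# the elided shape1 and shape2 are defined. A plain-R sketch with
# hypothetical counts:
#   pos <- 7; n <- 10
#   shape1 <- 1 + pos
#   shape2 <- 1 + n - pos
#   qbeta(c(0.025, 0.975), shape1, shape2)  # central 95% posterior interval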
source("helpers.R") - skip_if_not_release() check_mvn_samples(hmc()) check_mvn_samples(rwmh()) @@ -53,7 +105,6 @@ test_that("samplers are unbiased for chi-squared", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() df <- 5 x <- chi_squared(df) @@ -67,7 +118,6 @@ test_that("samplers are unbiased for standard uniform", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() x <- uniform(0, 1) iid <- runif @@ -80,13 +130,12 @@ test_that("samplers are unbiased for LKJ", { skip_if_not(check_tf_version()) source("helpers.R") - skip_if_not_release() x <- lkj_correlation(3, 2)[1, 2] iid <- function(n) rlkjcorr(n, 3, 2)[, 1, 2] - check_samples(x, iid, hmc(), one_by_one = TRUE) + check_samples(x, iid) }) From ddf9e9ecdffe0a2c2f9110a8a2b147b9476b4a61 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:34:39 +1100 Subject: [PATCH 21/38] add failing test for laplace approximation --- tests/testthat/test_marginalisation.R | 61 +++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index c8e5deb6..379038e9 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -55,7 +55,6 @@ test_that("marginalise errors nicely", { }) - test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) @@ -175,7 +174,6 @@ test_that("laplace_approximation errors nicely", { }) - test_that("inference runs with laplace approximation", { skip_if_not(check_tf_version()) @@ -204,6 +202,63 @@ test_that("inference runs with laplace approximation", { expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE)) - # the optimisation result should be similar to a very long run of +}) + +test_that("laplace approximation converges on correct posterior", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta_i ~ N(mu, sd ^ 2) + # the posterior for theta is normal, so laplace should be exact + + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # analytic solution: + + # Marginalising int we can write: + # Bayes theorum gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # which is equivalent to: + # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) + # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) + # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i + # conjugate prior, see Wikipedia conjugate prior table + + obs_prec <- 1 / (obs_sd ^ 2) + prec <- 1 / (sd ^ 2) + theta_var <- 1 / (obs_prec + prec) + theta_mu <- (y * obs_prec + mu * prec) * theta_var + theta_sd <- sqrt(theta_var) + + # Laplace solution: + lik <- function(theta) { + distribution(y) <- normal(t(theta), obs_sd) + } + + # mock up as a multivariate normal distribution + mean <- ones(1, 8) * mu + sigma <- diag(8) * sd ^ 2 + out <- marginalise(lik, + multivariate_normal(mean, sigma), + laplace_approximation()) + theta_mu_est <- calculate(out$mean) + theta_var_est <- calculate(diag(out$Sigma)) + + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + laplace <- cbind(mean = 
theta_mu_est, sd = sqrt(theta_var_est)) + + # compare these to within a tolerance + compare_op(analytic, laplace) }) From 18049ea4120d56d07de5bb3a2f8227088d0f2c47 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 10:47:24 +1100 Subject: [PATCH 22/38] prepare for non-diagonal hessians --- R/marginalisers.R | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index afdaf174..5bb9a1ee 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -112,15 +112,23 @@ discrete_marginalisation <- function(values) { #' @param max_iterations the (positive) integer-like maximum number of #' iterations of the Newton-Raphson optimisation algorithm #' +#' @param diagonal_hessian whether the Hessian matrix should be assumed to be +#' diagonal, to speed up computations. See Details. +#' +#' @details \code{laplace_approximation} can only be used to marginalise -#' variables following a multivariate normal distribution. In addition, the -#' function to be marginalised must \emph{factorise}; ie. it must return a -#' vector-valued density with as many elements as the vector variable being -#' marginalised, and each of element of the density must depend only on the -#' corresponding element of the variable vector. This is the responsibility of -#' the user, and is not checked. +#' variables following a multivariate normal distribution. +#' +#' The argument \code{diagonal_hessian} can be used to state that the +#' conditional density factorises along the elements of the variable being +#' marginalised, and therefore the Hessian matrix of this function can be +#' assumed to be diagonal. A conditional density function factorises if each +#' observation in the conditional density depends only on the corresponding +#' element of the variable being marginalised. If this is not the case and you +#' set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. +#' laplace_approximation <- function(tolerance = 1e-6, - max_iterations = 50) { + max_iterations = 50, + diagonal_hessian = FALSE) { # in future: # - enable warm starts for subsequent steps of the outer inference algorithm @@ -161,10 +169,19 @@ laplace_approximation <- function(tolerance = 1e-6, # get the vectors of first and second derivatives of the conditional density # function, w.r.t.
the variable being marginalised derivs <- function(z) { + y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$gradients(d1, z)[[1]] + + if (diagonal_hessian) { + d2 <- tf$map_fn(tf$gradients, list(d1, z))[[1]] + } else { + stop ("inference with non-diagonal hessians not yet implemented") + d2 <- tf$hessians(y, z)[[1]] + } + list(d1, d2) + } # negative log-posterior for the current value of z under MVN assumption From e6ae1cbd3fa1f69549bcc71c14e98928c2cdeb1e Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 14:56:08 +1100 Subject: [PATCH 23/38] define distributions as their tfp distribution objects --- R/dag_class.R | 29 ++--------------------------- R/joint.R | 10 +++++----- R/mixture.R | 13 ++++++------- R/node_types.R | 8 ++++++-- 4 files changed, 19 insertions(+), 41 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index 3bce0502..936400a1 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -439,18 +439,8 @@ dag_class <- R6Class( # target tensor evaluate_density = function(distribution_node, target_node) { - tfe <- self$tf_environment - - parameter_nodes <- distribution_node$parameters - - # get the tensorflow objects for these - distrib_constructor <- self$get_tf_object(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) tf_target <- self$get_tf_object(target_node) - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) self$tf_evaluate_density(tfp_distribution, tf_target, @@ -788,25 +778,10 @@ dag_class <- R6Class( }, - # get the tfp distribution object for a distribution node - get_tfp_distribution = function(distrib_node) { - - # build the tfp distribution object for the distribution, and use it - # to get the tensor for the sample - distrib_constructor <- self$get_tf_object(distrib_node) - parameter_nodes <- distrib_node$parameters - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) - - }, - # try to draw a random sample from a distribution node draw_sample = function(distribution_node) { - tfp_distribution <- self$get_tfp_distribution(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) sample <- tfp_distribution$sample diff --git a/R/joint.R b/R/joint.R index c61605af..045258a7 100644 --- a/R/joint.R +++ b/R/joint.R @@ -124,15 +124,15 @@ joint_distribution <- R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions + tfp_distributions <- parameters + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL log_prob <- function(x) { diff --git a/R/mixture.R b/R/mixture.R index 5c96de8f..23f12587 100644 --- a/R/mixture.R +++ b/R/mixture.R @@ -210,17 +210,16 @@ mixture_distribution <- 
R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions and mixture weights + weights <- parameters$weights + tfp_distributions <- parameters[names(parameters) != "weights"] + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters[names(self$parameters) != "weights"] truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL - - weights <- parameters$weights # use log weights if available if (self$weights_is_log) { diff --git a/R/node_types.R b/R/node_types.R index 281b18e8..feff453d 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -543,9 +543,13 @@ distribution_node <- R6Class( tf = function(dag) { - # assign the distribution object constructor function to the environment + # build a tfp distribution object + tf_parameter_list <- lapply(self$parameters, dag$get_tf_object) + tfp_distribution <- self$tf_distrib(tf_parameter_list, dag = dag) + + # assign it to the environment assign(dag$tf_name(self), - self$tf_distrib, + tfp_distribution, envir = dag$tf_environment) }, From e0f7e238a8ed4a505ecb8784a02f9ca3ebf755bb Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 14:56:34 +1100 Subject: [PATCH 24/38] tidy up tiling to batch size --- R/node_types.R | 3 +-- R/utils.R | 11 +++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/node_types.R b/R/node_types.R index feff453d..519ce3b2 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -56,8 +56,7 @@ data_node <- R6Class( } # expand up to batch size - tiling <- c(tfe$batch_size, rep(1L, ndim)) - batched_tensor <- tf$tile(unbatched_tensor, tiling) + batched_tensor <- tile_to_batch(unbatched_tensor, tfe$batch_size) # put unbatched tensor in environment so it can be set assign(unbatched_name, unbatched_tensor, envir = tfe) diff --git a/R/utils.R b/R/utils.R index 8f7e1518..5c4847da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -299,13 +299,19 @@ drop_column_dim <- function(x) { x } +# given a tensor (with batch dimension of size 1) and a batch size tensor, tile +# the tensor to match the batch size +tile_to_batch <- function(x, batch_size) { + ndim <- length(dim(x)) + tf$tile(x, c(batch_size, rep(1L, ndim - 1))) +} + # where x is a tensor with no batch dimension, and y is a tensor with a batch # dimension, tile x to have first dimension matching y (dimension determined at # run time) expand_to_batch <- function(x, y) { batch_size <- tf$shape(y)[[0]] - ndim <- length(dim(x)) - tf$tile(x, c(batch_size, rep(1L, ndim - 1))) + tile_to_batch(x, batch_size) } # does this tensor have a batch dimension (of unknown size) as its first @@ -497,6 +503,7 @@ misc_module <- module(module, drop_first_dim, tile_first_dim, drop_column_dim, + tile_to_batch, expand_to_batch, has_batch, match_batches, From 3138eff9aec8a8d140f976f88fd08e3dc2ec6dcb Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:48:17 +1100 Subject: [PATCH 25/38] stop double-definition of dags --- R/inference.R | 1 - R/inference_class.R | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/inference.R b/R/inference.R index 
2402c2e4..ddaca382 100644 --- a/R/inference.R +++ b/R/inference.R @@ -291,7 +291,6 @@ run_samplers <- function(samplers, thin <- as.integer(thin) dag <- samplers[[1]]$model$dag - chains <- samplers[[1]]$n_chains n_cores <- check_n_cores(n_cores, length(samplers), plan_is) float_type <- dag$tf_float diff --git a/R/inference_class.R b/R/inference_class.R index a019b398..d49ff011 100644 --- a/R/inference_class.R +++ b/R/inference_class.R @@ -46,8 +46,6 @@ inference <- R6Class( free_parameters <- model$dag$example_parameters(free = TRUE) free_parameters <- unlist_tf(free_parameters) self$n_free <- length(free_parameters) - self$set_initial_values(initial_values) - self$n_traced <- length(model$dag$trace_values(self$free_state)) self$seed <- seed }, @@ -277,8 +275,6 @@ sampler <- R6Class( parameters = parameters, seed = seed) - self$n_chains <- nrow(self$free_state) - # duplicate diag_sd if needed n_diag <- length(self$parameters$diag_sd) n_parameters <- self$n_free @@ -289,6 +285,8 @@ sampler <- R6Class( # define the draws tensor on the tf graph self$define_tf_draws() + self$set_initial_values(initial_values) + self$n_chains <- nrow(self$free_state) }, @@ -1025,6 +1023,8 @@ optimiser <- R6Class( self$create_optimiser_objective() self$create_tf_minimiser() + self$set_initial_values(initial_values) + self$n_traced <- length(model$dag$trace_values(self$free_state)) }, From 8346c8ebd295d173eb10ebf00ddb59f6d098a358 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:49:12 +1100 Subject: [PATCH 26/38] enable batch size to be found without access to dag --- R/dag_class.R | 13 +++++++++---- R/utils.R | 10 ++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index 936400a1..e1604ac6 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,11 +54,16 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type info to options, so it can be accessed by - # nodes on definition, without cluncky explicit passing + # temporarily pass float type and batch size info to options, so it can be accessed by + # nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float + old_batch_size <- options()$greta_batch_size + - on.exit(options(greta_tf_float = old_float_type)) - options(greta_tf_float = self$tf_float) + on.exit(options(greta_tf_float = old_float_type, + greta_batch_size = old_batch_size)) + + options(greta_tf_float = self$tf_float, + greta_batch_size = self$tf_environment$batch_size) with(self$tf_graph$as_default(), expr) }, diff --git a/R/utils.R b/R/utils.R index 5c4847da..8701c648 100644 --- a/R/utils.R +++ b/R/utils.R @@ -164,6 +164,10 @@ check_tf_version <- function(alert = c("none", member <- function(x, method) eval(parse(text = paste0("x$", method))) +get_element <- function(x, element) { + x[[element]] +} + node_type <- function(node) { classes <- class(node) type <- grep("*_node", classes, value = TRUE) @@ -181,6 +185,12 @@ fl <- function(x) { tf$constant(x, dtype = tf_float()) } +# get the tensor for the batch size in the dag currently being defined (since +# it's not always possible to pass the dag in) +get_batch_size <- function() { + options()$greta_batch_size +} + # coerce an integer(ish) vector to a list as expected in tensorflow shape # arguments #' @noRd From 6a02b8daf8dc462a970902e9bc8bc12d353c404 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 17:49:41 +1100 Subject: [PATCH 27/38] let marginalisation return the marginalisation parameters
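marginalise() now invisibly returns a named list of greta arrays describing
the fitted marginalisation, built by the method's return_list function. A
rough sketch of the intended usage, mirroring the test changes below (y,
obs_sd, mu and sigma are placeholder greta arrays; `mean` and `sigma` are the
names defined by the laplace return_list, while the discrete method returns
`probabilities` instead):

    lik <- function(theta) {
      distribution(y) <- normal(t(theta), obs_sd)
    }
    out <- marginalise(lik,
                       multivariate_normal(mu, sigma),
                       laplace_approximation())
    out$mean   # greta array for the approximate conditional posterior mean
    out$sigma  # and its covariance matrix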
--- R/marginalise.R | 82 ++++---- R/marginalisers.R | 263 ++++++++++++++++++++------ tests/testthat/test_marginalisation.R | 11 +- 3 files changed, 252 insertions(+), 104 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 3d80f1ca..1120a10b 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -67,7 +67,7 @@ NULL marginalise <- function(fun, variable, method, ...) { # get the distribution for the random variable - distr <- get_node(variable)$distribution + distribution_node <- get_node(variable)$distribution # check the inputs if (!is.function(fun)) { @@ -82,7 +82,7 @@ marginalise <- function(fun, variable, method, ...) { stop("all arguments to 'fun' must be passed, named, to marginalise") } - if (!inherits(distr, "distribution_node")) { + if (!inherits(distribution_node, "distribution_node")) { stop("'variable' must be a variable greta array with a distribution") } @@ -92,36 +92,42 @@ marginalise <- function(fun, variable, method, ...) { } # check the distribution is compatible with the method - method$distribution_check(distr) + method$distribution_check(distribution_node) # excise the variable from the distribution - distr$remove_target() + distribution_node$remove_target() # turn the greta function into a TF conditional density function; doing # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density (unadjusted since there will be # no new variables) - args <- c(list(variable), dots) - conditional_joint_density <- as_conditional_density(fun, args) + conditional_joint_density <- as_conditional_density(fun, c(list(variable), dots)) - # get a tf function from the method which will turn that conditional density - # function into a marginal density (conditional on the inputs to the - # distribution) to be added to the overall model density + # get a list of greta arrays for the marginalisation parameters: + parameters <- method$compute_parameters( + conditional_density_fun = conditional_joint_density, + distribution_node = distribution_node, + dots = dots, + marginaliser = method + ) - # pass the conditional density function and the marginaliser to a - # marginalisation distribution + # add on any pre-computed parameters + parameters <- c(parameters, method$other_parameters) # create the distribution - vble <- distrib( + distribution <- distrib( distribution = "marginalisation", marginaliser = method, + parameters = parameters, conditional_density_fun = conditional_joint_density, - target_distribution = distr, + distribution_node = distribution_node, dots = dots ) - # return nothing (users can't have distribution nodes on their own) - invisible(NULL) + # return a list of the greta arrays computed during the marginalisation, and any other + # things from the method + output <- method$return_list(parameters) + invisible(output) } @@ -130,19 +136,28 @@ marginalisation_distribution <- R6Class( inherit = distribution_node, public = list( - tf_marginaliser = NULL, + marginaliser = NULL, + tf_marginalisation_density = NULL, conditional_density_fun = NULL, initialize = function(marginaliser, + parameters = parameters, conditional_density_fun, - target_distribution, + distribution_node, dots) { # initialize class, and add methods super$initialize("marginalisation") - self$tf_marginaliser <- marginaliser$tf_marginaliser + self$marginaliser <- marginaliser self$conditional_density_fun <- conditional_density_fun + # add the parameters + parameter_names <- names(parameters) + for (i in seq_along(parameters)) { + 
self$add_parameter(parameters[[i]], + parameter_names[i]) + } + # add the dots (extra inputs to conditional_density_fun) as parameters dot_nodes <- lapply(dots, get_node) for (i in seq_along(dot_nodes)) { @@ -152,34 +167,29 @@ marginalisation_distribution <- R6Class( # make the distribution the target self$remove_target() - self$add_target(target_distribution) + self$add_target(distribution_node) }, tf_distrib = function(parameters, dag) { - self$target - - # build the tfp distribution for the marginalisation distribution - parameter_nodes <- self$target$parameters - distrib_constructor <- dag$get_tf_object(self$target) - tf_parameter_list <- lapply(parameter_nodes, dag$get_tf_object) - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = dag) + # unpack the parameters here + are_dots <- grepl("^input ", names(parameters)) + dots <- parameters[are_dots] + parameters <- parameters[!are_dots] # the marginal density implied by the function log_prob <- function(x) { - # x will be the target; a tf constructor function for the distribution being - # marginalised. Pass in the distribution node and the dag too, in case - # the marginalisers need them. - self$tf_marginaliser(self$conditional_density_fun, - tfp_distribution = tfp_distribution, - other_args = parameters, - dag = dag, - distribution_node = self$target) + marginal_density <- self$marginaliser$tf_marginalisation_density + marginal_density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) } + list(log_prob = log_prob) } @@ -202,7 +212,7 @@ as_conditional_density <- function(r_fun, args) { # the function. # this function will take in tensors corresponding to the things in args - function(..., reduce = TRUE, dag = NULL) { + function(..., reduce = TRUE) { tensor_inputs <- list(...) @@ -239,7 +249,7 @@ as_conditional_density <- function(r_fun, args) { sub_tfe <- sub_dag$tf_environment # pass on the batch size, used when defining data - sub_tfe$batch_size <- dag$tf_environment$batch_size + sub_tfe$batch_size <- get_batch_size() # set the input tensors as the values for the dummy greta arrays in the new # tf_environment diff --git a/R/marginalisers.R b/R/marginalisers.R index 5bb9a1ee..eaf25df3 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -46,22 +46,16 @@ discrete_marginalisation <- function(values) { stop(msg) } - # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tfp_distribution, - other_args, - dag, - distribution_node) { - - # convert these into a list of constant tensors with the correct dimensions - # and float types - values_list <- as.list(values) - values_list <- lapply(values_list, as_2d_array) - values_list <- lapply(values_list, add_first_dim) - values_list <- lapply(values_list, fl) - - # 1. get weights from the distribution log pdf - # assuming values is a list, get tensors for the weights + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) 
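# editor's note on the normalisation below: the weights are normalised on the
# log scale with a log-sum-exp, the stabilised form of log(sum(exp(x))). A
# plain-R sketch of the same identity (lw is a hypothetical vector of
# unnormalised log weights, not part of this patch):
#   lw <- c(-1.2, -0.5, -3.1)
#   m <- max(lw)
#   lw_norm <- lw - (m + log(sum(exp(lw - m))))
#   sum(exp(lw_norm))  # 1: the implied probabilities now sum to one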
log_weights_list <- lapply(values_list, tfp_distribution$log_prob) log_weights_list <- lapply(log_weights_list, tf_sum) @@ -75,30 +69,69 @@ discrete_marginalisation <- function(values) { axis = 1L, keepdims = TRUE ) - log_weights <- log_weights - log_weights_sum - # 2. compute the conditional joint density for each value (passing in - # other_args and the outer dag) + log_weights - log_weights_sum + + } + + # return a named list of operation greta arrays for the marginalisation parameters + compute_parameters <- function(conditional_density_fun, + distribution_node, + dots, + marginaliser) { + + # get a list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + distribution_node, + values_list, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(values_list), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + } + + # compute the marginal joint density + tf_marginalisation_density <- function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) log_density_list <- list() for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), other_args, dag = dag) + args <- c(list(values_list[[i]]), dots) log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) } + # expand and combine the densities log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) log_density_vec <- tf$concat(log_density_list, axis = 1L) - # 3. compute a weighted sum + # compute a weighted sum log_density_weighted_vec <- log_density_vec + log_weights - tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + } + return_list_function <- function(parameters) { + list(probabilities = exp(parameters$log_weights)) } as_marginaliser(name = "discrete", - tf_marginaliser = tf_marginaliser, - parameters = list(values = values), - distribution_check = discrete_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + return_list = return_list_function, + distribution_check = discrete_check, + other_parameters = values_list) } @@ -150,38 +183,45 @@ laplace_approximation <- function(tolerance = 1e-6, stop("'max_iterations' must be a positive, scalar integer value") } + # function to compute the parameter (log probabilities) of the marginalisation # define the marginalisation function - tf_marginaliser <- function(tf_conditional_density_fun, - tfp_distribution, - other_args, - dag, - distribution_node) { + tf_compute_laplace_parameters <- function(mu, + sigma, + ..., + tf_conditional_density_fun, + diagonal_hessian) { + + dots <- list(...) # simpler interface to conditional density. If reduce = TRUE, it does # reduce_sum on component densities d0 <- function(z, reduce = TRUE) { # transpose z to a row vector, which the dag is expecting t_z <- tf_transpose(z) - args <- c(list(t_z), other_args, list(reduce = reduce, dag = dag)) + args <- c(list(t_z), dots, list(reduce = reduce)) do.call(tf_conditional_density_fun, args) } # get the vectors of first and second derivatives of the conditional density # function, w.r.t. 
the variable being marginalised - derivs <- function(z) { + if (diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) - if (diagonal_hessian) { - d2 <- tf$map_fn(tf$gradients, list(d1, z))[[1]] - } else { - stop ("inference with non-diagonal hessians not yet implemented") + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] d2 <- tf$hessians(y, z)[[1]] + list(d1, d2) } - - list(d1, d2) - } # negative log-posterior for the current value of z under MVN assumption @@ -191,11 +231,7 @@ laplace_approximation <- function(tolerance = 1e-6, } # tensors for parameters of MVN distribution (make mu column vector now) - mu_node <- distribution_node$parameters$mean - mu <- dag$tf_environment[[dag$tf_name(mu_node)]] mu <- tf_transpose(mu) - sigma_node <- distribution_node$parameters$sigma - sigma <- dag$tf_environment[[dag$tf_name(sigma_node)]] # dimension of the MVN distribution n <- dim(mu)[[2]] @@ -316,33 +352,132 @@ laplace_approximation <- function(tolerance = 1e-6, # lots of duplicated code; this could be tidied up, but I ran out of time! z <- tf$matmul(sigma, a) + mu + # curvature of the likelihood at the mode deriv <- derivs(z) - d1 <- deriv[[1]] d2 <- deriv[[2]] - - # curvature of the likelihood w <- -d2 rw <- sqrt(w) - # decentred values of z - cf <- z - mu - # approximate posterior covariance & cholesky factor mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) + # return a list of these things + list(z = z, + mu = mu, + a = a, + u = u) + + } + + # return a named list of operation greta arrays for the marginalisation parameters + compute_parameters <- function(conditional_density_fun, + distribution_node, + dots, + marginaliser) { + + # get greta arrays for parameters of distribution node + mean <- distribution_node$parameters$mean + sigma <- distribution_node$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mu = mean, + sigma = sigma, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + diagonal_hessian = marginaliser$other_args$diagonal_hessian + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + mu <- op("mu", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("mu")) + + u <- op("chol_sigma", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("u")) + + + # pull out the elements + list(z = z, + a = a, + mu = mu, + u = u) + + } + + tf_marginalisation_density <- function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + diagonal_hessian <- other_args$diagonal_hessian + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + mu <- parameters$mu + u <- parameters$u + z <- parameters$z + a <- parameters$a + # the approximate marginal conditional posterior - logdet <- tf_sum(tf$log(tf$matrix_diag_part(u))) + u_diag <- tf$matrix_diag_part(u) + logdet <- tf_sum(tf$log(u_diag)) nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp } + return_list_function <- function(parameters) { + list(mean = parameters$z - parameters$mu, + sigma = chol2symm(parameters$u)) + } + as_marginaliser(name = "laplace", - tf_marginaliser = tf_marginaliser, - parameters = list(), - distribution_check = multivariate_normal_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + return_list = return_list_function, + distribution_check = multivariate_normal_check, + other_args = list(diagonal_hessian = diagonal_hessian)) } @@ -366,14 +501,20 @@ multivariate_normal_check <- function(distrib) { # helper to contruct marginalisers as_marginaliser <- function(name, - tf_marginaliser, - parameters, - distribution_check) { + compute_parameters, + tf_marginalisation_density, + distribution_check, + return_list, + other_parameters = list(), + other_args = list()) { obj <- list(name = name, - tf_marginaliser = tf_marginaliser, - parameters = parameters, - distribution_check = distribution_check) + compute_parameters = compute_parameters, + tf_marginalisation_density = tf_marginalisation_density, + distribution_check = distribution_check, + return_list = return_list, + other_args = other_args, + other_parameters = other_parameters) class_name <- paste0(name, "_marginaliser") class(obj) <- c(class_name, "marginaliser") diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 379038e9..17dc2146 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -80,7 +80,6 @@ test_that("discrete_marginalisation errors nicely", { }) - test_that("inference runs with discrete marginalisation", { skip_if_not(check_tf_version()) @@ -110,7 +109,6 @@ test_that("inference runs with discrete marginalisation", { }) - test_that("discrete marginalisation gives correct densities", { skip_if_not(check_tf_version()) @@ -148,7 +146,6 @@ test_that("discrete marginalisation gives correct densities", { }) - test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) @@ -194,7 +191,7 @@ test_that("inference runs with laplace approximation", { mu <- ones(1, 8) * int marginalise(lik, multivariate_normal(mu, sigma), - laplace_approximation()) + laplace_approximation(diagonal_hessian = TRUE)) m <- model(int, sd) @@ -251,9 +248,9 @@ test_that("laplace approximation converges on correct posterior", { sigma <- diag(8) * sd ^ 2 out <- marginalise(lik, multivariate_normal(mean, sigma), - laplace_approximation()) - theta_mu_est <- calculate(out$mean) - theta_var_est <- calculate(diag(out$Sigma)) + laplace_approximation(diagonal_hessian = TRUE)) + theta_mu_est <- t(calculate(out$mean)[[1]]) + theta_var_est <- calculate(diag(out$sigma))[[1]] analytic <- cbind(mean = 
theta_mu, sd = sqrt(theta_var)) laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) From fd532a525b53675f514b95b7999a3d9215ce66a4 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 19 Feb 2020 18:06:58 +1100 Subject: [PATCH 28/38] fix lints --- R/dag_class.R | 4 ++-- R/marginalise.R | 17 +++++++------- R/marginalisers.R | 9 +++++--- man/marginalisers.Rd | 32 ++++++++++++++++++--------- tests/testthat/test_marginalisation.R | 7 +++++- tests/testthat/test_posteriors.R | 4 ++++ 6 files changed, 48 insertions(+), 25 deletions(-) diff --git a/R/dag_class.R b/R/dag_class.R index e1604ac6..4b3f4dff 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,8 +54,8 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type and batch size info to options, so it can be accessed by - # nodes on definition, without clunky explicit passing + # temporarily pass float type and batch size info to options, so it can be + # accessed by nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float old_batch_size <- options()$greta_batch_size diff --git a/R/marginalise.R b/R/marginalise.R index 1120a10b..5db116b7 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -101,7 +101,8 @@ marginalise <- function(fun, variable, method, ...) { # something very similar to as_tf_function(), but giving a function that # returns a tensorflow scalar for the density (unadjusted since there will be # no new variables) - conditional_joint_density <- as_conditional_density(fun, c(list(variable), dots)) + conditional_joint_density <- as_conditional_density(fun, + c(list(variable), dots)) # get a list of greta arrays for the marginalisation parameters: parameters <- method$compute_parameters( @@ -124,8 +125,8 @@ marginalise <- function(fun, variable, method, ...) 
{ dots = dots ) - # return a list of the greta arrays computed during the marginalisation, and any other - # things from the method + # return a list of the greta arrays computed during the marginalisation, and + # any other things from the method output <- method$return_list(parameters) invisible(output) @@ -181,11 +182,11 @@ marginalisation_distribution <- R6Class( # the marginal density implied by the function log_prob <- function(x) { - marginal_density <- self$marginaliser$tf_marginalisation_density - marginal_density(parameters = parameters, - tf_conditional_density_fun = self$conditional_density_fun, - dots = dots, - other_args = self$marginaliser$other_args) + density <- self$marginaliser$tf_marginalisation_density + density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) } diff --git a/R/marginalisers.R b/R/marginalisers.R index eaf25df3..ba0f562f 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -74,13 +74,14 @@ discrete_marginalisation <- function(values) { } - # return a named list of operation greta arrays for the marginalisation parameters + # return a named list of operation greta arrays for the marginalisation + # parameters compute_parameters <- function(conditional_density_fun, distribution_node, dots, marginaliser) { - # get a list of parameters in a mock greta array (just representing a tf list) + # list of parameters in a mock greta array (just representing a tf list) args <- c(operation = "marginalisation_log_weights", distribution_node, values_list, @@ -370,7 +371,7 @@ laplace_approximation <- function(tolerance = 1e-6, } - # return a named list of operation greta arrays for the marginalisation parameters + # named list of operation greta arrays for the marginalisation parameters compute_parameters <- function(conditional_density_fun, distribution_node, dots, @@ -468,8 +469,10 @@ laplace_approximation <- function(tolerance = 1e-6, } return_list_function <- function(parameters) { + list(mean = parameters$z - parameters$mu, sigma = chol2symm(parameters$u)) + } as_marginaliser(name = "laplace", diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd index b5a8fdfc..c9268f6e 100644 --- a/man/marginalisers.Rd +++ b/man/marginalisers.Rd @@ -8,18 +8,25 @@ \usage{ discrete_marginalisation(values) -laplace_approximation(tolerance = 1e-06, max_iterations = 50) +laplace_approximation( + tolerance = 1e-06, + max_iterations = 50, + diagonal_hessian = FALSE +) } \arguments{ -\item{values}{an R vector giving values at which to evaluate the function for a -discrete marginalisation} +\item{values}{an R vector giving values at which to evaluate the function for +a discrete marginalisation} \item{tolerance}{the (positive) numerical convergence tolerance (in units of the log conditional posterior density) of the Newton-Raphson optimisation algorithm} -\item{max_iterations}{the (positive) integer-like maximum number of iterations of -the Newton-Raphson optimisation algorithm} +\item{max_iterations}{the (positive) integer-like maximum number of +iterations of the Newton-Raphson optimisation algorithm} + +\item{diagonal_hessian}{whether the Hessian matrix should be assumed to be +diagonal, to speed up computations. See Details.} } \value{ a \code{marginaliser} object that can be passed to @@ -50,10 +57,13 @@ Functions to set up marginalisers (which explicitly integrate the distribution, that approximation error will be minimal. 
\code{laplace_approximation} can only be used to marginalise - variables following a multivariate normal distribution. In addition, the - function to be marginalised must \emph{factorise}; ie. it must return a - vector-valued density with as many elements as the vector variable being - marginalised, and each of element of the density must depend only on the - corresponding element of the variable vector. This is the responsibility of - the user, and is not checked. + variables following a multivariate normal distribution. + + The argument \code{diagonal_hessian} can be used to state that the + conditional density factorises along the elements of the variable being + marginalised, and therefore the Hessian matrix of this function can be + assumed to be diagonal. A conditional density function factorises if each + observation in the conditional density depends only on the corresponding + element of the variable being marginalised. If this is not the case and you + set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 17dc2146..fc0520d6 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -206,10 +206,12 @@ test_that("laplace approximation converges on correct posterior", { skip_if_not(check_tf_version()) source("helpers.R") + # nolint start # test vs analytic posterior on 8 schools data with no pooling: # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) # the posterior for theta is normal, so laplace should be exact + # nolint end # eight schools data y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) @@ -221,16 +223,19 @@ test_that("laplace approximation converges on correct posterior", { # analytic solution: + # nolint start # Marginalising int we can write: # Bayes theorem gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: - # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2) + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * + # N(theta_i | mu, sd ^ 2) # which is equivalent to: # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i) # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2) # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i # conjugate prior, see Wikipedia conjugate prior table + # nolint end obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R index 71e7bbdc..d2e91c5c 100644 --- a/tests/testthat/test_posteriors.R +++ b/tests/testthat/test_posteriors.R @@ -41,9 +41,11 @@ test_that("posterior is correct (normal)", { skip_if_not(check_tf_version()) source("helpers.R") + # nolint start # test vs analytic posterior on 8 schools data with no pooling: # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) + # nolint end # eight schools data y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16) @@ -53,6 +55,7 @@ test_that("posterior is correct (normal)", { mu <- rnorm(1) sd <- abs(rnorm(1)) + # nolint start # Bayes theorem gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -62,6 +65,7 @@ # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i # conjugate prior, see Wikipedia conjugate prior table + # nolint end obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) From
57047d83018cbd50cbaa91104e51ab591496c8f9 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 24 Feb 2020 16:59:15 +1100 Subject: [PATCH 29/38] fix golden section search --- R/utils.R | 93 +++++++++++++++++++++++--------------- tests/testthat/test_misc.R | 8 ++-- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/R/utils.R b/R/utils.R index 8701c648..da482f32 100644 --- a/R/utils.R +++ b/R/utils.R @@ -409,17 +409,14 @@ gss <- function(func, lower = 0, upper = 1, max_iterations = 50, - tolerance = 1e-32) { + tolerance = 1e-8) { lower <- tf$constant(lower, tf_float(), shape(1)) upper <- tf$constant(upper, tf_float(), shape(1)) maxiter <- tf$constant(as.integer(max_iterations), tf$int32) tol <- fl(tolerance) - iter <- tf$constant(1L, tf$int32) - - phi <- (1 + sqrt(5)) / 2 - phim1 <- fl(phi - 1) - twomphi <- fl(2.0 - phi) + iter <- tf$constant(0L, tf$int32) + golden_ratio <- fl(2 / (sqrt(5) + 1)) # expand out to batch dimensions if (!inherits(batch_dim, "tensorflow.tensor")) { @@ -427,56 +424,78 @@ gss <- function(func, batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) } + # initial evaluation points left <- tf$tile(lower, batch_dim) right <- tf$tile(upper, batch_dim) width <- right - left - optimum <- tf$zeros(batch_dim, tf_float()) + d <- golden_ratio * width + x1 <- right - d + x2 <- left + d - # status of the multiple consecutive searches; three columns, for left, right - # and optimum - status <- tf$stack(list(left, right, optimum), axis = 1L) + values <- list(left, right, x1, x2, width, iter) # start loop - body <- function(left, right, optimum, width, iter) { - - d <- phim1 * width - - # find evaluation points - a <- left + d - b <- right - d - - # prep reordered matrices for whether steps are above or below - # if func(a) < func(b) - above <- tf$stack(list(left, b, a), axis = 1L) - # else - not_above <- tf$stack(list(a, right, b), axis = 1L) - - status <- tf$where(tf$less(func(b), func(a)), above, not_above) + body <- function(left, right, x1, x2, width, iter) { + + # prep lists of vectors for whether steps are above x1 or below x2 + + # order: lower, upper, x1, x2, width + + # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 + below_width <- x2 - left + below <- list( + left, + x2, + x2 - golden_ratio * below_width, + x1, + below_width + ) + + # if the minimum is above x1, shift the bounds and reuse x2 as the new x1 + above_width <- right - x1 + above <- list( + x1, + right, + x2, + x1 + golden_ratio * above_width, + above_width + ) + + # convert to matrices + below <- tf$stack(below, axis = 1L) + above <- tf$stack(above, axis = 1L) + + # can we stash and reuse these and reduce the number of function evaluations? 
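# editor's aside: a minimal single-search R sketch of the golden-section
# recurrence this body implements, assuming func is unimodal on
# [lower, upper] (the TF version runs the same update for all batches at
# once inside tf$while_loop; like this sketch it re-evaluates both points
# each pass, the cost the comment above asks about, which the next patch
# removes by carrying f1 and f2 through the loop state):
#   gss_r <- function(f, lower, upper, tol = 1e-8) {
#     gr <- 2 / (sqrt(5) + 1)
#     while (upper - lower > tol) {
#       width <- upper - lower
#       x1 <- upper - gr * width
#       x2 <- lower + gr * width
#       if (f(x2) > f(x1)) upper <- x2 else lower <- x1
#     }
#     (lower + upper) / 2
#   }
#   gss_r(function(x) (x - 0.3) ^ 2, 0, 1)  # ~ 0.3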
+ f1 <- func(x1) + f2 <- func(x2) + + status <- tf$where(tf$greater(f2, f1), below, above) left <- status[, 0] right <- status[, 1] - optimum <- status[, 2] - - width <- right - left + x1 <- status[, 2] + x2 <- status[, 3] + width <- status[, 4] - list(left, right, optimum, width, iter + 1L) + list(left, right, x1, x2, width, iter + 1L) } - cond <- function(left, right, optimum, width, iter) { - err <- twomphi * tf$abs(width / optimum) - not_converged <- tf$less(tol, err) + cond <- function(left, right, x1, x2, width, iter) { + not_converged <- tf$less(tol, tf$abs(width)) not_all_converged <- tf$reduce_any(not_converged) in_time <- tf$less(iter, maxiter) in_time & not_all_converged } - values <- list(left, right, optimum, width, iter) - out <- tf$while_loop(cond, body, values) - result <- out[3:5] - names(result) <- c("minimum", "width", "iterations") - result + # get minimum value + width <- out[[5]] + min <- out[[1]] + width / fl(2) + + list(minimum = min, + width = width, + iterations = out[[6]]) } diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index d4a7590b..ef0e39e6 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -293,17 +293,17 @@ test_that("golden section search works", { n_batch <- 4 + # random optima + locs <- runif(n_batch) + # simple quadratic function with different optima for each batch func <- function(x) { (x - locs) ^ 2 } - # random optima - locs <- runif(n_batch, 0.2, 0.8) - # run the minimisation ss <- gss(func, n_batch) result <- tf$Session()$run(ss) - compare_op(result$minimum, locs, tolerance = 0.2) + compare_op(result$minimum, locs) }) From 60ba1b26bff63aacc88a6842613f3aaf67f44a6b Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Mon, 24 Feb 2020 17:09:09 +1100 Subject: [PATCH 30/38] stash function calls in gss() --- R/utils.R | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/R/utils.R b/R/utils.R index da482f32..08c75b48 100644 --- a/R/utils.R +++ b/R/utils.R @@ -409,7 +409,7 @@ gss <- function(func, lower = 0, upper = 1, max_iterations = 50, - tolerance = 1e-8) { + tolerance = .Machine$double.eps ^ 0.25) { lower <- tf$constant(lower, tf_float(), shape(1)) upper <- tf$constant(upper, tf_float(), shape(1)) @@ -431,14 +431,15 @@ gss <- function(func, d <- golden_ratio * width x1 <- right - d x2 <- left + d + f1 <- func(x1) + f2 <- func(x2) - values <- list(left, right, x1, x2, width, iter) + values <- list(left, right, x1, x2, f1, f2, width, iter) # start loop - body <- function(left, right, x1, x2, width, iter) { + body <- function(left, right, x1, x2, f1, f2, width, iter) { # prep lists of vectors for whether steps are above x1 or below x2 - # order: lower, upper, x1, x2, width # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 @@ -465,11 +466,8 @@ gss <- function(func, below <- tf$stack(below, axis = 1L) above <- tf$stack(above, axis = 1L) - # can we stash and reuse these and reduce the number of function evaluations? 
- f1 <- func(x1) - f2 <- func(x2) - - status <- tf$where(tf$greater(f2, f1), below, above) + is_below <- tf$greater(f2, f1) + status <- tf$where(is_below, below, above) left <- status[, 0] right <- status[, 1] @@ -477,10 +475,19 @@ gss <- function(func, x2 <- status[, 3] width <- status[, 4] - list(left, right, x1, x2, width, iter + 1L) + # either recompute f1 (f2) at new location, or use f2 (f1) in its place + + # this is a bit convoluted, but ensures function calls don't need to be + # duplicated, whilst maintaining the vectorisation + x_to_evaluate <- tf$where(is_below, x1, x2) + new_f <- func(x_to_evaluate) + new_f1 <- tf$where(is_below, new_f, f2) + new_f2 <- tf$where(is_below, f1, new_f) + + list(left, right, x1, x2, new_f1, new_f2, width, iter + 1L) } - cond <- function(left, right, x1, x2, width, iter) { + cond <- function(left, right, x1, x2, f1, f2, width, iter) { not_converged <- tf$less(tol, tf$abs(width)) not_all_converged <- tf$reduce_any(not_converged) in_time <- tf$less(iter, maxiter) @@ -490,12 +497,12 @@ gss <- function(func, out <- tf$while_loop(cond, body, values) # get minimum value - width <- out[[5]] + width <- out[[7]] min <- out[[1]] + width / fl(2) list(minimum = min, width = width, - iterations = out[[6]]) + iterations = out[[8]]) } From 5e3dd969e6e2174d275cebd838e1b9f67163476a Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 16:23:36 +1100 Subject: [PATCH 31/38] make laplace agree with GRaF --- R/marginalisers.R | 46 +++++++++++++++++---------- tests/testthat/test_marginalisation.R | 7 ++-- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index ba0f562f..40d4a161 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -220,7 +220,7 @@ laplace_approximation <- function(tolerance = 1e-6, derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$hessians(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
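+      # why it won't work: with a batched z of shape (batch, n, 1),
+      # tf$hessians differentiates the *summed* objective and returns a
+      # tensor of shape (batch, n, 1, batch, n, 1), entangling the batch
+      # elements. A batch-aware dense Hessian would need one gradient pass
+      # per element of z; a hypothetical sketch only, since the exact
+      # indexing depends on z's shape:
+      #   d1 <- tf$gradients(y, z)[[1]]
+      #   rows <- lapply(seq_len(n), function(i)
+      #     tf$gradients(d1[, i - 1L, ], z)[[1]])
+      #   d2 <- tf$stack(rows, axis = 1L)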
list(d1, d2) } } @@ -237,13 +237,9 @@ laplace_approximation <- function(tolerance = 1e-6, # dimension of the MVN distribution n <- dim(mu)[[2]] - # randomly initialise z, and expand to batch (warm starts will need some TF - # trickery) - # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z_value <- add_first_dim(as_2d_array(rnorm(n))) - z <- tf$constant(z_value, dtype = tf_float()) + z <- tf$identity(mu) # Newton-Raphson parameters tol <- tf$constant(tolerance, tf_float(), shape(1)) @@ -253,11 +249,10 @@ laplace_approximation <- function(tolerance = 1e-6, # other objects a_value <- add_first_dim(as_2d_array(rep(0, n))) - a <- tf$constant(a_value, dtype = tf_float()) + a <- fl(a_value) u_value <- add_first_dim(diag(n)) - u <- tf$constant(u_value, tf_float()) - eye <- tf$constant(add_first_dim(diag(n)), - dtype = tf_float()) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) # match batches on everything going into the loop that will have a batch # dimension later @@ -310,11 +305,11 @@ laplace_approximation <- function(tolerance = 1e-6, s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a) + mu + z_new <- tf$matmul(sigma, a_new) + mu psi(a_new, z_new, mu) } - ls_results <- gss(psiline, batch_dim) + ls_results <- gss(psiline, batch_dim, upper = 2) stepsize <- ls_results$minimum stepsize <- tf$expand_dims(stepsize, 1L) stepsize <- tf$expand_dims(stepsize, 2L) @@ -363,11 +358,17 @@ laplace_approximation <- function(tolerance = 1e-6, mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + # return a list of these things list(z = z, mu = mu, a = a, - u = u) + u = u, + iterations = iter, + converged = converged) } @@ -423,12 +424,23 @@ laplace_approximation <- function(tolerance = 1e-6, tf_operation = "get_element", operation_args = list("u")) + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) # pull out the elements list(z = z, a = a, mu = mu, - u = u) + u = u, + iterations = iterations, + converged = converged) } @@ -470,8 +482,10 @@ laplace_approximation <- function(tolerance = 1e-6, return_list_function <- function(parameters) { - list(mean = parameters$z - parameters$mu, - sigma = chol2symm(parameters$u)) + list(mean = t(parameters$z), + sigma = chol2symm(parameters$u), + iterations = parameters$iterations, + converged = parameters$converged) } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index fc0520d6..04f43382 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -254,8 +254,9 @@ test_that("laplace approximation converges on correct posterior", { out <- marginalise(lik, multivariate_normal(mean, sigma), laplace_approximation(diagonal_hessian = TRUE)) - theta_mu_est <- t(calculate(out$mean)[[1]]) - theta_var_est <- calculate(diag(out$sigma))[[1]] + res <- calculate(mean = t(out$mean), diag_sigma = diag(out$sigma)) + theta_mu_est <- res$mean + theta_var_est <- res$diag_sigma analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) @@ -263,4 +264,6 @@ test_that("laplace approximation 
converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) + # modes are right, sds are not! + }) From 2bae9abf7dd1fc8a1a24ae1de78bb3692b22800f Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 17:17:33 +1100 Subject: [PATCH 32/38] get laplace variances working --- R/marginalisers.R | 43 ++++++++++++++++++--------- tests/testthat/test_marginalisation.R | 9 ++---- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index 40d4a161..c5b3f3a9 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -354,9 +354,19 @@ laplace_approximation <- function(tolerance = 1e-6, w <- -d2 rw <- sqrt(w) - # approximate posterior covariance & cholesky factor + # approximate posterior covariance + # do we need the eye? mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - u <- tf$cholesky(mat1) + l <- tf$cholesky(mat1) + v <- tf$linalg$triangular_solve(matrix = l, + rhs = sigma * rw, + lower = TRUE, + adjoint = TRUE) + covar <- sigma - tf$linalg$matmul(v, v, transpose_b = TRUE) + + # log-determinant of l + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) # convergence information iter <- out[[7]] @@ -366,7 +376,8 @@ laplace_approximation <- function(tolerance = 1e-6, list(z = z, mu = mu, a = a, - u = u, + logdet = logdet, + covar = covar, iterations = iter, converged = converged) @@ -418,11 +429,16 @@ laplace_approximation <- function(tolerance = 1e-6, tf_operation = "get_element", operation_args = list("mu")) - u <- op("chol_sigma", - parameter_list, - dim = dim(sigma), - tf_operation = "get_element", - operation_args = list("u")) + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) iterations <- op("iterations", parameter_list, @@ -438,7 +454,8 @@ laplace_approximation <- function(tolerance = 1e-6, list(z = z, a = a, mu = mu, - u = u, + logdet = logdet, + covar = covar, iterations = iterations, converged = converged) @@ -467,14 +484,12 @@ laplace_approximation <- function(tolerance = 1e-6, } mu <- parameters$mu - u <- parameters$u + logdet <- parameters$logdet z <- parameters$z a <- parameters$a # the approximate marginal conditional posterior - u_diag <- tf$matrix_diag_part(u) - logdet <- tf_sum(tf$log(u_diag)) - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + nmcp <- psi(a, z, mu) + tf$squeeze(u_logdet, 1) -nmcp @@ -483,7 +498,7 @@ laplace_approximation <- function(tolerance = 1e-6, return_list_function <- function(parameters) { list(mean = t(parameters$z), - sigma = chol2symm(parameters$u), + sigma = parameters$covar, iterations = parameters$iterations, converged = parameters$converged) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 04f43382..72f70782 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -243,14 +243,12 @@ test_that("laplace approximation converges on correct posterior", { theta_mu <- (y * obs_prec + mu * prec) * theta_var theta_sd <- sqrt(theta_var) - # Laplace solution: - lik <- function(theta) { - distribution(y) <- normal(t(theta), obs_sd) - } - # mock up as a multivariate normal distribution mean <- ones(1, 8) * mu sigma <- diag(8) * sd ^ 2 + lik <- function(theta) { + distribution(y) <- normal(t(theta), obs_sd) + } out <- marginalise(lik, multivariate_normal(mean, 
sigma), laplace_approximation(diagonal_hessian = TRUE)) @@ -264,6 +262,5 @@ test_that("laplace approximation converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) - # modes are right, sds are not! }) From ca8920ba882b7e94e7c5e9dfcd15c2f68361be15 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Wed, 26 Feb 2020 18:12:35 +1100 Subject: [PATCH 33/38] make laplace work for MVN test case --- R/marginalisers.R | 14 +++---- tests/testthat/test_marginalisation.R | 60 ++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/R/marginalisers.R b/R/marginalisers.R index c5b3f3a9..7dd0c37c 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -343,7 +343,7 @@ laplace_approximation <- function(tolerance = 1e-6, a <- out[[2]] - # apparently we need to redefine z and u here, or the backprop errors + # apparently we need to redefine z etc. here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! z <- tf$matmul(sigma, a) + mu @@ -353,18 +353,14 @@ laplace_approximation <- function(tolerance = 1e-6, d2 <- deriv[[2]] w <- -d2 rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) # approximate posterior covariance - # do we need the eye? - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - l <- tf$cholesky(mat1) - v <- tf$linalg$triangular_solve(matrix = l, - rhs = sigma * rw, - lower = TRUE, - adjoint = TRUE) - covar <- sigma - tf$linalg$matmul(v, v, transpose_b = TRUE) + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) l_diag <- tf$matrix_diag_part(l) logdet <- tf_sum(tf$log(l_diag)) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 72f70782..b2a5113d 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -201,7 +201,7 @@ test_that("inference runs with laplace approximation", { }) -test_that("laplace approximation converges on correct posterior", { +test_that("laplace approximation has correct posterior for univariate normal", { skip_if_not(check_tf_version()) source("helpers.R") @@ -224,7 +224,6 @@ test_that("laplace approximation converges on correct posterior", { # analytic solution: # nolint start - # Marginalising int we can write: # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -262,5 +261,62 @@ test_that("laplace approximation converges on correct posterior", { # compare these to within a tolerance compare_op(analytic, laplace) +}) + +test_that("laplace approximation has correct posterior for multivariate normal", { + + skip_if_not(check_tf_version()) + source("helpers.R") + + # nolint start + # test vs analytic posterior on 8 schools data with no pooling: + # y_i ~ N(theta_i, obs_sd_i ^ 2) + # theta ~ MVN(mu, sigma) + # the posterior for theta is multivariate normal, so laplace should be exact + # nolint end + + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for mu and sigma + mu <- rnorm(8) + sigma <- rwish(1, 9, diag(8))[1, , ] + + # analytic solution: + + # nolint start + # Bayes theorum gives: + # p(theta | y) \propto p(y|theta) p(theta) + # which with normal densities is: + # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * + # N(theta_i | mu, sd ^ 2) + # which is 
equivalent to:
+  # p(theta | y) \propto MVN(theta_mu, theta_sigma)
+  # theta_sigma = (sigma^-1 + diag(1 / obs_sd^2))^-1
+  # theta_mu = theta_sigma (sigma^-1 mu + diag(1 / obs_sd^2) y)
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  i_obs_sigma <- diag(1 / (obs_sd ^ 2))
+  i_sigma <- solve(sigma)
+  theta_sigma <- solve(i_sigma + i_obs_sigma)
+  theta_mu <- theta_sigma %*% (i_sigma %*% mu + i_obs_sigma %*% y)
+  theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)]
+
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  out <- marginalise(lik,
+                     multivariate_normal(t(mu), sigma),
+                     laplace_approximation(diagonal_hessian = TRUE, tolerance = 1e-32))
+  res <- calculate(mean = t(out$mean), sigma = out$sigma, iterations = out$iterations)
+  theta_mu_est <- res$mean
+  theta_sigma_est <- res$sigma
+  theta_sigma_est_flat <- theta_sigma_est[upper.tri(theta_sigma_est, diag = TRUE)]
+
+  # compare these to within a tolerance
+  compare_op(theta_mu, theta_mu_est)
+  compare_op(theta_sigma_flat, theta_sigma_est_flat)
})

From 5191f45219966f31e56392690cc4ea18bade40c3 Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Thu, 27 Feb 2020 09:15:54 +1100
Subject: [PATCH 34/38] bugfix and add notes for univariate Laplace

---
 R/marginalisers.R | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/R/marginalisers.R b/R/marginalisers.R
index 7dd0c37c..9ffdd8a3 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -170,6 +170,30 @@ laplace_approximation <- function(tolerance = 1e-6,
   # factorising)
   # - handle an iid normal distribution too.

+  # the following algorithm for a univariate normal distribution is derived
+  # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior
+  # it needs checking!
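+  # a quick plain-R check of the update rule spelled out in the notes below,
+  # assuming a single observation y ~ N(z, obs_sd^2) and a prior
+  # z ~ N(mu, sd^2), so that w = 1 / obs_sd ^ 2 and d1(z) = (y - z) / obs_sd ^ 2
+  # (reading the '1 / sd' term as the prior precision, 1 / sd ^ 2):
+  #   z <- mu
+  #   z <- mu + (w * (z - mu) + (y - z) / obs_sd ^ 2) / (1 / sd ^ 2 + w)
+  # a single step lands on the conjugate posterior mean,
+  #   (y / obs_sd ^ 2 + mu / sd ^ 2) / (1 / obs_sd ^ 2 + 1 / sd ^ 2),
+  # which matches theta_mu in the univariate test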
+ + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) + + # instead work with decentred variable a where z = a * sd ^ 2 + mu + # l = sqrt(1 + w * sd) + # b = w * (z - mu) * d1(z) + # a_diff = b − (w * sd * b) / l ^ 2 - a + # a_new = a + s * a_diff + # z_new = a_new * sd ^ 2 + mu + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + if (!(is.numeric(tolerance) && is.vector(tolerance) && length(tolerance) == 1 && @@ -288,7 +312,8 @@ laplace_approximation <- function(tolerance = 1e-6, # decentred values of z cf <- z - mu - # approximate posterior covariance & cholesky factor + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye u <- tf$cholesky(mat1) l <- tf_transpose(u) @@ -485,7 +510,7 @@ laplace_approximation <- function(tolerance = 1e-6, a <- parameters$a # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(u_logdet, 1) + nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) -nmcp From f8b4755a2c1291e2fc91596458bfe85af379d55c Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 11:38:51 +1100 Subject: [PATCH 35/38] add failing test for univariate normal laplace --- tests/testthat/test_marginalisation.R | 79 +++++++++++++-------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index b2a5113d..08d2f521 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -211,19 +211,6 @@ test_that("laplace approximation has correct posterior for univariate normal", { # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta_i ~ N(mu, sd ^ 2) # the posterior for theta is normal, so laplace should be exact - # nolint end - - # eight schools data - y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) - obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) - - # prior parameters for int, and fixed variance of theta - mu <- rnorm(1) - sd <- abs(rnorm(1)) - - # analytic solution: - - # nolint start # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -236,30 +223,43 @@ test_that("laplace approximation has correct posterior for univariate normal", { # conjugate prior, see Wikipedia conjugate prior table # nolint end + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for int, and fixed variance of theta + mu <- rnorm(1) + sd <- abs(rnorm(1)) + + # analytic solution obs_prec <- 1 / (obs_sd ^ 2) prec <- 1 / (sd ^ 2) theta_var <- 1 / (obs_prec + prec) theta_mu <- (y * obs_prec + mu * prec) * theta_var theta_sd <- sqrt(theta_var) + analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) - # mock up as a multivariate normal distribution - mean <- ones(1, 8) * mu - sigma <- diag(8) * sd ^ 2 lik <- function(theta) { distribution(y) <- normal(t(theta), obs_sd) } + + # analyse as univariate normal out <- marginalise(lik, - multivariate_normal(mean, sigma), + normal(mu, sd, dim = 8), laplace_approximation(diagonal_hessian = TRUE)) - res <- calculate(mean = t(out$mean), diag_sigma = diag(out$sigma)) - theta_mu_est <- res$mean - 
theta_var_est <- res$diag_sigma + res <- do.call(calculate, out) + laplace_uni <- cbind(mean = res$mean, sd = res$sd) + compare_op(analytic, laplace_uni) - analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) - laplace <- cbind(mean = theta_mu_est, sd = sqrt(theta_var_est)) - - # compare these to within a tolerance - compare_op(analytic, laplace) + # analyse as multivariate normal + mean <- ones(1, 8) * mu + sigma <- diag(8) * sd ^ 2 + out <- marginalise(lik, + multivariate_normal(mean, sigma), + laplace_approximation(diagonal_hessian = TRUE)) + res <- do.call(calculate, out) + laplace_multi <- cbind(mean = res$mean[1, ], sd = sqrt(diag(res$sigma))) + compare_op(analytic, laplace_multi) }) @@ -273,19 +273,6 @@ test_that("laplace approximation has correct posterior for multivariate normal", # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta ~ MVN(mu, sigma) # the posterior for theta is multivariate normal, so laplace should be exact - # nolint end - - # eight schools data - y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) - obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) - - # prior parameters for mu and sigma - mu <- rnorm(8) - sigma <- rwish(1, 9, diag(8))[1, , ] - - # analytic solution: - - # nolint start # Bayes theorum gives: # p(theta | y) \propto p(y|theta) p(theta) # which with normal densities is: @@ -298,10 +285,18 @@ test_that("laplace approximation has correct posterior for multivariate normal", # conjugate prior, see Wikipedia conjugate prior table # nolint end - i_obs_sigma <- diag(1 / (obs_sd ^ 2)) - i_sigma <- solve(sigma) - theta_sigma <- solve(i_sigma + i_obs_sigma) - theta_mu <- theta_sigma %*% (i_sigma %*% mu + i_obs_sigma %*% y) + # eight schools data + y <- c(28.39, 7.94, -2.75 , 6.82, -0.64, 0.63, 18.01, 12.16) + obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6) + + # prior parameters for mu and sigma + mu <- rnorm(8) + sigma <- rwish(1, 9, diag(8))[1, , ] + + obs_prec <- diag(1 / (obs_sd ^ 2)) + prec <- solve(sigma) + theta_sigma <- solve(prec + obs_prec) + theta_mu <- theta_sigma %*% (prec %*% mu + obs_prec %*% y) theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)] lik <- function(theta) { From fc8fe92688b41fab84edc0329b1944204cf8ff73 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 12:46:32 +1100 Subject: [PATCH 36/38] switch marginalisers to R6 classes --- R/marginalise.R | 17 +- R/marginalisers.R | 1046 +++++++++++++------------ tests/testthat/test_marginalisation.R | 45 +- 3 files changed, 578 insertions(+), 530 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 5db116b7..34b393b4 100644 --- a/R/marginalise.R +++ b/R/marginalise.R @@ -91,8 +91,12 @@ marginalise <- function(fun, variable, method, ...) { "See ?marginalise for options") } + # construct the marginaliser + method_args <- method[which(names(method) != "class")] + marginaliser <- do.call(method$class$new, method_args) + # check the distribution is compatible with the method - method$distribution_check(distribution_node) + marginaliser$distribution_check(distribution_node) # excise the variable from the distribution distribution_node$remove_target() @@ -105,20 +109,19 @@ marginalise <- function(fun, variable, method, ...) 
{ c(list(variable), dots)) # get a list of greta arrays for the marginalisation parameters: - parameters <- method$compute_parameters( + parameters <- marginaliser$compute_parameters( conditional_density_fun = conditional_joint_density, distribution_node = distribution_node, - dots = dots, - marginaliser = method + dots = dots ) # add on any pre-computed parameters - parameters <- c(parameters, method$other_parameters) + parameters <- c(parameters, marginaliser$other_parameters) # create the distribution distribution <- distrib( distribution = "marginalisation", - marginaliser = method, + marginaliser = marginaliser, parameters = parameters, conditional_density_fun = conditional_joint_density, distribution_node = distribution_node, @@ -127,7 +130,7 @@ marginalise <- function(fun, variable, method, ...) { # return a list of the greta arrays computed during the marginalisation, and # any other things from the method - output <- method$return_list(parameters) + output <- marginaliser$return_list(parameters) invisible(output) } diff --git a/R/marginalisers.R b/R/marginalisers.R index 9ffdd8a3..d33979bf 100644 --- a/R/marginalisers.R +++ b/R/marginalisers.R @@ -1,5 +1,538 @@ # marginaliser constructors +# add a marginaliser R6 class + +marginaliser <- R6Class( + "marginaliser", + public = list( + + name = "marginaliser", + + other_parameters = list(), + other_args = list(), + + initialize = function(...) { + invisible(NULL) + }, + + return_list = function(parameters) { + list() + } + + ) +) + +discrete_marginaliser <- R6Class( + "discrete marginaliser", + inherit = marginaliser, + public = list( + + initialize = function(values) { + + super$initialize() + + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop(msg) + } + + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + self$other_parameters <- values_list + + }, + + # return a named list of operation greta arrays for the marginalisation + # parameters + compute_parameters = function(conditional_density_fun, + distribution_node, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) 
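+        # what these weights are for: the marginal density later needs
+        # log p(y) = logsumexp_k [ log p(y | v_k) + log p(v_k) ], and this
+        # function returns the normalised log p(v_k) terms. A tiny plain-R
+        # analogue, assuming values 0:10 under a poisson(3) variable:
+        #   lw <- dpois(0:10, lambda = 3, log = TRUE)
+        #   lw <- lw - log(sum(exp(lw)))  # naive logsumexp normalisation
+        #   sum(exp(lw))                  # 1, as required
+        # the tensorflow code below does the same, batched, with
+        # reduce_logsumexp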
+ log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + + log_weights - log_weights_sum + + } + + # list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + distribution_node, + self$other_parameters, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(self$other_parameters), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + }, + + # compute the marginal joint density + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), dots) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + # expand and combine the densities + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + }, + + return_list = function(parameters) { + list(probabilities = exp(parameters$log_weights)) + }, + + # check that the distribution is discrete + distribution_check = function(distrib) { + if (!distrib$discrete) { + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } + } + + ) +) + +laplace_marginaliser <- R6Class( + "laplace marginaliser", + inherit = marginaliser, + public = list( + + tolerance = NULL, + max_iterations = NULL, + diagonal_hessian = NULL, + multivariate = FALSE, + + initialize = function(tolerance, + max_iterations, + diagonal_hessian) { + + super$initialize() + + # in future: + # - enable warm starts for subsequent steps of the outer inference algorithm + # - enable a non-factorising version (have the user say whether it is + # factorising) + # - handle an iid normal distribution too. + + # the following algorithm for a univariate normal distribution is derived + # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior + # it neeeds checking! 
+ + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) + + # instead work with decentred variable a where z = a * sd ^ 2 + mu + # l = sqrt(1 + w * sd) + # b = w * (z - mu) * d1(z) + # a_diff = b − (w * sd * b) / l ^ 2 - a + # a_new = a + s * a_diff + # z_new = a_new * sd ^ 2 + mu + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + + self$other_args <- list(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters = function(conditional_density_fun, + distribution_node, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + # define the marginalisation function + tf_compute_laplace_parameters = function(mu, + sigma, + ..., + tf_conditional_density_fun, + diagonal_hessian, + tolerance, + max_iterations) { + + dots <- list(...) + + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
+ list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # tensors for parameters of MVN distribution (make mu column vector now) + mu <- tf_transpose(mu) + + # dimension of the MVN distribution + n <- dim(mu)[[2]] + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mu) + + # Newton-Raphson parameters + tol <- tf$constant(tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(max_iterations) + + # other objects + a_value <- add_first_dim(as_2d_array(rep(0, n))) + a <- fl(a_value) + u_value <- add_first_dim(diag(n)) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mu, sigma, z, a, u, obj_old, tol) + objects <- greta:::match_batches(objects) + mu <- objects[[1]] + sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + u <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + rw <- sqrt(w) + + # decentred values of z + cf <- z - mu + + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + l <- tf_transpose(u) + + # compute Newton-Raphson update direction + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + a_new <- a + s * adiff + z_new <- tf$matmul(sigma, a_new) + mu + psi(a_new, z_new, mu) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + a_new <- a + stepsize * adiff + z_new <- tf$matmul(sigma, a_new) + mu + obj <- psi(a_new, z_new, mu) + + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + a, + u, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + a <- out[[2]] + + # apparently we need to redefine z etc. here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! 
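+      # one possible tidy-up, sketched under the same constraint: a local
+      # helper shared by the loop body and this post-loop block, e.g. a
+      # hypothetical
+      #   mode_state <- function(a) {
+      #     z <- tf$matmul(sigma, a) + mu
+      #     w <- -derivs(z)[[2]]
+      #     list(z = z, w = w, rw = sqrt(w))
+      #   }
+      # would cut the repetition while still rebuilding the tensors outside
+      # the while loop, avoiding the backprop issue noted above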
+ z <- tf$matmul(sigma, a) + mu + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) + + # approximate posterior covariance + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) + + # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) + + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + mu = mu, + a = a, + logdet = logdet, + covar = covar, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- distribution_node$parameters$mean + sigma <- distribution_node$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mu = mean, + sigma = sigma, + dots, + list( + operation_args = c( + tf_conditional_density_fun = conditional_density_fun, + self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + mu <- op("mu", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("mu")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + a = a, + mu = mu, + logdet = logdet, + covar = covar, + iterations = iterations, + converged = converged) + + }, + + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + diagonal_hessian <- other_args$diagonal_hessian + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mu) { + p1 <- tf$matmul(tf_transpose(a), z - mu) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + mu <- parameters$mu + logdet <- parameters$logdet + z <- parameters$z + a <- parameters$a + + # the approximate marginal conditional posterior + nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + + -nmcp + + }, + + return_list = function(parameters) { + + list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + }, + + # check that the distribution is normal + distribution_check = function(distrib) { + name <- distrib$distribution_name + if (!name %in% c("multivariate_normal")) { + stop("the Laplace approximation can only be used ", + "with a multivariate normal distribution", + call. = FALSE) + } + } + + ) +) + #' @name marginalisers #' #' @title marginalisation methods @@ -36,103 +569,8 @@ NULL #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { - if (!is.vector(values) | !is.numeric(values)) { - msg <- "'values' must be an R numeric vector" - - if (inherits(values, "greta_array")) { - msg <- paste0(msg, ", not a greta array") - } - - stop(msg) - } - - # convert values to a list of data greta arrays - values_list <- as.list(values) - values_list <- lapply(values_list, as_data) - names(values_list) <- paste0("value_", seq_along(values_list)) - - # function to compute the parameter (log probabilities) of the marginalisation - tf_compute_log_weights <- function(tfp_distribution, ...) { - - # and compute log probabilities for the values - values_list <- list(...) 
- log_weights_list <- lapply(values_list, tfp_distribution$log_prob) - log_weights_list <- lapply(log_weights_list, tf_sum) - - # convert to a vector of discrete probabilities and make them sum to 1, - # whilst staying on the log scale - log_weights <- tf$concat(log_weights_list, axis = 1L) - - # normalise weights on log scale - log_weights_sum <- tf$reduce_logsumexp( - log_weights, - axis = 1L, - keepdims = TRUE - ) - - log_weights - log_weights_sum - - } - - # return a named list of operation greta arrays for the marginalisation - # parameters - compute_parameters <- function(conditional_density_fun, - distribution_node, - dots, - marginaliser) { - - # list of parameters in a mock greta array (just representing a tf list) - args <- c(operation = "marginalisation_log_weights", - distribution_node, - values_list, - tf_operation = "tf_compute_log_weights", - list(dim = c(length(values_list), 1L))) - log_weights <- do.call(op, args) - - list(log_weights = log_weights) - - } - - # compute the marginal joint density - tf_marginalisation_density <- function(parameters, - tf_conditional_density_fun, - dots, - other_args) { - - # unpack the parameters - log_weights <- parameters$log_weights - values_list <- parameters[names(parameters) != "log_weights"] - - # compute the conditional joint density for each value (passing in - # dots too) - log_density_list <- list() - for (i in seq_along(values_list)) { - args <- c(list(values_list[[i]]), dots) - log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) - } - - # expand and combine the densities - log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) - log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) - log_density_vec <- tf$concat(log_density_list, axis = 1L) - - # compute a weighted sum - log_density_weighted_vec <- log_density_vec + log_weights - density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) - density - - } - - return_list_function <- function(parameters) { - list(probabilities = exp(parameters$log_weights)) - } - - as_marginaliser(name = "discrete", - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - return_list = return_list_function, - distribution_check = discrete_check, - other_parameters = values_list) + as_marginaliser(values = values, + class = discrete_marginaliser) } @@ -164,420 +602,24 @@ laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50, diagonal_hessian = FALSE) { - # in future: - # - enable warm starts for subsequent steps of the outer inference algorithm - # - enable a non-factorising version (have the user say whether it is - # factorising) - # - handle an iid normal distribution too. - - # the following algorithm for a univariate normal distribution is derived - # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior - # it neeeds checking! 
- - # nolint start - - # with univariate normal and diagonal hessian, iteration is: - # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w) - - # instead work with decentred variable a where z = a * sd ^ 2 + mu - # l = sqrt(1 + w * sd) - # b = w * (z - mu) * d1(z) - # a_diff = b − (w * sd * b) / l ^ 2 - a - # a_new = a + s * a_diff - # z_new = a_new * sd ^ 2 + mu - - # estimated sd at mode is: - # sqrt(1 / (1 / sd + w)) - - # the approximated negative marginal density is: - # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) - - # nolint end - - if (!(is.numeric(tolerance) && - is.vector(tolerance) && - length(tolerance) == 1 && - tolerance > 0)) { - stop("'tolerance' must be a positive, scalar numeric value") - } - - max_iterations <- as.integer(max_iterations) - if (!(is.vector(max_iterations) && - length(max_iterations) == 1 && - max_iterations > 0)) { - stop("'max_iterations' must be a positive, scalar integer value") - } - - # function to compute the parameter (log probabilities) of the marginalisation - # define the marginalisation function - tf_compute_laplace_parameters <- function(mu, - sigma, - ..., - tf_conditional_density_fun, - diagonal_hessian) { - - dots <- list(...) - - # simpler interface to conditional density. If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } - - # get the vectors of first and second derivatives of the conditional density - # function, w.r.t. the variable being marginalised - if (diagonal_hessian) { - derivs <- function(z) { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$gradients(d1, z)[[1]] - list(d1, d2) - } - } else { - - stop("inference with non-diagonal hessians is not yet implemented", - call. = FALSE) + as_marginaliser(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + class = laplace_marginaliser) - derivs <- function(z) { - y <- d0(z, reduce = FALSE) - d1 <- tf$gradients(y, z)[[1]] - d2 <- tf$hessians(y, z)[[1]] # this won't work! 
- list(d1, d2) - } - } - - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } - - # tensors for parameters of MVN distribution (make mu column vector now) - mu <- tf_transpose(mu) - - # dimension of the MVN distribution - n <- dim(mu)[[2]] - - # here z is a *column vector* to simplify later calculations, it needs to be - # transposed to a row vector before feeding into the likelihood function(s) - z <- tf$identity(mu) - - # Newton-Raphson parameters - tol <- tf$constant(tolerance, tf_float(), shape(1)) - obj_old <- tf$constant(Inf, tf_float(), shape(1)) - iter <- tf$constant(0L) - maxiter <- tf$constant(max_iterations) - - # other objects - a_value <- add_first_dim(as_2d_array(rep(0, n))) - a <- fl(a_value) - u_value <- add_first_dim(diag(n)) - u <- fl(u_value) - eye <- fl(add_first_dim(diag(n))) - - # match batches on everything going into the loop that will have a batch - # dimension later - objects <- list(mu, sigma, z, a, u, obj_old, tol) - objects <- greta:::match_batches(objects) - mu <- objects[[1]] - sigma <- objects[[2]] - z <- objects[[3]] - a <- objects[[4]] - u <- objects[[5]] - obj_old <- objects[[6]] - tol <- objects[[7]] - - obj <- -d0(z) - - # get the batch dim explicitly, for the step size optimisation - batch_dim <- tf$shape(mu)[[0]] - batch_dim <- tf$expand_dims(batch_dim, 0L) - - # tensorflow while loop to do Newton-Raphson iterations - body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { - - obj_old <- obj - - deriv <- derivs(z) - d1 <- deriv[[1]] - d2 <- deriv[[2]] - - # curvature of the likelihood - w <- -d2 - rw <- sqrt(w) - - # decentred values of z - cf <- z - mu - - # approximate posterior covariance & cholesky factor (using the matrix inverse - # lemma for numerical stability) - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - u <- tf$cholesky(mat1) - l <- tf_transpose(u) - - # compute Newton-Raphson update direction - b <- w * cf + d1 - mat2 <- rw * tf$matmul(sigma, b) - mat3 <- tf$matrix_triangular_solve(u, mat2) - adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a - - # use golden section search to find the optimum distance to step in this - # direction, for each batch simultaneously - psiline <- function(s) { - s <- tf$expand_dims(s, 1L) - s <- tf$expand_dims(s, 2L) - a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a_new) + mu - psi(a_new, z_new, mu) - } - - ls_results <- gss(psiline, batch_dim, upper = 2) - stepsize <- ls_results$minimum - stepsize <- tf$expand_dims(stepsize, 1L) - stepsize <- tf$expand_dims(stepsize, 2L) - - # do the update and compute new z and objective - a_new <- a + stepsize * adiff - z_new <- tf$matmul(sigma, a_new) + mu - obj <- psi(a_new, z_new, mu) - - list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) - - } - - cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { - not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) - in_time <- tf$less(iter, maxiter) - in_time & not_all_converged - } - - values <- list(z, - a, - u, - obj_old, - obj, - tol, - iter, - maxiter) - - # run the Newton-Raphson optimisation to find the posterior mode of z - out <- tf$while_loop(cond, body, values) - - a <- out[[2]] - - # apparently we need to redefine z etc. here, or the backprop errors - - # lots of duplicated code; this could be tidied up, but I ran out of time! 
- z <- tf$matmul(sigma, a) + mu - - # curvature of the likelihood at the mode - deriv <- derivs(z) - d2 <- deriv[[2]] - w <- -d2 - rw <- sqrt(w) - hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) - - # approximate posterior covariance - covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) - - # log-determinant of l - mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye - l <- tf$cholesky(mat1) - l_diag <- tf$matrix_diag_part(l) - logdet <- tf_sum(tf$log(l_diag)) - - # convergence information - iter <- out[[7]] - converged <- tf$less(iter, maxiter) - - # return a list of these things - list(z = z, - mu = mu, - a = a, - logdet = logdet, - covar = covar, - iterations = iter, - converged = converged) - - } - - # named list of operation greta arrays for the marginalisation parameters - compute_parameters <- function(conditional_density_fun, - distribution_node, - dots, - marginaliser) { - - # get greta arrays for parameters of distribution node - mean <- distribution_node$parameters$mean - sigma <- distribution_node$parameters$sigma - - # run the laplace approximation fitting and get a list of parameters in a - # mock greta array (just representing a tf list) - args <- c(operation = "marginalisation_parameters", - mu = mean, - sigma = sigma, - dots, - list( - operation_args = list( - tf_conditional_density_fun = conditional_density_fun, - diagonal_hessian = marginaliser$other_args$diagonal_hessian - ), - tf_operation = "tf_compute_laplace_parameters", - dim = 1 - )) - - parameter_list <- do.call(op, args) - - # extract the elements to operation greta arrays with the correct shapes - z <- op("z", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("z")) - - a <- op("a", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("a")) - - mu <- op("mu", - parameter_list, - dim = dim(mean), - tf_operation = "get_element", - operation_args = list("mu")) - - logdet <- op("log determinant", - parameter_list, - tf_operation = "get_element", - operation_args = list("logdet")) - - covar <- op("covar", - parameter_list, - dim = dim(sigma), - tf_operation = "get_element", - operation_args = list("covar")) - - iterations <- op("iterations", - parameter_list, - tf_operation = "get_element", - operation_args = list("iterations")) - - converged <- op("converged", - parameter_list, - tf_operation = "get_element", - operation_args = list("converged")) - - # pull out the elements - list(z = z, - a = a, - mu = mu, - logdet = logdet, - covar = covar, - iterations = iterations, - converged = converged) - - } - - tf_marginalisation_density <- function(parameters, - tf_conditional_density_fun, - dots, - other_args) { - - diagonal_hessian <- other_args$diagonal_hessian - - # simpler interface to conditional density. 
If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } - - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } - - mu <- parameters$mu - logdet <- parameters$logdet - z <- parameters$z - a <- parameters$a - - # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) - - -nmcp - - } - - return_list_function <- function(parameters) { - - list(mean = t(parameters$z), - sigma = parameters$covar, - iterations = parameters$iterations, - converged = parameters$converged) - - } - - as_marginaliser(name = "laplace", - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - return_list = return_list_function, - distribution_check = multivariate_normal_check, - other_args = list(diagonal_hessian = diagonal_hessian)) - -} - -# check that the distribution is discrete -discrete_check <- function(distrib) { - if (!distrib$discrete) { - stop("this marginalisation method can only be used ", - "with discrete distributions", - call. = FALSE) - } -} - -# check that the distribution is multiariate normal -multivariate_normal_check <- function(distrib) { - if (distrib$distribution_name != "multivariate_normal") { - stop("the Laplace approximation can only be used ", - "with a multivariate normal distribution", - call. = FALSE) - } } # helper to contruct marginalisers -as_marginaliser <- function(name, - compute_parameters, - tf_marginalisation_density, - distribution_check, - return_list, - other_parameters = list(), - other_args = list()) { - - obj <- list(name = name, - compute_parameters = compute_parameters, - tf_marginalisation_density = tf_marginalisation_density, - distribution_check = distribution_check, - return_list = return_list, - other_args = other_args, - other_parameters = other_parameters) - - class_name <- paste0(name, "_marginaliser") +as_marginaliser <- function(..., class) { + + obj <- list(..., class = class) + class_name <- class$classname class(obj) <- c(class_name, "marginaliser") obj } -#' @noRd -#' @export print.marginaliser <- function(x, ...) 
{ - msg <- paste(x$name, "marginaliser object") + msg <- paste(x$class$classname, "object") cat(msg) } diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 08d2f521..23230f58 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -60,15 +60,24 @@ test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") + y <- 1:3 + fun <- function(x) { + distribution(y) <- normal(x) + } + # greta array, not a numeric expect_error( - discrete_marginalisation(values = variable()), + marginalise(fun, + poisson(3), + discrete_marginalisation(values = variable())), "must be an R numeric vector, not a greta array" ) # not a numeric expect_error( - discrete_marginalisation(values = c("apple", "banana")), + marginalise(fun, + poisson(3), + discrete_marginalisation(values = c("apple", "banana"))), "must be an R numeric vector$" ) @@ -151,21 +160,30 @@ test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") + y <- 1:3 + fun <- function(x) { + distribution(y) <- normal(x) + } + # bad tolerance expect_error( - laplace_approximation(tolerance = -1), + marginalise(fun, + normal(0, 1, dim = 3), + laplace_approximation(tolerance = -1)), "must be a positive, scalar numeric value" ) # bad max iterations expect_error( - laplace_approximation(max_iterations = 0), + marginalise(fun, + normal(0, 1, dim = 3), + laplace_approximation(max_iterations = 0)), "must be a positive, scalar integer value" ) # mismatch with distribution expect_error( - marginalise(I, normal(0, 1), laplace_approximation()), + marginalise(I, beta(2, 2), laplace_approximation()), "can only be used with a multivariate normal distribution" ) @@ -239,19 +257,10 @@ test_that("laplace approximation has correct posterior for univariate normal", { theta_sd <- sqrt(theta_var) analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + # analyse as multivariate normal lik <- function(theta) { distribution(y) <- normal(t(theta), obs_sd) } - - # analyse as univariate normal - out <- marginalise(lik, - normal(mu, sd, dim = 8), - laplace_approximation(diagonal_hessian = TRUE)) - res <- do.call(calculate, out) - laplace_uni <- cbind(mean = res$mean, sd = res$sd) - compare_op(analytic, laplace_uni) - - # analyse as multivariate normal mean <- ones(1, 8) * mu sigma <- diag(8) * sd ^ 2 out <- marginalise(lik, @@ -273,12 +282,6 @@ test_that("laplace approximation has correct posterior for multivariate normal", # y_i ~ N(theta_i, obs_sd_i ^ 2) # theta ~ MVN(mu, sigma) # the posterior for theta is multivariate normal, so laplace should be exact - # Bayes theorum gives: - # p(theta | y) \propto p(y|theta) p(theta) - # which with normal densities is: - # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * - # N(theta_i | mu, sd ^ 2) - # which is equivalent to: # p(theta | y) \propto MNN(theta_mu, theta_sigma) # theta_sigma = (sigma^-1 + diag(1 /obs_sd^2))^-1 # theta_mu = theta_sigma (sigma^-1 mu + diag(1 /obs_sd^2) mean(y))^-1 From 0780a318c8aa0b99f469cee46e986aeed49c7b05 Mon Sep 17 00:00:00 2001 From: Nick Golding Date: Fri, 28 Feb 2020 13:44:09 +1100 Subject: [PATCH 37/38] pass the distribution to the marginaliser and check inputs early --- R/marginalise.R | 19 ++--- R/marginalisers.R | 105 ++++++++++++-------------- tests/testthat/test_marginalisation.R | 39 ++++------ 3 files changed, 71 insertions(+), 92 deletions(-) diff --git a/R/marginalise.R b/R/marginalise.R index 34b393b4..7ec07680 
diff --git a/R/marginalise.R b/R/marginalise.R
index 34b393b4..7ec07680 100644
--- a/R/marginalise.R
+++ b/R/marginalise.R
@@ -91,27 +91,22 @@ marginalise <- function(fun, variable, method, ...) {
          "See ?marginalise for options")
   }
 
-  # construct the marginaliser
-  method_args <- method[which(names(method) != "class")]
-  marginaliser <- do.call(method$class$new, method_args)
-
-  # check the distribution is compatible with the method
-  marginaliser$distribution_check(distribution_node)
-
   # excise the variable from the distribution
   distribution_node$remove_target()
 
+  # construct the marginaliser
+  marginaliser <- do.call(method$class$new, c(distribution_node, method$args))
+
   # turn the greta function into a TF conditional density function; doing
   # something very similar to as_tf_function(), but giving a function that
   # returns a tensorflow scalar for the density (unadjusted since there will be
   # no new variables)
-  conditional_joint_density <- as_conditional_density(fun,
-                                                      c(list(variable), dots))
+  args <- c(list(variable), dots)
+  conditional_joint_density <- as_conditional_density(fun, args)
 
   # get a list of greta arrays for the marginalisation parameters:
   parameters <- marginaliser$compute_parameters(
     conditional_density_fun = conditional_joint_density,
-    distribution_node = distribution_node,
     dots = dots
   )
 
@@ -124,7 +119,6 @@ marginalise <- function(fun, variable, method, ...) {
     marginaliser = marginaliser,
     parameters = parameters,
     conditional_density_fun = conditional_joint_density,
-    distribution_node = distribution_node,
     dots = dots
   )
 
@@ -147,7 +141,6 @@ marginalisation_distribution <- R6Class(
     initialize = function(marginaliser,
                           parameters = parameters,
                           conditional_density_fun,
-                          distribution_node,
                           dots) {
 
       # initialize class, and add methods
@@ -171,7 +164,7 @@ marginalisation_distribution <- R6Class(
 
       # make the distribution the target
       self$remove_target()
-      self$add_target(distribution_node)
+      self$add_target(marginaliser$distribution)
 
     },
 
diff --git a/R/marginalisers.R b/R/marginalisers.R
index d33979bf..87355875 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -7,12 +7,13 @@ marginaliser <- R6Class(
   public = list(
 
     name = "marginaliser",
+    distribution = NULL,
     other_parameters = list(),
     other_args = list(),
 
-    initialize = function(...) {
-      invisible(NULL)
+    initialize = function(distribution, ...) {
+      self$distribution <- distribution
     },
 
     return_list = function(parameters) {
@@ -27,20 +28,16 @@ discrete_marginaliser <- R6Class(
   inherit = marginaliser,
   public = list(
 
-    initialize = function(values) {
+    initialize = function(distribution, values) {
 
-      super$initialize()
-
-      if (!is.vector(values) | !is.numeric(values)) {
-        msg <- "'values' must be an R numeric vector"
-
-        if (inherits(values, "greta_array")) {
-          msg <- paste0(msg, ", not a greta array")
-        }
-
-        stop(msg)
+      if (!distribution$discrete) {
+        stop("this marginalisation method can only be used ",
+             "with discrete distributions",
+             call. = FALSE)
       }
 
+      super$initialize(distribution)
+
       # convert values to a list of data greta arrays
       values_list <- as.list(values)
       values_list <- lapply(values_list, as_data)
@@ -52,7 +49,6 @@ discrete_marginaliser <- R6Class(
     # return a named list of operation greta arrays for the marginalisation
     # parameters
     compute_parameters = function(conditional_density_fun,
-                                  distribution_node,
                                   dots) {
 
       # function to compute the parameter (log probabilities) of the marginalisation
@@ -80,7 +76,7 @@ discrete_marginaliser <- R6Class(
       # list of parameters in a mock greta array (just representing a tf list)
       args <- c(operation = "marginalisation_log_weights",
-                distribution_node,
+                self$distribution,
                 self$other_parameters,
                 tf_operation = "tf_compute_log_weights",
                 list(dim = c(length(self$other_parameters), 1L)))
@@ -122,15 +118,6 @@ discrete_marginaliser <- R6Class(
 
     return_list = function(parameters) {
       list(probabilities = exp(parameters$log_weights))
-    },
-
-    # check that the distribution is discrete
-    distribution_check = function(distrib) {
-      if (!distrib$discrete) {
-        stop("this marginalisation method can only be used ",
-             "with discrete distributions",
-             call. = FALSE)
-      }
     }
 
   )
@@ -146,11 +133,19 @@ laplace_marginaliser <- R6Class(
     diagonal_hessian = NULL,
     multivariate = FALSE,
 
-    initialize = function(tolerance,
+    initialize = function(distribution,
+                          tolerance,
                           max_iterations,
                           diagonal_hessian) {
 
-      super$initialize()
+      name <- distribution$distribution_name
+      if (!name %in% c("normal", "multivariate_normal")) {
+        stop("the Laplace approximation can only be used ",
+             "with a normal or multivariate normal distribution",
+             call. = FALSE)
+      }
+
+      super$initialize(distribution)
 
       # in future:
       # - enable warm starts for subsequent steps of the outer inference algorithm
@@ -182,20 +177,6 @@ laplace_marginaliser <- R6Class(
 
       # nolint end
 
-      if (!(is.numeric(tolerance) &&
-            is.vector(tolerance) &&
-            length(tolerance) == 1 &&
-            tolerance > 0)) {
-        stop("'tolerance' must be a positive, scalar numeric value")
-      }
-
-      max_iterations <- as.integer(max_iterations)
-      if (!(is.vector(max_iterations) &&
-            length(max_iterations) == 1 &&
-            max_iterations > 0)) {
-        stop("'max_iterations' must be a positive, scalar integer value")
-      }
-
       self$other_args <- list(tolerance = tolerance,
                               max_iterations = max_iterations,
                               diagonal_hessian = diagonal_hessian)
@@ -203,9 +184,7 @@ laplace_marginaliser <- R6Class(
     },
 
     # named list of operation greta arrays for the marginalisation parameters
-    compute_parameters = function(conditional_density_fun,
-                                  distribution_node,
-                                  dots) {
+    compute_parameters = function(conditional_density_fun, dots) {
 
       # function to compute the parameters of the marginalisation
       # define the marginalisation function
@@ -406,8 +385,8 @@ laplace_marginaliser <- R6Class(
       }
 
       # get greta arrays for parameters of distribution node
-      mean <- distribution_node$parameters$mean
-      sigma <- distribution_node$parameters$sigma
+      mean <- self$distribution$parameters$mean
+      sigma <- self$distribution$parameters$sigma
 
       # run the laplace approximation fitting and get a list of parameters in a
       # mock greta array (just representing a tf list)
@@ -518,16 +497,6 @@ laplace_marginaliser <- R6Class(
            iterations = parameters$iterations,
            converged = parameters$converged)
 
-    },
-
-    # check that the distribution is normal
-    distribution_check = function(distrib) {
-      name <- distrib$distribution_name
-      if (!name %in% c("multivariate_normal")) {
-        stop("the Laplace approximation can only be used ",
distribution", - call. = FALSE) - } } ) @@ -569,6 +538,16 @@ NULL #' the distribution, that approximation error will be minimal. discrete_marginalisation <- function(values) { + if (!is.vector(values) | !is.numeric(values)) { + msg <- "'values' must be an R numeric vector" + + if (inherits(values, "greta_array")) { + msg <- paste0(msg, ", not a greta array") + } + + stop(msg) + } + as_marginaliser(values = values, class = discrete_marginaliser) @@ -602,6 +581,20 @@ laplace_approximation <- function(tolerance = 1e-6, max_iterations = 50, diagonal_hessian = FALSE) { + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + as_marginaliser(tolerance = tolerance, max_iterations = max_iterations, diagonal_hessian = diagonal_hessian, @@ -612,7 +605,7 @@ laplace_approximation <- function(tolerance = 1e-6, # helper to contruct marginalisers as_marginaliser <- function(..., class) { - obj <- list(..., class = class) + obj <- list(class = class, args = list(...)) class_name <- class$classname class(obj) <- c(class_name, "marginaliser") obj diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R index 23230f58..7377c31f 100644 --- a/tests/testthat/test_marginalisation.R +++ b/tests/testthat/test_marginalisation.R @@ -60,24 +60,15 @@ test_that("discrete_marginalisation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") - y <- 1:3 - fun <- function(x) { - distribution(y) <- normal(x) - } - # greta array, not a numeric expect_error( - marginalise(fun, - poisson(3), - discrete_marginalisation(values = variable())), + discrete_marginalisation(values = variable()), "must be an R numeric vector, not a greta array" ) # not a numeric expect_error( - marginalise(fun, - poisson(3), - discrete_marginalisation(values = c("apple", "banana"))), + discrete_marginalisation(values = c("apple", "banana")), "must be an R numeric vector$" ) @@ -160,31 +151,22 @@ test_that("laplace_approximation errors nicely", { skip_if_not(check_tf_version()) source("helpers.R") - y <- 1:3 - fun <- function(x) { - distribution(y) <- normal(x) - } - # bad tolerance expect_error( - marginalise(fun, - normal(0, 1, dim = 3), - laplace_approximation(tolerance = -1)), + laplace_approximation(tolerance = -1), "must be a positive, scalar numeric value" ) # bad max iterations expect_error( - marginalise(fun, - normal(0, 1, dim = 3), - laplace_approximation(max_iterations = 0)), + laplace_approximation(max_iterations = 0), "must be a positive, scalar integer value" ) # mismatch with distribution expect_error( marginalise(I, beta(2, 2), laplace_approximation()), - "can only be used with a multivariate normal distribution" + "can only be used with a normal or multivariate normal distribution" ) }) @@ -257,6 +239,17 @@ test_that("laplace approximation has correct posterior for univariate normal", { theta_sd <- sqrt(theta_var) analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var)) + # analyse as univariate normal + lik <- function(theta) { + distribution(y) <- normal(theta, obs_sd) + } + out <- marginalise(lik, + normal(mu, sd, dim = 8), + laplace_approximation(diagonal_hessian = TRUE)) + res <- do.call(calculate, out) + laplace_uni <- cbind(mean = res$mean, sd = 

From fe40342cc53c3f42c90be0d0a4cffaa5a86b94ab Mon Sep 17 00:00:00 2001
From: Nick Golding
Date: Fri, 28 Feb 2020 15:22:32 +1100
Subject: [PATCH 38/38] get univariate laplace running (with incorrect results)

---
 R/marginalisers.R | 412 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 324 insertions(+), 88 deletions(-)

diff --git a/R/marginalisers.R b/R/marginalisers.R
index 87355875..3e33c4dc 100644
--- a/R/marginalisers.R
+++ b/R/marginalisers.R
@@ -138,6 +138,9 @@ laplace_marginaliser <- R6Class(
                           max_iterations,
                           diagonal_hessian) {
 
+
+      # check the distribution and determine whether we are working with a
+      # multivariate or univariate normal
       name <- distribution$distribution_name
       if (!name %in% c("normal", "multivariate_normal")) {
         stop("the Laplace approximation can only be used ",
@@ -145,56 +148,36 @@ laplace_marginaliser <- R6Class(
              call. = FALSE)
       }
 
-      super$initialize(distribution)
-
-      # in future:
-      # - enable warm starts for subsequent steps of the outer inference algorithm
-      # - enable a non-factorising version (have the user say whether it is
-      #   factorising)
-      # - handle an iid normal distribution too.
-
-      # the following algorithm for a univariate normal distribution is derived
-      # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior
-      # it neeeds checking!
-
-      # nolint start
-
-      # with univariate normal and diagonal hessian, iteration is:
-      # z_new = mu + (w * (z - mu) + d1(z)) / (1 / sd + w)
-
-      # instead work with decentred variable a where z = a * sd ^ 2 + mu
-      # l = sqrt(1 + w * sd)
-      # b = w * (z - mu) * d1(z)
-      # a_diff = b − (w * sd * b) / l ^ 2 - a
-      # a_new = a + s * a_diff
-      # z_new = a_new * sd ^ 2 + mu
+      self$multivariate <- name == "multivariate_normal"
 
-      # estimated sd at mode is:
-      # sqrt(1 / (1 / sd + w))
-
-      # the approximated negative marginal density is:
-      # nmcp = 0.5 * sum((z - mu) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd))
-
-      # nolint end
+      super$initialize(distribution)
 
       self$other_args <- list(tolerance = tolerance,
                               max_iterations = max_iterations,
-                              diagonal_hessian = diagonal_hessian)
+                              diagonal_hessian = diagonal_hessian,
+                              multivariate = self$multivariate)
 
     },
 
     # named list of operation greta arrays for the marginalisation parameters
     compute_parameters = function(conditional_density_fun, dots) {
+      if (self$multivariate) {
+        self$compute_parameters_multivariate(conditional_density_fun, dots)
+      } else {
+        self$compute_parameters_univariate(conditional_density_fun, dots)
+      }
+    },
+
+    # named list of operation greta arrays for the marginalisation parameters
+    compute_parameters_multivariate = function(conditional_density_fun, dots) {
 
       # function to compute the parameters of the marginalisation
       # define the marginalisation function
-      tf_compute_laplace_parameters = function(mu,
+      tf_compute_laplace_parameters = function(mean,
                                                sigma,
                                                ...,
                                                tf_conditional_density_fun,
-                                               diagonal_hessian,
-                                               tolerance,
-                                               max_iterations) {
+                                               other_args) {
 
         dots <- list(...)
 
        # get the vectors of first and second derivatives of the conditional density
        # function, w.r.t.
the variable being marginalised - if (diagonal_hessian) { + if (other_args$diagonal_hessian) { derivs <- function(z) { y <- d0(z, reduce = FALSE) d1 <- tf$gradients(y, z)[[1]] @@ -230,26 +213,26 @@ laplace_marginaliser <- R6Class( } # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) + psi <- function(a, z, mean) { + p1 <- tf$matmul(tf_transpose(a), z - mean) fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) } - # tensors for parameters of MVN distribution (make mu column vector now) - mu <- tf_transpose(mu) + # tensors for parameters of MVN distribution (make mean column vector now) + mean <- tf_transpose(mean) # dimension of the MVN distribution - n <- dim(mu)[[2]] + n <- dim(mean)[[2]] # here z is a *column vector* to simplify later calculations, it needs to be # transposed to a row vector before feeding into the likelihood function(s) - z <- tf$identity(mu) + z <- tf$identity(mean) # Newton-Raphson parameters - tol <- tf$constant(tolerance, tf_float(), shape(1)) + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) obj_old <- tf$constant(Inf, tf_float(), shape(1)) iter <- tf$constant(0L) - maxiter <- tf$constant(max_iterations) + maxiter <- tf$constant(other_args$max_iterations) # other objects a_value <- add_first_dim(as_2d_array(rep(0, n))) @@ -260,9 +243,9 @@ laplace_marginaliser <- R6Class( # match batches on everything going into the loop that will have a batch # dimension later - objects <- list(mu, sigma, z, a, u, obj_old, tol) + objects <- list(mean, sigma, z, a, u, obj_old, tol) objects <- greta:::match_batches(objects) - mu <- objects[[1]] + mean <- objects[[1]] sigma <- objects[[2]] z <- objects[[3]] a <- objects[[4]] @@ -273,7 +256,7 @@ laplace_marginaliser <- R6Class( obj <- -d0(z) # get the batch dim explicitly, for the step size optimisation - batch_dim <- tf$shape(mu)[[0]] + batch_dim <- tf$shape(mean)[[0]] batch_dim <- tf$expand_dims(batch_dim, 0L) # tensorflow while loop to do Newton-Raphson iterations @@ -290,7 +273,7 @@ laplace_marginaliser <- R6Class( rw <- sqrt(w) # decentred values of z - cf <- z - mu + cf <- z - mean # approximate posterior covariance & cholesky factor (using the matrix inverse # lemma for numerical stability) @@ -310,8 +293,8 @@ laplace_marginaliser <- R6Class( s <- tf$expand_dims(s, 1L) s <- tf$expand_dims(s, 2L) a_new <- a + s * adiff - z_new <- tf$matmul(sigma, a_new) + mu - psi(a_new, z_new, mu) + z_new <- tf$matmul(sigma, a_new) + mean + psi(a_new, z_new, mean) } ls_results <- gss(psiline, batch_dim, upper = 2) @@ -321,8 +304,8 @@ laplace_marginaliser <- R6Class( # do the update and compute new z and objective a_new <- a + stepsize * adiff - z_new <- tf$matmul(sigma, a_new) + mu - obj <- psi(a_new, z_new, mu) + z_new <- tf$matmul(sigma, a_new) + mean + obj <- psi(a_new, z_new, mean) list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) @@ -351,7 +334,7 @@ laplace_marginaliser <- R6Class( # apparently we need to redefine z etc. here, or the backprop errors # lots of duplicated code; this could be tidied up, but I ran out of time! 
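To make the structure of that loop easier to follow, here is a plain-R sketch of one line-searched Newton update on the decentred variable a (where z = sigma a + mean), following the Rasmussen & Williams mode-finding scheme the comments cite (algorithm 3.1 of GPML, generalised to a non-zero mean). The functions d0, d1 and d2 are assumed stand-ins for the conditional log density and its first two (diagonal) derivatives, and stats::optimize() takes the place of the batched golden-section helper gss():

# one line-searched Newton update for the multivariate Laplace mode search
newton_step <- function(z, a, mean, sigma, d0, d1, d2) {

  # curvature of the likelihood, and the Newton direction computed via the
  # matrix inverse lemma, with B = I + rW Sigma rW as in the code above
  w <- -d2(z)
  rw <- sqrt(w)
  b <- w * (z - mean) + d1(z)
  B <- diag(length(z)) + outer(rw, rw) * sigma
  adiff <- b - rw * solve(B, rw * drop(sigma %*% b)) - a

  # negative log posterior of the decentred variable, as in psi() above
  psi <- function(a) {
    z <- drop(sigma %*% a) + mean
    0.5 * sum(a * (z - mean)) - d0(z)
  }

  # line search along adiff, as gss(psiline, upper = 2) does above
  s <- optimize(function(s) psi(a + s * adiff), c(0, 2))$minimum

  a_new <- a + s * adiff
  list(a = a_new, z = drop(sigma %*% a_new) + mean)
}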
-        z <- tf$matmul(sigma, a) + mu
+        z <- tf$matmul(sigma, a) + mean
 
         # curvature of the likelihood at the mode
         deriv <- derivs(z)
@@ -368,6 +351,7 @@ laplace_marginaliser <- R6Class(
         l <- tf$cholesky(mat1)
         l_diag <- tf$matrix_diag_part(l)
         logdet <- tf_sum(tf$log(l_diag))
+        logdet <- tf$squeeze(logdet, 1L)
 
         # convergence information
         iter <- out[[7]]
@@ -375,8 +359,9 @@ laplace_marginaliser <- R6Class(
 
         # return a list of these things
         list(z = z,
-             mu = mu,
+             mean = mean,
              a = a,
+             sigma = sigma,
              logdet = logdet,
              covar = covar,
              iterations = iter,
@@ -391,13 +376,13 @@ laplace_marginaliser <- R6Class(
 
       # run the laplace approximation fitting and get a list of parameters in a
       # mock greta array (just representing a tf list)
       args <- c(operation = "marginalisation_parameters",
-                mu = mean,
+                mean = mean,
                 sigma = sigma,
                 dots,
                 list(
-                  operation_args = c(
+                  operation_args = list(
                     tf_conditional_density_fun = conditional_density_fun,
-                    self$other_args
+                    other_args = self$other_args
                   ),
                   tf_operation = "tf_compute_laplace_parameters",
                   dim = 1
                 ))
@@ -418,12 +403,6 @@ laplace_marginaliser <- R6Class(
               tf_operation = "get_element",
               operation_args = list("a"))
 
-      mu <- op("mu",
-               parameter_list,
-               dim = dim(mean),
-               tf_operation = "get_element",
-               operation_args = list("mu"))
-
       logdet <- op("log determinant",
                    parameter_list,
                    tf_operation = "get_element",
                    operation_args = list("logdet"))
@@ -448,7 +427,7 @@ laplace_marginaliser <- R6Class(
       # pull out the elements
       list(z = z,
            a = a,
-           mu = mu,
+           mean = mean,
            logdet = logdet,
            covar = covar,
            iterations = iterations,
            converged = converged)
 
     },
 
+    # named list of operation greta arrays for the marginalisation parameters
+    compute_parameters_univariate = function(conditional_density_fun, dots) {
+
+      # function to compute the parameters of the marginalisation
+      tf_compute_laplace_parameters = function(mean,
+                                               sd,
+                                               ...,
+                                               tf_conditional_density_fun,
+                                               other_args) {
+
+
+        # the following algorithm for a univariate normal distribution is derived
+        # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior.
+        # It needs checking!
+
+        # nolint start
+
+        # with univariate normal and diagonal hessian, iteration is:
+        # z_diff = mean + (w * (z - mean) + d1(z)) / (1 / sd + w) - z
+        # z_new = z + s * z_diff
+
+        # # instead work with decentred variable a where z = a * sd ^ 2 + mean
+        # # l = sqrt(1 + w * sd)
+        # # b = w * (z - mean) * d1(z)
+        # # a_diff = b − (w * sd * b) / l ^ 2 - a
+        # # a_new = a + s * a_diff
+        # # z_new = a_new * sd ^ 2 + mean
+
+        # estimated sd at mode is:
+        # sqrt(1 / (1 / sd + w))
+
+        # the approximated negative marginal density is:
+        # nmcp = 0.5 * sum((z - mean) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd))
+
+        # nolint end
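A plain-R rendering of the iteration in that comment block may help. This is a minimal sketch that drops the line search and batching, assumes d0, d1 and d2 are the conditional log density and its derivatives, and uses sd exactly as the comments do (the author flags the algebra as unchecked, and the commit message notes the results are not yet correct):

# Newton-Raphson mode search for the univariate Laplace approximation
laplace_mode_univariate <- function(d0, d1, d2, mean, sd,
                                    tolerance = 1e-6, max_iterations = 50) {
  z <- mean
  obj_old <- Inf
  for (iter in seq_len(max_iterations)) {
    w <- -d2(z)
    # the commented iteration, with step size fixed at 1
    z <- mean + (w * (z - mean) + d1(z)) / (1 / sd + w)
    obj <- 0.5 * (z - mean) ^ 2 / sd - d0(z)
    if (abs(obj_old - obj) < tolerance) break
    obj_old <- obj
  }
  list(mode = z,
       sd_at_mode = sqrt(1 / (1 / sd + w)),
       # approximated negative marginal density, as in the comments
       nmcp = obj + 0.5 * log1p(w * sd),
       iterations = iter)
}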
+        dots <- list(...)
+
+        # simpler interface to conditional density. If reduce = TRUE, it does
+        # reduce_sum on component densities
+        d0 <- function(z, reduce = TRUE) {
+          # transpose z to a row vector, which the dag is expecting
+          t_z <- tf_transpose(z)
+          args <- c(list(t_z), dots, list(reduce = reduce))
+          do.call(tf_conditional_density_fun, args)
+        }
+
+        # get the vectors of first and second derivatives of the conditional density
+        # function, w.r.t. the variable being marginalised
+        if (other_args$diagonal_hessian) {
+          derivs <- function(z) {
+            y <- d0(z, reduce = FALSE)
+            d1 <- tf$gradients(y, z)[[1]]
+            d2 <- tf$gradients(d1, z)[[1]]
+            list(d1, d2)
+          }
+        } else {
+
+          stop("inference with non-diagonal hessians is not yet implemented",
+               call. = FALSE)
+
+          derivs <- function(z) {
+            y <- d0(z, reduce = FALSE)
+            d1 <- tf$gradients(y, z)[[1]]
+            d2 <- tf$hessians(y, z)[[1]]  # this won't work!
+            list(d1, d2)
+          }
+        }
+
+        # negative log-posterior for the current value of z under the normal
+        # assumption
+        psi <- function(z, mean, sd) {
+          p1 <- tf_sum((z - mean) ^ 2 / sd)
+          fl(0.5) * tf$squeeze(p1, 1:2) - d0(z)
+        }
+
+        # here z is a *column vector* to simplify later calculations, it needs to be
+        # transposed to a row vector before feeding into the likelihood function(s)
+        z <- tf$identity(mean)
+
+        # Newton-Raphson parameters
+        tol <- tf$constant(other_args$tolerance, tf_float(), shape(1))
+        obj_old <- tf$constant(Inf, tf_float(), shape(1))
+        iter <- tf$constant(0L)
+        maxiter <- tf$constant(other_args$max_iterations)
+
+        # match batches on everything going into the loop that will have a batch
+        # dimension later
+        objects <- list(mean, sd, z, obj_old, tol)
+        objects <- greta:::match_batches(objects)
+        mean <- objects[[1]]
+        sd <- objects[[2]]
+        z <- objects[[3]]
+        obj_old <- objects[[4]]
+        tol <- objects[[5]]
+
+        obj <- -d0(z)
+
+        # get the batch dim explicitly, for the step size optimisation
+        batch_dim <- tf$shape(mean)[[0]]
+        batch_dim <- tf$expand_dims(batch_dim, 0L)
+
+        # tensorflow while loop to do Newton-Raphson iterations
+        body <- function(z, obj_old, obj, tol, iter, maxiter) {
+
+          obj_old <- obj
+
+          deriv <- derivs(z)
+          d1 <- deriv[[1]]
+          d2 <- deriv[[2]]
+
+          # curvature of the likelihood
+          w <- -d2
+
+          # with univariate normal and diagonal hessian, iteration is:
+          z_diff <- mean + (w * (z - mean) + d1) / (fl(1) / sd + w) - z
+
+          # use golden section search to find the optimum distance to step in this
+          # direction, for each batch simultaneously
+          psiline <- function(s) {
+            s <- tf$expand_dims(s, 1L)
+            s <- tf$expand_dims(s, 2L)
+            z_new <- z + s * z_diff
+            psi(z_new, mean, sd)
+          }
+
+          ls_results <- gss(psiline, batch_dim, upper = 2)
+          stepsize <- ls_results$minimum
+          stepsize <- tf$expand_dims(stepsize, 1L)
+          stepsize <- tf$expand_dims(stepsize, 2L)
+
+          # do the update and compute new z and objective
+          z_new <- z + stepsize * z_diff
+          obj <- psi(z_new, mean, sd)
+
+          list(z_new, obj_old, obj, tol, iter + 1L, maxiter)
+
+        }
+
+        cond <- function(z, obj_old, obj, tol, iter, maxiter) {
+          not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj))
+          in_time <- tf$less(iter, maxiter)
+          in_time & not_all_converged
+        }
+
+        values <- list(z,
+                       obj_old,
+                       obj,
+                       tol,
+                       iter,
+                       maxiter)
+
+        # run the Newton-Raphson optimisation to find the posterior mode of z
+        out <- tf$while_loop(cond, body, values)
+
+        # we might need to redefine z etc. if the backprop errors
+        z <- out[[1]]
+
+        # curvature of the likelihood at the mode
+        deriv <- derivs(z)
+        d2 <- deriv[[2]]
+        w <- -d2
+
+        # approximate posterior variance
+        var <- fl(1) / (fl(1) / sd + w)
+
+        # log-determinant of l
+        logdet <- fl(0.5) * tf_sum(tf$math$log1p(w * sd))
+
+        # convergence information
+        iter <- out[[5]]
+        converged <- tf$less(iter, maxiter)
+
+        # return a list of these things
+        list(z = z,
+             var = var,
+             logdet = logdet,
+             iterations = iter,
+             converged = converged)
+
+      }
+
+      # get greta arrays for parameters of distribution node
+      mean <- self$distribution$parameters$mean
+      sd <- self$distribution$parameters$sd
+
+      # run the laplace approximation fitting and get a list of parameters in a
+      # mock greta array (just representing a tf list)
+      args <- c(operation = "marginalisation_parameters",
+                mean = mean,
+                sd = sd,
+                dots,
+                list(
+                  operation_args = list(
+                    tf_conditional_density_fun = conditional_density_fun,
+                    other_args = self$other_args
+                  ),
+                  tf_operation = "tf_compute_laplace_parameters",
+                  dim = 1
+                ))
+
+      parameter_list <- do.call(op, args)
+
+      # extract the elements to operation greta arrays with the correct shapes
+      z <- op("z",
+              parameter_list,
+              dim = self$distribution$dim,
+              tf_operation = "get_element",
+              operation_args = list("z"))
+
+      var <- op("var",
+                parameter_list,
+                dim = self$distribution$dim,
+                tf_operation = "get_element",
+                operation_args = list("var"))
+
+      logdet <- op("log determinant",
+                   parameter_list,
+                   tf_operation = "get_element",
+                   operation_args = list("logdet"))
+
+      iterations <- op("iterations",
+                       parameter_list,
+                       tf_operation = "get_element",
+                       operation_args = list("iterations"))
+
+      converged <- op("converged",
+                      parameter_list,
+                      tf_operation = "get_element",
+                      operation_args = list("converged"))
+
+      # pull out the elements
+      list(z = z,
+           var = var,
+           mean = mean,
+           sd = sd,
+           logdet = logdet,
+           iterations = iterations,
+           converged = converged)
+
+    },
 
     tf_marginalisation_density = function(parameters,
                                           tf_conditional_density_fun,
                                           dots,
                                           other_args) {
 
-      diagonal_hessian <- other_args$diagonal_hessian
+      z <- parameters$z
+      mean <- parameters$mean
+      logdet <- parameters$logdet
 
-      # simpler interface to conditional density.
If reduce = TRUE, it does - # reduce_sum on component densities - d0 <- function(z, reduce = TRUE) { - # transpose z to a row vector, which the dag is expecting - t_z <- tf_transpose(z) - args <- c(list(t_z), dots, list(reduce = reduce)) - do.call(tf_conditional_density_fun, args) - } + # evaluate conditional density at posterior mode of z + args <- c(list(tf_transpose(z)), + dots, + list(reduce = TRUE)) + d0 <- do.call(tf_conditional_density_fun, args) - # negative log-posterior for the current value of z under MVN assumption - psi <- function(a, z, mu) { - p1 <- tf$matmul(tf_transpose(a), z - mu) - fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) - } + # the approximate marginal conditional posterior + if (other_args$multivariate) { - mu <- parameters$mu - logdet <- parameters$logdet - z <- parameters$z - a <- parameters$a + a <- parameters$a + mean <- tf_transpose(mean) + p1 <- tf$matmul(tf_transpose(a), z - mean) + p1 <- tf$squeeze(p1, 1:2) - # the approximate marginal conditional posterior - nmcp <- psi(a, z, mu) + tf$squeeze(logdet, 1) + } else { + + sd <- parameters$sd + p1 <- tf_sum((z - mean) ^ 2 / sd) + } + + nmcp <- fl(0.5) * p1 - d0 + logdet -nmcp }, return_list = function(parameters) { - list(mean = t(parameters$z), - sigma = parameters$covar, - iterations = parameters$iterations, - converged = parameters$converged) + if (self$multivariate) { + + result <- list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + } else { + + result <- list(mean = parameters$z, + sd = sqrt(parameters$var), + iterations = parameters$iterations, + converged = parameters$converged) + + } + + result } @@ -612,6 +846,8 @@ as_marginaliser <- function(..., class) { } +#' @export +#' @noRd print.marginaliser <- function(x, ...) { msg <- paste(x$class$classname, "object") cat(msg)
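 }

With this final patch the univariate path can be exercised end to end. A usage sketch mirroring the univariate test added above (the fixtures here are illustrative, and per the commit message the numbers it produces are not yet correct):

y <- rnorm(8)
obs_sd <- 1

lik <- function(theta) {
  distribution(y) <- normal(theta, obs_sd)
}

out <- marginalise(lik,
                   normal(0, 2, dim = 8),
                   laplace_approximation(diagonal_hessian = TRUE))

# out holds greta arrays for the approximation's mean and sd, plus
# iteration and convergence information (see return_list() above);
# calculate() evaluates them
res <- do.call(calculate, out)
cbind(mean = res$mean, sd = res$sd)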