diff --git a/DESCRIPTION b/DESCRIPTION index 033adad0..17a6a25d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,6 +65,8 @@ Collate: 'internals.R' 'calculate.R' 'callbacks.R' + 'marginalise.R' + 'marginalisers.R' 'simulate.R' 'chol2symm.R' Imports: diff --git a/NAMESPACE b/NAMESPACE index 1514c300..18b30ec7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,7 @@ S3method(plot,greta_model) S3method(print,greta_array) S3method(print,greta_model) S3method(print,initials) +S3method(print,marginaliser) S3method(print,optimiser) S3method(print,sampler) S3method(print,summary.greta_array) @@ -164,6 +165,7 @@ export(cov2cor) export(diag) export(dirichlet) export(dirichlet_multinomial) +export(discrete_marginalisation) export(distribution) export(eigen) export(exponential) @@ -188,12 +190,14 @@ export(iprobit) export(joint) export(l_bfgs_b) export(laplace) +export(laplace_approximation) export(lkj_correlation) export(log10.greta_array) export(log1pe) export(log2.greta_array) export(logistic) export(lognormal) +export(marginalise) export(mcmc) export(mixture) export(model) diff --git a/R/dag_class.R b/R/dag_class.R index bd526083..4b3f4dff 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,11 +54,16 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type info to options, so it can be accessed by - # nodes on definition, without cluncky explicit passing + # temporarily pass float type and batch size info to options, so it can be + # accessed by nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float - on.exit(options(greta_tf_float = old_float_type)) - options(greta_tf_float = self$tf_float) + old_batch_size <- options()$greta_batch_size + + on.exit(options(greta_tf_float = old_float_type, + greta_batch_size = old_batch_size)) + + options(greta_tf_float = self$tf_float, + greta_batch_size = self$tf_environment$batch_size) with(self$tf_graph$as_default(), expr) }, @@ -375,7 +380,7 @@ dag_class <- R6Class( }, # define tensor for overall log density and gradients - define_joint_density = function() { + define_joint_density = function(adjusted = TRUE) { tfe <- self$tf_environment @@ -392,6 +397,12 @@ dag_class <- R6Class( target_nodes, SIMPLIFY = FALSE) + # assign the un-reduced densities, for use in marginalisation + names(densities) <- NULL + assign("component_densities", + densities, + envir = self$tf_environment) + # reduce_sum each of them (skipping the batch dimension) self$on_graph(summed_densities <- lapply(densities, tf_sum, drop = TRUE)) @@ -404,24 +415,28 @@ dag_class <- R6Class( joint_density, envir = self$tf_environment) - # define adjusted joint density + if (adjusted) { - # get names of Jacobian adjustment tensors for all variable nodes - adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") + # get names of adjustment tensors for all variable nodes + adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") - # get TF density tensors for all distribution - adj <- lapply(adj_names, get, envir = self$tf_environment) + # get TF density tensors for all distribution + adj <- lapply(adj_names, get, envir = self$tf_environment) - # remove their names and sum them together (accounting for tfp bijectors - # sometimes returning a scalar tensor) - names(adj) <- NULL - adj <- match_batches(adj) - self$on_graph(total_adj <- tf$add_n(adj)) + # remove their names and sum them together (accounting for tfp bijectors + # sometimes returning a scalar tensor) + adj <- match_batches(adj) - # assign 
overall density to environment - assign("joint_density_adj", - joint_density + total_adj, - envir = self$tf_environment) + # remove their names and sum them together + names(adj) <- NULL + self$on_graph(total_adj <- tf$add_n(adj)) + + # assign overall density to environment + assign("joint_density_adj", + joint_density + total_adj, + envir = self$tf_environment) + + } }, @@ -429,18 +444,8 @@ dag_class <- R6Class( # target tensor evaluate_density = function(distribution_node, target_node) { - tfe <- self$tf_environment - - parameter_nodes <- distribution_node$parameters - - # get the tensorflow objects for these - distrib_constructor <- self$get_tf_object(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) tf_target <- self$get_tf_object(target_node) - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) self$tf_evaluate_density(tfp_distribution, tf_target, @@ -520,6 +525,9 @@ dag_class <- R6Class( for (name in data_names) tfe[[name]] <- tfe_old[[name]] + # copy the batch size over + tfe$batch_size <- tfe_old$batch_size + # put the free state in the environment, and build out the tf graph tfe$free_state <- free_state self$define_tf_body() @@ -775,25 +783,10 @@ dag_class <- R6Class( }, - # get the tfp distribution object for a distribution node - get_tfp_distribution = function(distrib_node) { - - # build the tfp distribution object for the distribution, and use it - # to get the tensor for the sample - distrib_constructor <- self$get_tf_object(distrib_node) - parameter_nodes <- distrib_node$parameters - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) - - }, - # try to draw a random sample from a distribution node draw_sample = function(distribution_node) { - tfp_distribution <- self$get_tfp_distribution(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) sample <- tfp_distribution$sample diff --git a/R/inference.R b/R/inference.R index 2402c2e4..ddaca382 100644 --- a/R/inference.R +++ b/R/inference.R @@ -291,7 +291,6 @@ run_samplers <- function(samplers, thin <- as.integer(thin) dag <- samplers[[1]]$model$dag - chains <- samplers[[1]]$n_chains n_cores <- check_n_cores(n_cores, length(samplers), plan_is) float_type <- dag$tf_float diff --git a/R/inference_class.R b/R/inference_class.R index a019b398..d49ff011 100644 --- a/R/inference_class.R +++ b/R/inference_class.R @@ -46,8 +46,6 @@ inference <- R6Class( free_parameters <- model$dag$example_parameters(free = TRUE) free_parameters <- unlist_tf(free_parameters) self$n_free <- length(free_parameters) - self$set_initial_values(initial_values) - self$n_traced <- length(model$dag$trace_values(self$free_state)) self$seed <- seed }, @@ -277,8 +275,6 @@ sampler <- R6Class( parameters = parameters, seed = seed) - self$n_chains <- nrow(self$free_state) - # duplicate diag_sd if needed n_diag <- length(self$parameters$diag_sd) n_parameters <- self$n_free @@ -289,6 +285,8 @@ sampler <- R6Class( # define the draws tensor on the tf graph self$define_tf_draws() + self$set_initial_values(initial_values) + self$n_chains <- nrow(self$free_state) }, @@ -1025,6 +1023,8 @@ optimiser <- R6Class( self$create_optimiser_objective() self$create_tf_minimiser() 
+      self$set_initial_values(initial_values)
+      self$n_traced <- length(model$dag$trace_values(self$free_state))
    },
diff --git a/R/joint.R b/R/joint.R
index c61605af..045258a7 100644
--- a/R/joint.R
+++ b/R/joint.R
@@ -124,15 +124,15 @@ joint_distribution <- R6Class(
    tf_distrib = function(parameters, dag) {
-      # get information from the *nodes* for component distributions, not the tf
-      # objects passed in here
+      # get tfp distributions
+      tfp_distributions <- parameters
+      names(tfp_distributions) <- NULL
-      # get tfp distributions, truncations, & bounds of component distributions
+      # get information on truncations, & bounds of component distributions from
+      # the *nodes* for component distributions
      distribution_nodes <- self$parameters
      truncations <- lapply(distribution_nodes, member, "truncation")
      bounds <- lapply(distribution_nodes, member, "bounds")
-      tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution)
-      names(tfp_distributions) <- NULL
      log_prob <- function(x) {
diff --git a/R/marginalise.R b/R/marginalise.R
new file mode 100644
index 00000000..7ec07680
--- /dev/null
+++ b/R/marginalise.R
@@ -0,0 +1,289 @@
+# marginalising random variables
+
+#' @name marginalisation
+#'
+#' @title direct marginalisation of random variables
+#' @description Inference on many statistical models requires marginalisation of
+#'   (i.e. integration of functions over) random variables. In addition to
+#'   general purpose marginalisation methods like MCMC that act on an entire
+#'   model, it is often useful to directly marginalise a single random variable
+#'   within a larger model; e.g. for random variables where marginalisation by
+#'   MCMC is inefficient or impossible, or when an approximate marginalisation is
+#'   substantially more efficient. \code{marginalise()} performs direct
+#'   marginalisation within a greta model, using one of the available
+#'   marginalisation methods, detailed in \link{marginalisers}.
+#'
+#' @examples
+#' \dontrun{
+#' # marginalise a discrete random variable and carry out HMC on the
+#' # continuous variables:
+#' y <- rnorm(100)
+#'
+#' lambda <- lognormal(0, 1)
+#' theta <- normal(0, 1)
+#' sd <- lognormal(0, 1)
+#'
+#' # if discrete variables could be sampled by MCMC in greta,
+#' # we would be able to simply do:
+#' # n <- poisson(lambda)
+#' # mu <- theta ^ n
+#' # distribution(y) <- normal(mu, sd)
+#'
+#' # the function to marginalise over - the variable to marginalise
+#' # must be first, others must be passed to marginalise()
+#' likelihood <- function (n, theta, sd) {
+#'   mu <- theta ^ n
+#'   distribution(y) <- normal(mu, sd)
+#' }
+#'
+#' # integrate the function values w.r.t. the poisson distribution
+#' marginalise(fun = likelihood,
+#'             variable = poisson(lambda),
+#'             method = discrete_marginalisation(values = 0:10),
+#'             theta = theta,
+#'             sd = sd)
+#'
+#' m <- model(lambda)
+#' draws <- mcmc(m, hmc())
+#' }
+NULL
+
+#' @rdname marginalisation
+#' @export
+#'
+#' @param fun an R function to integrate with respect to the random variable.
+#'   The first argument must be a greta array for the random variable, any
+#'   subsequent arguments must be passed in via the \dots argument.
+#' @param variable a variable greta array with a distribution, representing the
+#'   random variable to marginalise
+#' @param method a \code{marginaliser} object giving the method for carrying out
+#'   the marginalisation
+#' @param \dots named greta arrays to be passed to \code{fun}
+#'
+#' @details The code in \code{fun} must define at least one distribution over
+#'   data (i.e. a model likelihood), and cannot create any new variables. Any
+#'   variables must be created outside this function, and passed in via the \dots
+#'   argument.
+marginalise <- function(fun, variable, method, ...) {
+
+  # get the distribution for the random variable
+  distribution_node <- get_node(variable)$distribution
+
+  # check the inputs
+  if (!is.function(fun)) {
+    stop("'fun' must be an R function")
+  }
+
+  # check the additional arguments to fun are given in dots
+  dots <- list(...)
+  dot_names <- names(dots)
+  expected_names <- names(formals(fun)[-1])
+  if (!all(expected_names %in% dot_names)) {
+    stop("all arguments to 'fun' must be passed, named, to marginalise")
+  }
+
+  if (!inherits(distribution_node, "distribution_node")) {
+    stop("'variable' must be a variable greta array with a distribution")
+  }
+
+  if (!inherits(method, "marginaliser")) {
+    stop("'method' must be a valid marginalisation method. ",
+         "See ?marginalise for options")
+  }
+
+  # excise the variable from the distribution
+  distribution_node$remove_target()
+
+  # construct the marginaliser
+  marginaliser <- do.call(method$class$new, c(distribution_node, method$args))
+
+  # turn the greta function into a TF conditional density function; doing
+  # something very similar to as_tf_function(), but giving a function that
+  # returns a tensorflow scalar for the density (unadjusted since there will be
+  # no new variables)
+  args <- c(list(variable), dots)
+  conditional_joint_density <- as_conditional_density(fun, args)
+
+  # get a list of greta arrays for the marginalisation parameters:
+  parameters <- marginaliser$compute_parameters(
+    conditional_density_fun = conditional_joint_density,
+    dots = dots
+  )
+
+  # add on any pre-computed parameters
+  parameters <- c(parameters, marginaliser$other_parameters)
+
+  # create the distribution
+  distribution <- distrib(
+    distribution = "marginalisation",
+    marginaliser = marginaliser,
+    parameters = parameters,
+    conditional_density_fun = conditional_joint_density,
+    dots = dots
+  )
+
+  # return a list of the greta arrays computed during the marginalisation, and
+  # any other things from the method
+  output <- marginaliser$return_list(parameters)
+  invisible(output)
+
+}
+
+marginalisation_distribution <- R6Class(
+  "marginalisation_distribution",
+  inherit = distribution_node,
+  public = list(
+
+    marginaliser = NULL,
+    tf_marginalisation_density = NULL,
+    conditional_density_fun = NULL,
+
+    initialize = function(marginaliser,
+                          parameters = parameters,
+                          conditional_density_fun,
+                          dots) {
+
+      # initialize class, and add methods
+      super$initialize("marginalisation")
+      self$marginaliser <- marginaliser
+      self$conditional_density_fun <- conditional_density_fun
+
+      # add the parameters
+      parameter_names <- names(parameters)
+      for (i in seq_along(parameters)) {
+        self$add_parameter(parameters[[i]],
+                           parameter_names[i])
+      }
+
+      # add the dots (extra inputs to conditional_density_fun) as parameters
+      dot_nodes <- lapply(dots, get_node)
+      for (i in seq_along(dot_nodes)) {
+        self$add_parameter(dot_nodes[[i]],
+                           paste("input", i))
+      }
+
+      # make the distribution the target
+      self$remove_target()
+      self$add_target(marginaliser$distribution)
+
+    },
+
+    tf_distrib = function(parameters, dag) {
+
+      # unpack the parameters here
+      are_dots <- grepl("^input ", names(parameters))
+      dots <- parameters[are_dots]
+      parameters <- parameters[!are_dots]
+
+      # the marginal density implied by the function
+      log_prob <- function(x) {
+
+        density <- self$marginaliser$tf_marginalisation_density
+
density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) + + } + + + list(log_prob = log_prob) + + } + + ) +) + +# turn the greta function into a TF conditional density function; doing +# something very similar to as_tf_function(), but giving a function that +# returns a tensorflow scalar for the density +as_conditional_density <- function(r_fun, args) { + + # run the operation on isolated greta arrays, so nothing gets attached to the + # model. Real greta arrays in args + ga_dummies <- lapply(args, dummy_greta_array) + out <- do.call(r_fun, ga_dummies) + + # we don't want to rely on the function returning a greta array that depends + # on everything internally. instead, we want to grab any densities defined in + # the function. + + # this function will take in tensors corresponding to the things in args + function(..., reduce = TRUE) { + + tensor_inputs <- list(...) + + # if any of these are shapeless, make them into greta scalars (3D) + tensor_inputs <- lapply(tensor_inputs, + function(x) { + if (identical(dim(x), list())) + x <- tf$reshape(x, shape(1, 1, 1)) + x + }) + + # transfer batch dimensions if needed + tensor_inputs <- match_batches(tensor_inputs) + + # create a sub-dag for everything connected to the inputs + sub_dag <- dag_class$new(ga_dummies, tf_float = options()$greta_tf_float) + + # check there are distributions + if (!any(sub_dag$node_types == "distribution")) { + stop("'fun' must constain at least one distribution over data", + call. = FALSE) + } + + # check there are no variables + if (any(sub_dag$node_types == "variable")) { + stop("'fun' must not create any new variables, ", + "variables can be passed in as arguments", + call. = FALSE) + } + + # use the default graph, so that it can be overwritten when this is called + # (using on_graph()) + sub_dag$tf_graph <- tf$get_default_graph() + sub_tfe <- sub_dag$tf_environment + + # pass on the batch size, used when defining data + sub_tfe$batch_size <- get_batch_size() + + # set the input tensors as the values for the dummy greta arrays in the new + # tf_environment + node_dummies <- lapply(ga_dummies, get_node) + tf_names <- lapply(node_dummies, sub_dag$tf_name) + for (i in seq_along(tf_names)) + assign(tf_names[[i]], tensor_inputs[[i]], envir = sub_tfe) + + # have any new data defined as constants, to avoid placeholding issues in + # the wider model + greta_stash$data_as_constants <- TRUE + on.exit(greta_stash$data_as_constants <- NULL) + + # define all nodes (only data, operation and distribution) in the + # environment, and on the graph + sub_dag$on_graph(lapply(sub_dag$node_list, + function(x) x$define_tf(sub_dag))) + + # define and return the joint density tensor (no variables, so no need for + # log jacobian adjustment) + sub_dag$define_joint_density(adjusted = FALSE) + + # whether to reduce_sum the densities, or sum them elementwise + if (reduce) { + result <- sub_tfe$joint_density + } else { + densities <- sub_tfe$component_densities + if (length(densities) > 1) { + names(densities) <- NULL + result <- tf$add_n(densities) + } else { + result <- densities[[1]] + } + } + + result + + } + +} diff --git a/R/marginalisers.R b/R/marginalisers.R new file mode 100644 index 00000000..3e33c4dc --- /dev/null +++ b/R/marginalisers.R @@ -0,0 +1,854 @@ +# marginaliser constructors + +# add a marginaliser R6 class + +marginaliser <- R6Class( + "marginaliser", + public = list( + + name = "marginaliser", + distribution = NULL, + + 
other_parameters = list(), + other_args = list(), + + initialize = function(distribution, ...) { + self$distribution <- distribution + }, + + return_list = function(parameters) { + list() + } + + ) +) + +discrete_marginaliser <- R6Class( + "discrete marginaliser", + inherit = marginaliser, + public = list( + + initialize = function(distribution, values) { + + if (!distribution$discrete) { + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } + + super$initialize(distribution) + + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + self$other_parameters <- values_list + + }, + + # return a named list of operation greta arrays for the marginalisation + # parameters + compute_parameters = function(conditional_density_fun, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) + log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + + log_weights - log_weights_sum + + } + + # list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + self$distribution, + self$other_parameters, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(self$other_parameters), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + }, + + # compute the marginal joint density + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), dots) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + # expand and combine the densities + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + }, + + return_list = function(parameters) { + list(probabilities = exp(parameters$log_weights)) + } + + ) +) + +laplace_marginaliser <- R6Class( + "laplace marginaliser", + inherit = marginaliser, + public = list( + + tolerance = NULL, + max_iterations = NULL, + diagonal_hessian = NULL, + multivariate = FALSE, + + initialize = function(distribution, + tolerance, + max_iterations, + diagonal_hessian) { + + + # check the distribution and determine whether we are witking with a + # multivariate or univariate normal + name <- distribution$distribution_name + if (!name %in% c("normal", "multivariate_normal")) { + stop("the 
Laplace approximation can only be used ", + "with a normal or multivariate normal distribution", + call. = FALSE) + } + + self$multivariate <- name == "multivariate_normal" + + super$initialize(distribution) + + self$other_args <- list(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + multivariate = self$multivariate) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters = function(conditional_density_fun, dots) { + if (self$multivariate) { + self$compute_parameters_multivariate(conditional_density_fun, dots) + } else { + self$compute_parameters_univariate(conditional_density_fun, dots) + } + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters_multivariate = function(conditional_density_fun, dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + # define the marginalisation function + tf_compute_laplace_parameters = function(mean, + sigma, + ..., + tf_conditional_density_fun, + other_args) { + + dots <- list(...) + + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (other_args$diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
+ list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mean) { + p1 <- tf$matmul(tf_transpose(a), z - mean) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # tensors for parameters of MVN distribution (make mean column vector now) + mean <- tf_transpose(mean) + + # dimension of the MVN distribution + n <- dim(mean)[[2]] + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mean) + + # Newton-Raphson parameters + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(other_args$max_iterations) + + # other objects + a_value <- add_first_dim(as_2d_array(rep(0, n))) + a <- fl(a_value) + u_value <- add_first_dim(diag(n)) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mean, sigma, z, a, u, obj_old, tol) + objects <- greta:::match_batches(objects) + mean <- objects[[1]] + sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + u <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mean)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + rw <- sqrt(w) + + # decentred values of z + cf <- z - mean + + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + l <- tf_transpose(u) + + # compute Newton-Raphson update direction + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + a_new <- a + s * adiff + z_new <- tf$matmul(sigma, a_new) + mean + psi(a_new, z_new, mean) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + a_new <- a + stepsize * adiff + z_new <- tf$matmul(sigma, a_new) + mean + obj <- psi(a_new, z_new, mean) + + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + a, + u, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + a <- out[[2]] + + # apparently we need to redefine z etc. here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! 
+ z <- tf$matmul(sigma, a) + mean + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) + + # approximate posterior covariance + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) + + # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) + logdet <- tf$squeeze(logdet, 1L) + + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + mean = mean, + a = a, + sigma = sigma, + logdet = logdet, + covar = covar, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- self$distribution$parameters$mean + sigma <- self$distribution$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mean = mean, + sigma = sigma, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + other_args = self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + a = a, + mean = mean, + logdet = logdet, + covar = covar, + iterations = iterations, + converged = converged) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters_univariate = function(conditional_density_fun, dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_laplace_parameters = function(mean, + sd, + ..., + tf_conditional_density_fun, + other_args) { + + + # the following algorithm for a univariate normal distribution is derived + # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior. + # It needs checking! + + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_diff = mean + (w * (z - mean) + d1(z)) / (1 / sd + w) - z + # z_new = z + s * z_diff + + # # instead work with decentred variable a where z = a * sd ^ 2 + mean + # # l = sqrt(1 + w * sd) + # # b = w * (z - mean) * d1(z) + # # a_diff = b − (w * sd * b) / l ^ 2 - a + # # a_new = a + s * a_diff + # # z_new = a_new * sd ^ 2 + mean + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mean) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + + dots <- list(...) + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (other_args$diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! + list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(z, mean, sd) { + p1 <- tf_sum((z - mean) ^ 2 / sd) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mean) + + # Newton-Raphson parameters + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(other_args$max_iterations) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mean, sd, z, obj_old, tol) + objects <- greta:::match_batches(objects) + mean <- objects[[1]] + sd <- objects[[2]] + z <- objects[[3]] + obj_old <- objects[[4]] + tol <- objects[[5]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mean)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + + # with univariate normal and diagonal hessian, iteration is: + z_diff <- mean + (w * (z - mean) + d1) / (fl(1) / sd + w) - z + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + z_new <- z + s * z_diff + psi(z_new, mean, sd) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + z_new = z + stepsize * z_diff + obj <- psi(z_new, mean, sd) + + list(z_new, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + # we might need to redefine z etc. 
if the backprop won't allow this errors + z <- out[[1]] + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + + # approximate posterior variance + var <- fl(1) / (fl(1) / sd + w) + + # log-determinant of l + logdet <- fl(0.5) * tf_sum(tf$math$log1p(w * sd)) + + # convergence information + iter <- out[[5]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + var = var, + logdet = logdet, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- self$distribution$parameters$mean + sd <- self$distribution$parameters$sd + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mean = mean, + sd = sd, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + other_args = self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = self$distribution$dim, + tf_operation = "get_element", + operation_args = list("z")) + + var <- op("var", + parameter_list, + dim = self$distribution$dim, + tf_operation = "get_element", + operation_args = list("var")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + var = var, + mean = mean, + sd = sd, + logdet = logdet, + iterations = iterations, + converged = converged) + + }, + + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + z <- parameters$z + mean <- parameters$mean + logdet <- parameters$logdet + + # evaluate conditional density at posterior mode of z + args <- c(list(tf_transpose(z)), + dots, + list(reduce = TRUE)) + d0 <- do.call(tf_conditional_density_fun, args) + + # the approximate marginal conditional posterior + if (other_args$multivariate) { + + a <- parameters$a + mean <- tf_transpose(mean) + p1 <- tf$matmul(tf_transpose(a), z - mean) + p1 <- tf$squeeze(p1, 1:2) + + } else { + + sd <- parameters$sd + p1 <- tf_sum((z - mean) ^ 2 / sd) + + } + + nmcp <- fl(0.5) * p1 - d0 + logdet + -nmcp + + }, + + return_list = function(parameters) { + + if (self$multivariate) { + + result <- list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + } else { + + result <- list(mean = parameters$z, + sd = sqrt(parameters$var), + iterations = parameters$iterations, + converged = parameters$converged) + + } + + result + + } + + ) +) + +#' @name marginalisers +#' +#' @title marginalisation methods +#' @description Functions to set up marginalisers (which explicitly integrate +#' out random variables) and modify their behaviour, for use in +#' \code{\link{marginalise}()}. +#' +#' \code{discrete_marginalisation} marginalises discrete random variables via +#' a Riemann sum over a range of plausible values. 
+#'
+#' \code{laplace_approximation} marginalises random variables (with
+#'   multivariate normal priors) approximately, using Newton-Raphson
+#'   optimisation and assuming the posterior follows a multivariate normal
+#'   distribution.
+#'
+#' @return a \code{marginaliser} object that can be passed to
+#'   \code{marginalise}.
+NULL
+
+#' @rdname marginalisers
+#' @export
+#'
+#' @param values an R vector giving values at which to evaluate the function for
+#'   a discrete marginalisation
+#'
+#' @details \code{discrete_marginalisation} can only be used with discrete
+#'   probability distributions, e.g. those defined with \code{poisson()} and
+#'   \code{binomial()}. For discrete distributions with finite support (such as
+#'   \code{bernoulli()}) the marginalisation will be exact, so long as
+#'   \code{values} includes all possible values of the variable. For discrete
+#'   distributions with non-finite support (such as \code{poisson()}, which has
+#'   no upper bound), the marginalisation can only ever be approximate. However
+#'   if \code{values} cover a range of values with sufficiently high support in
+#'   the distribution, that approximation error will be minimal.
+discrete_marginalisation <- function(values) {
+
+  if (!is.vector(values) | !is.numeric(values)) {
+    msg <- "'values' must be an R numeric vector"
+
+    if (inherits(values, "greta_array")) {
+      msg <- paste0(msg, ", not a greta array")
+    }
+
+    stop(msg)
+  }
+
+  as_marginaliser(values = values,
+                  class = discrete_marginaliser)
+
+}
+
+#' @rdname marginalisers
+#' @export
+#'
+#' @param tolerance the (positive) numerical convergence tolerance (in units of
+#'   the log conditional posterior density) of the Newton-Raphson optimisation
+#'   algorithm
+#'
+#' @param max_iterations the (positive) integer-like maximum number of
+#'   iterations of the Newton-Raphson optimisation algorithm
+#'
+#' @param diagonal_hessian whether the Hessian matrix should be assumed to be
+#'   diagonal, to speed up computations. See Details.
+#'
+#' @details \code{laplace_approximation} can only be used to marginalise
+#'   variables following a multivariate normal distribution.
+#'
+#'   The argument \code{diagonal_hessian} can be used to state that the
+#'   conditional density factorises along the elements of the variable being
+#'   marginalised, and therefore the Hessian matrix of this function can be
+#'   assumed to be diagonal. A conditional density function factorises if each
+#'   observation in the conditional density depends only on the corresponding
+#'   element of the variable being marginalised. If this is not the case and you
+#'   set \code{diagonal_hessian = TRUE}, your inferences will be incorrect.
+#' +laplace_approximation <- function(tolerance = 1e-6, + max_iterations = 50, + diagonal_hessian = FALSE) { + + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + + as_marginaliser(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + class = laplace_marginaliser) + +} + +# helper to contruct marginalisers +as_marginaliser <- function(..., class) { + + obj <- list(class = class, args = list(...)) + class_name <- class$classname + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @export +#' @noRd +print.marginaliser <- function(x, ...) { + msg <- paste(x$class$classname, "object") + cat(msg) +} diff --git a/R/mixture.R b/R/mixture.R index 5c96de8f..23f12587 100644 --- a/R/mixture.R +++ b/R/mixture.R @@ -210,17 +210,16 @@ mixture_distribution <- R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions and mixture weights + weights <- parameters$weights + tfp_distributions <- parameters[names(parameters) != "weights"] + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters[names(self$parameters) != "weights"] truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL - - weights <- parameters$weights # use log weights if available if (self$weights_is_log) { diff --git a/R/node_types.R b/R/node_types.R index 281b18e8..519ce3b2 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -56,8 +56,7 @@ data_node <- R6Class( } # expand up to batch size - tiling <- c(tfe$batch_size, rep(1L, ndim)) - batched_tensor <- tf$tile(unbatched_tensor, tiling) + batched_tensor <- tile_to_batch(unbatched_tensor, tfe$batch_size) # put unbatched tensor in environment so it can be set assign(unbatched_name, unbatched_tensor, envir = tfe) @@ -543,9 +542,13 @@ distribution_node <- R6Class( tf = function(dag) { - # assign the distribution object constructor function to the environment + # build a tfp distribution object + tf_parameter_list <- lapply(self$parameters, dag$get_tf_object) + tfp_distribution <- self$tf_distrib(tf_parameter_list, dag = dag) + + # assign it to the environment assign(dag$tf_name(self), - self$tf_distrib, + tfp_distribution, envir = dag$tf_environment) }, diff --git a/R/utils.R b/R/utils.R index bb5b8c40..08c75b48 100644 --- a/R/utils.R +++ b/R/utils.R @@ -164,6 +164,10 @@ check_tf_version <- function(alert = c("none", member <- function(x, method) eval(parse(text = paste0("x$", method))) +get_element <- function(x, element) { + x[[element]] +} + node_type <- function(node) { classes <- class(node) type <- grep("*_node", classes, value = TRUE) @@ -181,6 +185,12 @@ fl <- function(x) { tf$constant(x, dtype = tf_float()) } +# get the tensor for the batch size in the dag currently defining (since it's +# not 
alway possible to pass the dag in) +get_batch_size <- function() { + options()$greta_batch_size +} + # coerce an integer(ish) vector to a list as expected in tensorflow shape # arguments #' @noRd @@ -299,13 +309,19 @@ drop_column_dim <- function(x) { x } +# given a tensor (with batch dimension of size 1) and a batch size tensor, tile +# the tensor to match the batch size +tile_to_batch <- function(x, batch_size) { + ndim <- length(dim(x)) + tf$tile(x, c(batch_size, rep(1L, ndim - 1))) +} + # where x is a tensor with no batch dimension, and y is a tensor with a batch # dimension, tile x to have first dimension matching y (dimension determined at # run time) expand_to_batch <- function(x, y) { batch_size <- tf$shape(y)[[0]] - ndim <- length(dim(x)) - tf$tile(x, c(batch_size, rep(1L, ndim - 1))) + tile_to_batch(x, batch_size) } # does this tensor have a batch dimension (of unknown size) as its first @@ -383,6 +399,113 @@ disable_tensorflow_logging <- function(disable = TRUE) { reticulate::py_set_attr(logger, "disabled", disable) } +# vectorised golden section search algorithm for linesearch (1D constrained +# minimisation) function must accept and return a 1D tensor of values, and +# return a a tensor of the same shape returning objectives; this can therefore +# run linesearch for multiple chains simultaneously. The batch dimension must be +# known in advance however. +gss <- function(func, + batch_dim = 1, + lower = 0, + upper = 1, + max_iterations = 50, + tolerance = .Machine$double.eps ^ 0.25) { + + lower <- tf$constant(lower, tf_float(), shape(1)) + upper <- tf$constant(upper, tf_float(), shape(1)) + maxiter <- tf$constant(as.integer(max_iterations), tf$int32) + tol <- fl(tolerance) + iter <- tf$constant(0L, tf$int32) + golden_ratio <- fl(2 / (sqrt(5) + 1)) + + # expand out to batch dimensions + if (!inherits(batch_dim, "tensorflow.tensor")) { + batch_dim <- as.integer(batch_dim) + batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) + } + + # initial evaluation points + left <- tf$tile(lower, batch_dim) + right <- tf$tile(upper, batch_dim) + width <- right - left + d <- golden_ratio * width + x1 <- right - d + x2 <- left + d + f1 <- func(x1) + f2 <- func(x2) + + values <- list(left, right, x1, x2, f1, f2, width, iter) + + # start loop + body <- function(left, right, x1, x2, f1, f2, width, iter) { + + # prep lists of vectors for whether steps are above x1 or below x2 + # order: lower, upper, x1, x2, width + + # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 + below_width <- x2 - left + below <- list( + left, + x2, + x2 - golden_ratio * below_width, + x1, + below_width + ) + + # if the minimum is above x1, shift the bounds and reuse x2 as the new x1 + above_width <- right - x1 + above <- list( + x1, + right, + x2, + x1 + golden_ratio * above_width, + above_width + ) + + # convert to matrices + below <- tf$stack(below, axis = 1L) + above <- tf$stack(above, axis = 1L) + + is_below <- tf$greater(f2, f1) + status <- tf$where(is_below, below, above) + + left <- status[, 0] + right <- status[, 1] + x1 <- status[, 2] + x2 <- status[, 3] + width <- status[, 4] + + # either recompute f1 (f2) at new location, or use f2 (f1) in its place + + # this is a bit convoluted, but ensures function calls don't need to be + # duplicated, whilst maintaining the vectorisation + x_to_evaluate <- tf$where(is_below, x1, x2) + new_f <- func(x_to_evaluate) + new_f1 <- tf$where(is_below, new_f, f2) + new_f2 <- tf$where(is_below, f1, new_f) + + list(left, right, x1, x2, new_f1, new_f2, 
width, iter + 1L) + } + + cond <- function(left, right, x1, x2, f1, f2, width, iter) { + not_converged <- tf$less(tol, tf$abs(width)) + not_all_converged <- tf$reduce_any(not_converged) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + out <- tf$while_loop(cond, body, values) + + # get minimum value + width <- out[[7]] + min <- out[[1]] + width / fl(2) + + list(minimum = min, + width = width, + iterations = out[[8]]) + +} + pad_vector <- function(x, to_length, with = 1) { pad_by <- to_length - length(x) @@ -416,12 +539,14 @@ misc_module <- module(module, drop_first_dim, tile_first_dim, drop_column_dim, + tile_to_batch, expand_to_batch, has_batch, match_batches, split_chains, hessian_dims, rhex, + gss, disable_tensorflow_logging, pad_vector) @@ -1315,7 +1440,8 @@ as_tf_function <- function(r_fun, ...) { if (!is.list(ga_out)) ga_out <- list(ga_out) targets <- c(ga_out, ga_dummies) - sub_dag <- dag_class$new(targets) + sub_dag <- dag_class$new(targets, + tf_float = options()$greta_tf_float) # use the default graph, so that it can be overwritten when this is called? # alternatively fetch from above, or put it in greta_stash? diff --git a/codemeta.json b/codemeta.json index d0cccf26..418086b3 100644 --- a/codemeta.json +++ b/codemeta.json @@ -405,7 +405,7 @@ ], "releaseNotes": "https://github.com/dill/greta/blob/master/NEWS.md", "readme": "https://github.com/dill/greta/blob/master/README.md", - "fileSize": "494.678KB", + "fileSize": "504.898KB", "contIntegration": [ "https://travis-ci.org/greta-dev/greta", "https://codecov.io/github/greta-dev/greta?branch=master" diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd new file mode 100644 index 00000000..10eb2a0e --- /dev/null +++ b/man/marginalisation.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalise.R +\name{marginalisation} +\alias{marginalisation} +\alias{marginalise} +\title{direct marginalisation of random variables} +\usage{ +marginalise(fun, variable, method, ...) +} +\arguments{ +\item{fun}{an R function to integrate with respect to the random variable. +The first argument must be a greta array for the random variable, any +subsequent arguments must passed in via the \dots argument.} + +\item{variable}{a variable greta array with a distribution, representing the +random variable to marginalise} + +\item{method}{a \code{marginaliser} object giving the method for carrying out +the marginalisation} + +\item{\dots}{named greta arrays to be passed to \code{fun}} +} +\description{ +Inference on many statistical models requires marginalisation of + (ie. integration of functions over) random variables. In addition to + general purpose marginalisation methods like MCMC that act on an entire + model, it is often useful to directly marginalise a single random variable + within a larger model; e.g. for random variables where marginalisation by + MCMC is inefficient, impossible, or when an approximate marginalisation is + substantially more efficient. \code{marginalise()} performs direct + marginalisation within a greta model, using one of the available + marginalisation methods, detailed in \link{marginalisers}. +} +\details{ +The code in \code{fun} must define at least one distribution over + data (ie. a model likelihood), and cannot create any new variables. Any + variables must be created outide this function, and passed in via the \dots + argument. 
+} +\examples{ +\dontrun{ +# marginalise a discrete random variable and carry out HMC on the +# continouous variables: +y <- rnorm(100) + +lambda <- lognormal(0, 1) +theta <- normal(0, 1) +sd <- lognormal(0, 1) + +# if discrete variables could be sampled by MCMC in greta, +# we would be able to simply do: +# n <- poisson(lambda) +# mu <- theta ^ n +# distribution(y) <- normal(mu, sd) + +# the function to marginalise over - the variable to marginalise +# must be first, others must be passed to marginalise() +likelihood <- function (n, theta, sd) { + mu <- theta ^ n + distribution(y) <- normal(mu, sd) +} + +# integrate the function values w.r.t. the poisson distribution +marginalise(fun = likelihood, + variable = poisson(lambda), + method = discrete_marginalisation(values = 0:10), + theta = theta, + sd = sd) + +m <- model(lambda) +draws <- mcmc(m, hmc()) +} +} diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd new file mode 100644 index 00000000..c9268f6e --- /dev/null +++ b/man/marginalisers.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalisers.R +\name{marginalisers} +\alias{marginalisers} +\alias{discrete_marginalisation} +\alias{laplace_approximation} +\title{marginalisation methods} +\usage{ +discrete_marginalisation(values) + +laplace_approximation( + tolerance = 1e-06, + max_iterations = 50, + diagonal_hessian = FALSE +) +} +\arguments{ +\item{values}{an R vector giving values at which to evaluate the function for +a discrete marginalisation} + +\item{tolerance}{the (positive) numerical convergence tolerance (in units of +the log conditional posterior density) of the Newton-Raphson optimisation +algorithm} + +\item{max_iterations}{the (positive) integer-like maximum number of +iterations of the Newton-Raphson optimisation algorithm} + +\item{diagonal_hessian}{whether the Hessian matrix should be assumed to be +diagonal, to speed up computations. See Details.} +} +\value{ +a \code{marginaliser} object that can be passed to + \code{marginalise}. +} +\description{ +Functions to set up marginalisers (which explicitly integrate + out random variables) and modify their behaviour, for use in + \code{\link{marginalise}()}. + + \code{discrete_marginalisation} marginalises discrete random variables via + a Riemann sum over a range of plausible values. + + \code{laplace_approximation} marginalises random variables (with + multivariate normal priors) approximately, using by Newton-Raphson + optimisation and assuming the posterior follows a multivariate normal + distribution. +} +\details{ +\code{discrete_marginalisation} can only be used with discrete + probability distributions, e.g. those defined with \code{poisson()} and + \code{binomial()}. For discrete distributions with finite support (such as + \code{bernoulli()}) the marginalisation will be exact, so long as + \code{values} includes all possible values of the variable. For discrete + distributions with non-finite support (such as \code{poisson()}, which has + no upper bound), the marginalisation can only ever be approximate. However + if \code{values} cover a range of values with sufficiently high support in + the distribution, that approximation error will be minimal. + +\code{laplace_approximation} can only be used to marginalise + variables following a multivariate normal distribution. 
+ + The argument \code{diagonal_hessian} can be used to state that the + conditional density factorises along the elements of the variable being + marginalised, and therefore the Hessian matrix of this function can be + assumed to be diagonal. A conditional density function factorises if each + observation in the conditional density depends only on the corresponding per + element of the variable being marginalised. If this is not the case and you + set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. +} diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index 0e5a47ed..e593b532 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) { new_samples <- function(draws, target_samples = 5000) { neff <- min(coda::effectiveSize(draws)) efficiency <- neff / coda::niter(draws) - 1.2 * (target_samples - neff) / efficiency + 1.5 * (target_samples - neff) / efficiency } not_timed_out <- function(start_time, time_limit = 300) { @@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) { } get_enough_draws <- function(model, - sampler = sampler, n_effective = 5000, + sampler = hmc(), time_limit = 300, - verbose = TRUE, + chains = 50, + verbose = FALSE, one_by_one = FALSE) { start_time <- Sys.time() draws <- mcmc(model, sampler = sampler, + chains = chains, verbose = verbose, one_by_one = one_by_one) @@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE) # get MCMC samples for statistics of the samples (value, variance and @@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { # get absolute errors between posterior means and true values, and scale them # by time-series Monte Carlo standard errors (the expected amount of # uncertainty in the MCMC estimate), to give the number of standard errors - # away from truth. There's a 1/100 chance of any one of these scaled errors - # being greater than qnorm(0.99) if the sampler is correct + # away from truth. 
diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R
index 0e5a47ed..e593b532 100644
--- a/tests/testthat/helpers.R
+++ b/tests/testthat/helpers.R
@@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) {
 new_samples <- function(draws, target_samples = 5000) {
   neff <- min(coda::effectiveSize(draws))
   efficiency <- neff / coda::niter(draws)
-  1.2 * (target_samples - neff) / efficiency
+  1.5 * (target_samples - neff) / efficiency
 }
 
 not_timed_out <- function(start_time, time_limit = 300) {
@@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) {
 }
 
 get_enough_draws <- function(model,
-                             sampler = sampler,
                              n_effective = 5000,
+                             sampler = hmc(),
                              time_limit = 300,
-                             verbose = TRUE,
+                             chains = 50,
+                             verbose = FALSE,
                              one_by_one = FALSE) {
 
   start_time <- Sys.time()
 
   draws <- mcmc(model,
                 sampler = sampler,
+                chains = chains,
                 verbose = verbose,
                 one_by_one = one_by_one)
 
@@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) {
   m <- model(x, precision = "single")
   draws <- get_enough_draws(m,
-                            sampler = sampler,
                             n_effective = n_effective,
+                            sampler = sampler,
                             verbose = FALSE)
 
   # get MCMC samples for statistics of the samples (value, variance and
@@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) {
   # get absolute errors between posterior means and true values, and scale them
   # by time-series Monte Carlo standard errors (the expected amount of
   # uncertainty in the MCMC estimate), to give the number of standard errors
-  # away from truth. There's a 1/100 chance of any one of these scaled errors
-  # being greater than qnorm(0.99) if the sampler is correct
+  # away from truth. There's a 1/200 chance of any one of these scaled errors
+  # being greater than qnorm(1 - 0.005) if the sampler is correct
   errors <- scaled_error(stat_draws, stat_truth)
-  expect_lte(max(errors), qnorm(0.99))
+  expect_lte(max(errors), qnorm(1 - 0.005))
 
 }
 
@@ -958,8 +960,8 @@ check_samples <- function(x,
   m <- model(x, precision = "single")
   draws <- get_enough_draws(m,
-                            sampler = sampler,
                             n_effective = n_effective,
+                            sampler = sampler,
                             verbose = FALSE,
                             one_by_one = one_by_one)
 
@@ -979,6 +981,6 @@ check_samples <- function(x,
   # do a formal hypothesis test
   suppressWarnings(stat <- ks.test(mcmc_samples, iid_samples))
-  testthat::expect_gte(stat$p.value, 0.01)
+  testthat::expect_gte(stat$p.value, 0.005)
 
 }
diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R
new file mode 100644
index 00000000..7377c31f
--- /dev/null
+++ b/tests/testthat/test_marginalisation.R
@@ -0,0 +1,313 @@
+context("marginalisation")
+
+test_that("marginalise errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # not a function
+  expect_error(
+    marginalise("apple", poisson(0.1), discrete_marginalisation(0:2)),
+    "must be an R function"
+  )
+
+  # not a greta array
+  expect_error(
+    marginalise(I, 1:5, discrete_marginalisation(0:2)),
+    "must be a variable greta array with a distribution"
+  )
+
+  # greta array but no distribution
+  expect_error(
+    marginalise(I, variable(), discrete_marginalisation()),
+    "must be a variable greta array with a distribution"
+  )
+
+  # not a marginaliser
+  expect_error(
+    marginalise(I, poisson(0.1), mean),
+    "'method' must be a valid marginalisation method"
+  )
+
+  # the following can only be assessed on final model definition
+
+  # function adds variables
+  fun <- function(x) {
+    x + normal(0, 1)
+  }
+  lambda <- variable()
+  marginalise(fun, poisson(lambda), discrete_marginalisation(0:2))
+  expect_error(
+    mcmc(model(lambda)),
+    "must not create any new variables"
+  )
+
+  # function has no distribution
+  fun <- function(x) {
+    x * 2
+  }
+  lambda <- variable()
+  marginalise(fun, poisson(lambda), discrete_marginalisation(0:2))
+  expect_error(
+    mcmc(model(lambda)),
+    "must contain at least one distribution over data"
+  )
+
+})
+
+test_that("discrete_marginalisation errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # greta array, not a numeric
+  expect_error(
+    discrete_marginalisation(values = variable()),
+    "must be an R numeric vector, not a greta array"
+  )
+
+  # not a numeric
+  expect_error(
+    discrete_marginalisation(values = c("apple", "banana")),
+    "must be an R numeric vector$"
+  )
+
+  # mismatch with distribution
+  expect_error(
+    marginalise(I, normal(0, 1), discrete_marginalisation(values = 1:5)),
+    "can only be used with discrete distributions"
+  )
+
+})
+
+test_that("inference runs with discrete marginalisation", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # spike+slab poisson glm (in reality this is poorly identified)
+  n <- 100
+  x <- runif(n, -5, 5)
+  y <- rpois(n, exp(1))
+  p <- variable(lower = 0, upper = 1)
+  alpha <- variable()
+  beta <- variable()
+  likelihood <- function(z, alpha, beta) {
+    eta <- alpha + beta * x * z
+    lambda <- exp(eta)
+    distribution(y) <- poisson(lambda)
+  }
+  marginalise(likelihood,
+              bernoulli(p),
+              method = discrete_marginalisation(0:1),
+              alpha = alpha,
+              beta = beta)
+  m <- model(alpha, beta, p)
+
+  expect_ok(o <- opt(m))
+  expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE))
+
+})
+
+test_that("discrete marginalisation gives correct densities", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # no constraints, so no adjustment to account for
+  lambda <- variable()
+  y <- rnorm(100)
+  likelihood <- function(z) {
+    distribution(y) <- normal(z, 1)
+  }
+  values <- 0:5
+  marginalise(likelihood,
+              poisson(lambda),
+              method = discrete_marginalisation(values))
+  m <- model(lambda)
+  o <- opt(m)
+
+  lambda_val <- runif(1, 0, 5)
+
+  # manually compute expected marginal likelihood given this value of lambda
+  wt <- dpois(values, lambda_val)
+  wt <- wt / sum(wt)
+  dens <- vapply(y, dnorm, values, 1,
+                 FUN.VALUE = rep(1, length(values)))
+  dens <- apply(dens, 1, prod)
+  expected <- log(sum(wt * dens))
+
+  # compute with greta
+  m$dag$send_parameters(lambda_val)
+  observed <- m$dag$log_density()
+
+  # compare results (within tolerance)
+  compare_op(expected, observed)
+
+})
+
+test_that("laplace_approximation errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # bad tolerance
+  expect_error(
+    laplace_approximation(tolerance = -1),
+    "must be a positive, scalar numeric value"
+  )
+
+  # bad max iterations
+  expect_error(
+    laplace_approximation(max_iterations = 0),
+    "must be a positive, scalar integer value"
+  )
+
+  # mismatch with distribution
+  expect_error(
+    marginalise(I, beta(2, 2), laplace_approximation()),
+    "can only be used with a normal or multivariate normal distribution"
+  )
+
+})
+
+test_that("inference runs with laplace approximation", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # the eight schools model
+  treatment_effects <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  int <- variable()
+  sd <- variable(lower = 0)
+  lik <- function(theta) {
+    distribution(treatment_effects) <- normal(t(theta), treatment_stddevs)
+  }
+
+  # mock up as a multivariate normal distribution
+  sigma <- diag(8) * sd ^ 2
+  mu <- ones(1, 8) * int
+  marginalise(lik,
+              multivariate_normal(mu, sigma),
+              laplace_approximation(diagonal_hessian = TRUE))
+
+  m <- model(int, sd)
+
+  expect_ok(o <- opt(m))
+  expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20,
+                          verbose = FALSE))
+
+})
+
+test_that("laplace approximation has correct posterior for univariate normal", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta_i ~ N(mu, sd ^ 2)
+  # the posterior for theta is normal, so laplace should be exact
+  # Bayes theorem gives:
+  # p(theta | y) \propto p(y | theta) p(theta)
+  # which with normal densities is:
+  # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) *
+  #   N(theta_i | mu, sd ^ 2)
+  # which is equivalent to:
+  # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i)
+  # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2)
+  # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # fixed prior mean and standard deviation for theta
+  mu <- rnorm(1)
+  sd <- abs(rnorm(1))
+
+  # analytic solution
+  obs_prec <- 1 / (obs_sd ^ 2)
+  prec <- 1 / (sd ^ 2)
+  theta_var <- 1 / (obs_prec + prec)
+  theta_mu <- (y * obs_prec + mu * prec) * theta_var
+  theta_sd <- sqrt(theta_var)
+  analytic <- cbind(mean = theta_mu, sd = theta_sd)
+
+  # analyse as univariate normal
+  lik <- function(theta) {
+    distribution(y) <- normal(theta, obs_sd)
+  }
+  out <- marginalise(lik,
+                     normal(mu, sd, dim = 8),
+                     laplace_approximation(diagonal_hessian = TRUE))
+  res <- do.call(calculate, out)
+  laplace_uni <- cbind(mean = res$mean, sd = res$sd)
+  compare_op(analytic, laplace_uni)
+
+  # analyse as multivariate normal
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  mean <- ones(1, 8) * mu
+  sigma <- diag(8) * sd ^ 2
+  out <- marginalise(lik,
+                     multivariate_normal(mean, sigma),
+                     laplace_approximation(diagonal_hessian = TRUE))
+  res <- do.call(calculate, out)
+  laplace_multi <- cbind(mean = res$mean[1, ], sd = sqrt(diag(res$sigma)))
+  compare_op(analytic, laplace_multi)
+
+})
+
+test_that("laplace approximation has correct posterior for multivariate normal", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta ~ MVN(mu, sigma)
+  # the posterior for theta is multivariate normal, so laplace should be exact
+  # p(theta | y) \propto MVN(theta_mu, theta_sigma)
+  # theta_sigma = (sigma^-1 + diag(1 / obs_sd^2))^-1
+  # theta_mu = theta_sigma (sigma^-1 mu + diag(1 / obs_sd^2) y)
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # prior parameters for mu and sigma
+  mu <- rnorm(8)
+  sigma <- rwish(1, 9, diag(8))[1, , ]
+
+  # analytic solution
+  obs_prec <- diag(1 / (obs_sd ^ 2))
+  prec <- solve(sigma)
+  theta_sigma <- solve(prec + obs_prec)
+  theta_mu <- theta_sigma %*% (prec %*% mu + obs_prec %*% y)
+  theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)]
+
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  out <- marginalise(lik,
+                     multivariate_normal(t(mu), sigma),
+                     laplace_approximation(diagonal_hessian = TRUE,
+                                           tolerance = 1e-32))
+  res <- calculate(mean = t(out$mean),
+                   sigma = out$sigma,
+                   iterations = out$iterations)
+  theta_mu_est <- res$mean
+  theta_sigma_est <- res$sigma
+  theta_sigma_est_flat <- theta_sigma_est[upper.tri(theta_sigma_est, diag = TRUE)]
+
+  # compare these to within a tolerance
+  compare_op(theta_mu, theta_mu_est)
+  compare_op(theta_sigma_flat, theta_sigma_est_flat)
+
+})
diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R
index 1bc2bcba..ef0e39e6 100644
--- a/tests/testthat/test_misc.R
+++ b/tests/testthat/test_misc.R
@@ -285,3 +285,25 @@ test_that("module works", {
   expect_identical(names(mod$functions), c("exp", "log", "sum"))
 
 })
+
+test_that("golden section search works", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  n_batch <- 4
+
+  # random optima
+  locs <- runif(n_batch)
+
+  # simple quadratic function with different optima for each batch
+  func <- function(x) {
+    (x - locs) ^ 2
+  }
+
+  # run the minimisation
+  ss <- gss(func, n_batch)
+  result <- tf$Session()$run(ss)
+  compare_op(result$minimum, locs)
+
+})
diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R
index 68bae814..d2e91c5c 100644
--- a/tests/testthat/test_posteriors.R
+++ b/tests/testthat/test_posteriors.R
@@ -4,7 +4,6 @@ test_that("posterior is correct (binomial)", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   # analytic solution to the posterior of the paramter of a binomial
   # distribution, with uniform prior
@@ -15,7 +14,7 @@ test_that("posterior is correct (binomial)", {
   distribution(pos) <- binomial(n, theta)
   m <- model(theta)
 
-  draws <- get_enough_draws(m, hmc(), 2000, verbose = FALSE)
+  draws <- get_enough_draws(m, n_effective = 5000)
 
   samples <- as.matrix(draws)
@@ -33,7 +32,65 @@ test_that("posterior is correct (binomial)", {
   n_draws <- round(coda::effectiveSize(draws))
   comparison <- rbeta(n_draws, shape1, shape2)
   suppressWarnings(test <- ks.test(samples, comparison))
-  expect_gte(test$p.value, 0.01)
+  expect_gte(test$p.value, 0.05)
+
+})
+
+test_that("posterior is correct (normal)", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta_i ~ N(mu, sd ^ 2)
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # fixed prior mean and standard deviation for theta
+  mu <- rnorm(1)
+  sd <- abs(rnorm(1))
+
+  # nolint start
+  # Bayes theorem gives:
+  # p(theta | y) \propto p(y | theta) p(theta)
+  # which with normal densities is:
+  # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2)
+  # which is equivalent to:
+  # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i)
+  # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2)
+  # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  obs_prec <- 1 / (obs_sd ^ 2)
+  prec <- 1 / (sd ^ 2)
+  theta_var <- 1 / (obs_prec + prec)
+  theta_mu <- (y * obs_prec + mu * prec) * theta_var
+  analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var))
+
+  # mcmc solution:
+  theta <- normal(mu, sd, dim = 8)
+  distribution(y) <- normal(theta, obs_sd)
+  m <- model(theta)
+
+  # draw at least 5000 effective samples of each parameter from the posterior
+  draws <- get_enough_draws(m, n_effective = 5000)
+
+  mcmc_stats <- summary(draws)$statistics
+  mcmc <- mcmc_stats[, 1:2]
+  error <- abs(mcmc - analytic)
+  mcmc_se <- mcmc_stats[, 4]
+
+  # ratio of error to that expected due to Monte Carlo noise
+  error_multiples <- error / cbind(mcmc_se, mcmc_se)
+
+  within_error_margin <- error_multiples < qnorm(1 - 0.005)
+  expect_true(all(within_error_margin))
 
 })
 
@@ -41,7 +98,6 @@ test_that("samplers are unbiased for bivariate normals", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   check_mvn_samples(hmc())
   check_mvn_samples(rwmh())
@@ -53,7 +109,6 @@ test_that("samplers are unbiased for chi-squared", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   df <- 5
   x <- chi_squared(df)
@@ -67,7 +122,6 @@ test_that("samplers are unbiased for standard uniform", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   x <- uniform(0, 1)
   iid <- runif
@@ -80,13 +134,12 @@ test_that("samplers are unbiased for LKJ", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   x <- lkj_correlation(3, 2)[1, 2]
   iid <- function(n) rlkjcorr(n, 3, 2)[, 1, 2]
 
-  check_samples(x, iid, hmc(), one_by_one = TRUE)
+  check_samples(x, iid)
 
 })