diff --git a/DESCRIPTION b/DESCRIPTION index 033adad0..17a6a25d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,6 +65,8 @@ Collate: 'internals.R' 'calculate.R' 'callbacks.R' + 'marginalise.R' + 'marginalisers.R' 'simulate.R' 'chol2symm.R' Imports: diff --git a/NAMESPACE b/NAMESPACE index 1514c300..18b30ec7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,7 @@ S3method(plot,greta_model) S3method(print,greta_array) S3method(print,greta_model) S3method(print,initials) +S3method(print,marginaliser) S3method(print,optimiser) S3method(print,sampler) S3method(print,summary.greta_array) @@ -164,6 +165,7 @@ export(cov2cor) export(diag) export(dirichlet) export(dirichlet_multinomial) +export(discrete_marginalisation) export(distribution) export(eigen) export(exponential) @@ -188,12 +190,14 @@ export(iprobit) export(joint) export(l_bfgs_b) export(laplace) +export(laplace_approximation) export(lkj_correlation) export(log10.greta_array) export(log1pe) export(log2.greta_array) export(logistic) export(lognormal) +export(marginalise) export(mcmc) export(mixture) export(model) diff --git a/R/dag_class.R b/R/dag_class.R index bd526083..4b3f4dff 100644 --- a/R/dag_class.R +++ b/R/dag_class.R @@ -54,11 +54,16 @@ dag_class <- R6Class( # float type on_graph = function(expr) { - # temporarily pass float type info to options, so it can be accessed by - # nodes on definition, without cluncky explicit passing + # temporarily pass float type and batch size info to options, so it can be + # accessed by nodes on definition, without clunky explicit passing old_float_type <- options()$greta_tf_float - on.exit(options(greta_tf_float = old_float_type)) - options(greta_tf_float = self$tf_float) + old_batch_size <- options()$greta_batch_size + + on.exit(options(greta_tf_float = old_float_type, + greta_batch_size = old_batch_size)) + + options(greta_tf_float = self$tf_float, + greta_batch_size = self$tf_environment$batch_size) with(self$tf_graph$as_default(), expr) }, @@ -375,7 +380,7 @@ dag_class <- R6Class( }, # define tensor for overall log density and gradients - define_joint_density = function() { + define_joint_density = function(adjusted = TRUE) { tfe <- self$tf_environment @@ -392,6 +397,12 @@ dag_class <- R6Class( target_nodes, SIMPLIFY = FALSE) + # assign the un-reduced densities, for use in marginalisation + names(densities) <- NULL + assign("component_densities", + densities, + envir = self$tf_environment) + # reduce_sum each of them (skipping the batch dimension) self$on_graph(summed_densities <- lapply(densities, tf_sum, drop = TRUE)) @@ -404,24 +415,28 @@ dag_class <- R6Class( joint_density, envir = self$tf_environment) - # define adjusted joint density + if (adjusted) { - # get names of Jacobian adjustment tensors for all variable nodes - adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") + # get names of adjustment tensors for all variable nodes + adj_names <- paste0(self$get_tf_names(types = "variable"), "_adj") - # get TF density tensors for all distribution - adj <- lapply(adj_names, get, envir = self$tf_environment) + # get TF density tensors for all distribution + adj <- lapply(adj_names, get, envir = self$tf_environment) - # remove their names and sum them together (accounting for tfp bijectors - # sometimes returning a scalar tensor) - names(adj) <- NULL - adj <- match_batches(adj) - self$on_graph(total_adj <- tf$add_n(adj)) + # remove their names and sum them together (accounting for tfp bijectors + # sometimes returning a scalar tensor) + adj <- match_batches(adj) - # assign 
overall density to environment - assign("joint_density_adj", - joint_density + total_adj, - envir = self$tf_environment) + # remove their names and sum them together + names(adj) <- NULL + self$on_graph(total_adj <- tf$add_n(adj)) + + # assign overall density to environment + assign("joint_density_adj", + joint_density + total_adj, + envir = self$tf_environment) + + } }, @@ -429,18 +444,8 @@ dag_class <- R6Class( # target tensor evaluate_density = function(distribution_node, target_node) { - tfe <- self$tf_environment - - parameter_nodes <- distribution_node$parameters - - # get the tensorflow objects for these - distrib_constructor <- self$get_tf_object(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) tf_target <- self$get_tf_object(target_node) - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) self$tf_evaluate_density(tfp_distribution, tf_target, @@ -520,6 +525,9 @@ dag_class <- R6Class( for (name in data_names) tfe[[name]] <- tfe_old[[name]] + # copy the batch size over + tfe$batch_size <- tfe_old$batch_size + # put the free state in the environment, and build out the tf graph tfe$free_state <- free_state self$define_tf_body() @@ -775,25 +783,10 @@ dag_class <- R6Class( }, - # get the tfp distribution object for a distribution node - get_tfp_distribution = function(distrib_node) { - - # build the tfp distribution object for the distribution, and use it - # to get the tensor for the sample - distrib_constructor <- self$get_tf_object(distrib_node) - parameter_nodes <- distrib_node$parameters - tf_parameter_list <- lapply(parameter_nodes, self$get_tf_object) - - # execute the distribution constructor functions to return a tfp - # distribution object - tfp_distribution <- distrib_constructor(tf_parameter_list, dag = self) - - }, - # try to draw a random sample from a distribution node draw_sample = function(distribution_node) { - tfp_distribution <- self$get_tfp_distribution(distribution_node) + tfp_distribution <- self$get_tf_object(distribution_node) sample <- tfp_distribution$sample diff --git a/R/inference.R b/R/inference.R index 2402c2e4..ddaca382 100644 --- a/R/inference.R +++ b/R/inference.R @@ -291,7 +291,6 @@ run_samplers <- function(samplers, thin <- as.integer(thin) dag <- samplers[[1]]$model$dag - chains <- samplers[[1]]$n_chains n_cores <- check_n_cores(n_cores, length(samplers), plan_is) float_type <- dag$tf_float diff --git a/R/inference_class.R b/R/inference_class.R index a019b398..d49ff011 100644 --- a/R/inference_class.R +++ b/R/inference_class.R @@ -46,8 +46,6 @@ inference <- R6Class( free_parameters <- model$dag$example_parameters(free = TRUE) free_parameters <- unlist_tf(free_parameters) self$n_free <- length(free_parameters) - self$set_initial_values(initial_values) - self$n_traced <- length(model$dag$trace_values(self$free_state)) self$seed <- seed }, @@ -277,8 +275,6 @@ sampler <- R6Class( parameters = parameters, seed = seed) - self$n_chains <- nrow(self$free_state) - # duplicate diag_sd if needed n_diag <- length(self$parameters$diag_sd) n_parameters <- self$n_free @@ -289,6 +285,8 @@ sampler <- R6Class( # define the draws tensor on the tf graph self$define_tf_draws() + self$set_initial_values(initial_values) + self$n_chains <- nrow(self$free_state) }, @@ -1025,6 +1023,8 @@ optimiser <- R6Class( self$create_optimiser_objective() self$create_tf_minimiser() 
+      self$set_initial_values(initial_values)
+      self$n_traced <- length(model$dag$trace_values(self$free_state))
    },
diff --git a/R/joint.R b/R/joint.R
index c61605af..045258a7 100644
--- a/R/joint.R
+++ b/R/joint.R
@@ -124,15 +124,15 @@ joint_distribution <- R6Class(
    tf_distrib = function(parameters, dag) {
-      # get information from the *nodes* for component distributions, not the tf
-      # objects passed in here
+      # get tfp distributions
+      tfp_distributions <- parameters
+      names(tfp_distributions) <- NULL
-      # get tfp distributions, truncations, & bounds of component distributions
+      # get information on truncations, & bounds of component distributions from
+      # the *nodes* for component distributions
      distribution_nodes <- self$parameters
      truncations <- lapply(distribution_nodes, member, "truncation")
      bounds <- lapply(distribution_nodes, member, "bounds")
-      tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution)
-      names(tfp_distributions) <- NULL
      log_prob <- function(x) {
diff --git a/R/marginalise.R b/R/marginalise.R
new file mode 100644
index 00000000..7ec07680
--- /dev/null
+++ b/R/marginalise.R
@@ -0,0 +1,289 @@
+# marginalising random variables
+
+#' @name marginalisation
+#'
+#' @title direct marginalisation of random variables
+#' @description Inference on many statistical models requires marginalisation of
+#'   (i.e. integration of functions over) random variables. In addition to
+#'   general purpose marginalisation methods like MCMC that act on an entire
+#'   model, it is often useful to directly marginalise a single random variable
+#'   within a larger model; e.g. for random variables where marginalisation by
+#'   MCMC is inefficient or impossible, or when an approximate marginalisation is
+#'   substantially more efficient. \code{marginalise()} performs direct
+#'   marginalisation within a greta model, using one of the available
+#'   marginalisation methods, detailed in \link{marginalisers}.
+#'
+#' @examples
+#' \dontrun{
+#' # marginalise a discrete random variable and carry out HMC on the
+#' # continuous variables:
+#' y <- rnorm(100)
+#'
+#' lambda <- lognormal(0, 1)
+#' theta <- normal(0, 1)
+#' sd <- lognormal(0, 1)
+#'
+#' # if discrete variables could be sampled by MCMC in greta,
+#' # we would be able to simply do:
+#' # n <- poisson(lambda)
+#' # mu <- theta ^ n
+#' # distribution(y) <- normal(mu, sd)
+#'
+#' # the function to marginalise over - the variable to marginalise
+#' # must be first, others must be passed to marginalise()
+#' likelihood <- function (n, theta, sd) {
+#'   mu <- theta ^ n
+#'   distribution(y) <- normal(mu, sd)
+#' }
+#'
+#' # integrate the function values w.r.t. the poisson distribution
+#' marginalise(fun = likelihood,
+#'             variable = poisson(lambda),
+#'             method = discrete_marginalisation(values = 0:10),
+#'             theta = theta,
+#'             sd = sd)
+#'
+#' m <- model(lambda)
+#' draws <- mcmc(m, hmc())
+#' }
+NULL
+
+#' @rdname marginalisation
+#' @export
+#'
+#' @param fun an R function to integrate with respect to the random variable.
+#'   The first argument must be a greta array for the random variable, any
+#'   subsequent arguments must be passed in via the \dots argument.
+#' @param variable a variable greta array with a distribution, representing the
+#'   random variable to marginalise
+#' @param method a \code{marginaliser} object giving the method for carrying out
+#'   the marginalisation
+#' @param \dots named greta arrays to be passed to \code{fun}
+#'
+#' @details The code in \code{fun} must define at least one distribution over
+#'   data (i.e. a model likelihood), and cannot create any new variables. Any
+#'   variables must be created outside this function, and passed in via the \dots
+#'   argument.
+marginalise <- function(fun, variable, method, ...) {
+
+  # get the distribution for the random variable
+  distribution_node <- get_node(variable)$distribution
+
+  # check the inputs
+  if (!is.function(fun)) {
+    stop("'fun' must be an R function")
+  }
+
+  # check the additional arguments to fun are given in dots
+  dots <- list(...)
+  dot_names <- names(dots)
+  expected_names <- names(formals(fun)[-1])
+  if (!all(expected_names %in% dot_names)) {
+    stop("all arguments to 'fun' must be passed, named, to marginalise")
+  }
+
+  if (!inherits(distribution_node, "distribution_node")) {
+    stop("'variable' must be a variable greta array with a distribution")
+  }
+
+  if (!inherits(method, "marginaliser")) {
+    stop("'method' must be a valid marginalisation method. ",
+         "See ?marginalise for options")
+  }
+
+  # excise the variable from the distribution
+  distribution_node$remove_target()
+
+  # construct the marginaliser
+  marginaliser <- do.call(method$class$new, c(distribution_node, method$args))
+
+  # turn the greta function into a TF conditional density function; doing
+  # something very similar to as_tf_function(), but giving a function that
+  # returns a tensorflow scalar for the density (unadjusted since there will be
+  # no new variables)
+  args <- c(list(variable), dots)
+  conditional_joint_density <- as_conditional_density(fun, args)
+
+  # get a list of greta arrays for the marginalisation parameters:
+  parameters <- marginaliser$compute_parameters(
+    conditional_density_fun = conditional_joint_density,
+    dots = dots
+  )
+
+  # add on any pre-computed parameters
+  parameters <- c(parameters, marginaliser$other_parameters)
+
+  # create the distribution
+  distribution <- distrib(
+    distribution = "marginalisation",
+    marginaliser = marginaliser,
+    parameters = parameters,
+    conditional_density_fun = conditional_joint_density,
+    dots = dots
+  )
+
+  # return a list of the greta arrays computed during the marginalisation, and
+  # any other things from the method
+  output <- marginaliser$return_list(parameters)
+  invisible(output)
+
+}
+
+marginalisation_distribution <- R6Class(
+  "marginalisation_distribution",
+  inherit = distribution_node,
+  public = list(
+
+    marginaliser = NULL,
+    tf_marginalisation_density = NULL,
+    conditional_density_fun = NULL,
+
+    initialize = function(marginaliser,
+                          parameters = parameters,
+                          conditional_density_fun,
+                          dots) {
+
+      # initialize class, and add methods
+      super$initialize("marginalisation")
+      self$marginaliser <- marginaliser
+      self$conditional_density_fun <- conditional_density_fun
+
+      # add the parameters
+      parameter_names <- names(parameters)
+      for (i in seq_along(parameters)) {
+        self$add_parameter(parameters[[i]],
+                           parameter_names[i])
+      }
+
+      # add the dots (extra inputs to conditional_density_fun) as parameters
+      dot_nodes <- lapply(dots, get_node)
+      for (i in seq_along(dot_nodes)) {
+        self$add_parameter(dot_nodes[[i]],
+                           paste("input", i))
+      }
+
+      # make the distribution the target
+      self$remove_target()
+      self$add_target(marginaliser$distribution)
+
+    },
+
+    tf_distrib = function(parameters, dag) {
+
+      # unpack the parameters here
+      are_dots <- grepl("^input ", names(parameters))
+      dots <- parameters[are_dots]
+      parameters <- parameters[!are_dots]
+
+      # the marginal density implied by the function
+      log_prob <- function(x) {
+
+        density <- self$marginaliser$tf_marginalisation_density
+
density(parameters = parameters, + tf_conditional_density_fun = self$conditional_density_fun, + dots = dots, + other_args = self$marginaliser$other_args) + + } + + + list(log_prob = log_prob) + + } + + ) +) + +# turn the greta function into a TF conditional density function; doing +# something very similar to as_tf_function(), but giving a function that +# returns a tensorflow scalar for the density +as_conditional_density <- function(r_fun, args) { + + # run the operation on isolated greta arrays, so nothing gets attached to the + # model. Real greta arrays in args + ga_dummies <- lapply(args, dummy_greta_array) + out <- do.call(r_fun, ga_dummies) + + # we don't want to rely on the function returning a greta array that depends + # on everything internally. instead, we want to grab any densities defined in + # the function. + + # this function will take in tensors corresponding to the things in args + function(..., reduce = TRUE) { + + tensor_inputs <- list(...) + + # if any of these are shapeless, make them into greta scalars (3D) + tensor_inputs <- lapply(tensor_inputs, + function(x) { + if (identical(dim(x), list())) + x <- tf$reshape(x, shape(1, 1, 1)) + x + }) + + # transfer batch dimensions if needed + tensor_inputs <- match_batches(tensor_inputs) + + # create a sub-dag for everything connected to the inputs + sub_dag <- dag_class$new(ga_dummies, tf_float = options()$greta_tf_float) + + # check there are distributions + if (!any(sub_dag$node_types == "distribution")) { + stop("'fun' must constain at least one distribution over data", + call. = FALSE) + } + + # check there are no variables + if (any(sub_dag$node_types == "variable")) { + stop("'fun' must not create any new variables, ", + "variables can be passed in as arguments", + call. = FALSE) + } + + # use the default graph, so that it can be overwritten when this is called + # (using on_graph()) + sub_dag$tf_graph <- tf$get_default_graph() + sub_tfe <- sub_dag$tf_environment + + # pass on the batch size, used when defining data + sub_tfe$batch_size <- get_batch_size() + + # set the input tensors as the values for the dummy greta arrays in the new + # tf_environment + node_dummies <- lapply(ga_dummies, get_node) + tf_names <- lapply(node_dummies, sub_dag$tf_name) + for (i in seq_along(tf_names)) + assign(tf_names[[i]], tensor_inputs[[i]], envir = sub_tfe) + + # have any new data defined as constants, to avoid placeholding issues in + # the wider model + greta_stash$data_as_constants <- TRUE + on.exit(greta_stash$data_as_constants <- NULL) + + # define all nodes (only data, operation and distribution) in the + # environment, and on the graph + sub_dag$on_graph(lapply(sub_dag$node_list, + function(x) x$define_tf(sub_dag))) + + # define and return the joint density tensor (no variables, so no need for + # log jacobian adjustment) + sub_dag$define_joint_density(adjusted = FALSE) + + # whether to reduce_sum the densities, or sum them elementwise + if (reduce) { + result <- sub_tfe$joint_density + } else { + densities <- sub_tfe$component_densities + if (length(densities) > 1) { + names(densities) <- NULL + result <- tf$add_n(densities) + } else { + result <- densities[[1]] + } + } + + result + + } + +} diff --git a/R/marginalisers.R b/R/marginalisers.R new file mode 100644 index 00000000..3e33c4dc --- /dev/null +++ b/R/marginalisers.R @@ -0,0 +1,854 @@ +# marginaliser constructors + +# add a marginaliser R6 class + +marginaliser <- R6Class( + "marginaliser", + public = list( + + name = "marginaliser", + distribution = NULL, + + 
other_parameters = list(), + other_args = list(), + + initialize = function(distribution, ...) { + self$distribution <- distribution + }, + + return_list = function(parameters) { + list() + } + + ) +) + +discrete_marginaliser <- R6Class( + "discrete marginaliser", + inherit = marginaliser, + public = list( + + initialize = function(distribution, values) { + + if (!distribution$discrete) { + stop("this marginalisation method can only be used ", + "with discrete distributions", + call. = FALSE) + } + + super$initialize(distribution) + + # convert values to a list of data greta arrays + values_list <- as.list(values) + values_list <- lapply(values_list, as_data) + names(values_list) <- paste0("value_", seq_along(values_list)) + self$other_parameters <- values_list + + }, + + # return a named list of operation greta arrays for the marginalisation + # parameters + compute_parameters = function(conditional_density_fun, + dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_log_weights <- function(tfp_distribution, ...) { + + # and compute log probabilities for the values + values_list <- list(...) + log_weights_list <- lapply(values_list, tfp_distribution$log_prob) + log_weights_list <- lapply(log_weights_list, tf_sum) + + # convert to a vector of discrete probabilities and make them sum to 1, + # whilst staying on the log scale + log_weights <- tf$concat(log_weights_list, axis = 1L) + + # normalise weights on log scale + log_weights_sum <- tf$reduce_logsumexp( + log_weights, + axis = 1L, + keepdims = TRUE + ) + + log_weights - log_weights_sum + + } + + # list of parameters in a mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_log_weights", + self$distribution, + self$other_parameters, + tf_operation = "tf_compute_log_weights", + list(dim = c(length(self$other_parameters), 1L))) + log_weights <- do.call(op, args) + + list(log_weights = log_weights) + + }, + + # compute the marginal joint density + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + # unpack the parameters + log_weights <- parameters$log_weights + values_list <- parameters[names(parameters) != "log_weights"] + + # compute the conditional joint density for each value (passing in + # dots too) + log_density_list <- list() + for (i in seq_along(values_list)) { + args <- c(list(values_list[[i]]), dots) + log_density_list[[i]] <- do.call(tf_conditional_density_fun, args) + } + + # expand and combine the densities + log_density_list <- lapply(log_density_list, tf$expand_dims, 1L) + log_density_list <- lapply(log_density_list, tf$expand_dims, 2L) + log_density_vec <- tf$concat(log_density_list, axis = 1L) + + # compute a weighted sum + log_density_weighted_vec <- log_density_vec + log_weights + density <- tf$reduce_logsumexp(log_density_weighted_vec, axis = 1L) + density + + }, + + return_list = function(parameters) { + list(probabilities = exp(parameters$log_weights)) + } + + ) +) + +laplace_marginaliser <- R6Class( + "laplace marginaliser", + inherit = marginaliser, + public = list( + + tolerance = NULL, + max_iterations = NULL, + diagonal_hessian = NULL, + multivariate = FALSE, + + initialize = function(distribution, + tolerance, + max_iterations, + diagonal_hessian) { + + + # check the distribution and determine whether we are witking with a + # multivariate or univariate normal + name <- distribution$distribution_name + if (!name %in% c("normal", "multivariate_normal")) { + stop("the 
Laplace approximation can only be used ", + "with a normal or multivariate normal distribution", + call. = FALSE) + } + + self$multivariate <- name == "multivariate_normal" + + super$initialize(distribution) + + self$other_args <- list(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + multivariate = self$multivariate) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters = function(conditional_density_fun, dots) { + if (self$multivariate) { + self$compute_parameters_multivariate(conditional_density_fun, dots) + } else { + self$compute_parameters_univariate(conditional_density_fun, dots) + } + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters_multivariate = function(conditional_density_fun, dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + # define the marginalisation function + tf_compute_laplace_parameters = function(mean, + sigma, + ..., + tf_conditional_density_fun, + other_args) { + + dots <- list(...) + + # simpler interface to conditional density. If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (other_args$diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! 
+ list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(a, z, mean) { + p1 <- tf$matmul(tf_transpose(a), z - mean) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # tensors for parameters of MVN distribution (make mean column vector now) + mean <- tf_transpose(mean) + + # dimension of the MVN distribution + n <- dim(mean)[[2]] + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mean) + + # Newton-Raphson parameters + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(other_args$max_iterations) + + # other objects + a_value <- add_first_dim(as_2d_array(rep(0, n))) + a <- fl(a_value) + u_value <- add_first_dim(diag(n)) + u <- fl(u_value) + eye <- fl(add_first_dim(diag(n))) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mean, sigma, z, a, u, obj_old, tol) + objects <- greta:::match_batches(objects) + mean <- objects[[1]] + sigma <- objects[[2]] + z <- objects[[3]] + a <- objects[[4]] + u <- objects[[5]] + obj_old <- objects[[6]] + tol <- objects[[7]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mean)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + rw <- sqrt(w) + + # decentred values of z + cf <- z - mean + + # approximate posterior covariance & cholesky factor (using the matrix inverse + # lemma for numerical stability) + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + u <- tf$cholesky(mat1) + l <- tf_transpose(u) + + # compute Newton-Raphson update direction + b <- w * cf + d1 + mat2 <- rw * tf$matmul(sigma, b) + mat3 <- tf$matrix_triangular_solve(u, mat2) + adiff <- b - rw * tf$matrix_triangular_solve(l, mat3, lower = FALSE) - a + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + a_new <- a + s * adiff + z_new <- tf$matmul(sigma, a_new) + mean + psi(a_new, z_new, mean) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + a_new <- a + stepsize * adiff + z_new <- tf$matmul(sigma, a_new) + mean + obj <- psi(a_new, z_new, mean) + + list(z_new, a_new, u, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, a, u, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + a, + u, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + a <- out[[2]] + + # apparently we need to redefine z etc. here, or the backprop errors + + # lots of duplicated code; this could be tidied up, but I ran out of time! 
+ z <- tf$matmul(sigma, a) + mean + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + rw <- sqrt(w) + hessian <- tf$linalg$diag(tf$squeeze(w, 2L)) + + # approximate posterior covariance + covar <- tf$linalg$inv(tf$linalg$inv(sigma) + hessian) + + # log-determinant of l + mat1 <- tf$matmul(rw, tf_transpose(rw)) * sigma + eye + l <- tf$cholesky(mat1) + l_diag <- tf$matrix_diag_part(l) + logdet <- tf_sum(tf$log(l_diag)) + logdet <- tf$squeeze(logdet, 1L) + + # convergence information + iter <- out[[7]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + mean = mean, + a = a, + sigma = sigma, + logdet = logdet, + covar = covar, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- self$distribution$parameters$mean + sigma <- self$distribution$parameters$sigma + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mean = mean, + sigma = sigma, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + other_args = self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("z")) + + a <- op("a", + parameter_list, + dim = dim(mean), + tf_operation = "get_element", + operation_args = list("a")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + covar <- op("covar", + parameter_list, + dim = dim(sigma), + tf_operation = "get_element", + operation_args = list("covar")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + a = a, + mean = mean, + logdet = logdet, + covar = covar, + iterations = iterations, + converged = converged) + + }, + + # named list of operation greta arrays for the marginalisation parameters + compute_parameters_univariate = function(conditional_density_fun, dots) { + + # function to compute the parameter (log probabilities) of the marginalisation + tf_compute_laplace_parameters = function(mean, + sd, + ..., + tf_conditional_density_fun, + other_args) { + + + # the following algorithm for a univariate normal distribution is derived + # from LA in Rasmussen & Williams for diagonal Hessian & diagonal prior. + # It needs checking! + + # nolint start + + # with univariate normal and diagonal hessian, iteration is: + # z_diff = mean + (w * (z - mean) + d1(z)) / (1 / sd + w) - z + # z_new = z + s * z_diff + + # # instead work with decentred variable a where z = a * sd ^ 2 + mean + # # l = sqrt(1 + w * sd) + # # b = w * (z - mean) * d1(z) + # # a_diff = b − (w * sd * b) / l ^ 2 - a + # # a_new = a + s * a_diff + # # z_new = a_new * sd ^ 2 + mean + + # estimated sd at mode is: + # sqrt(1 / (1 / sd + w)) + + # the approximated negative marginal density is: + # nmcp = 0.5 * sum((z - mean) ^ 2 / sd) - d0(z) + 0.5 * sum(log1p(w * sd)) + + # nolint end + + dots <- list(...) + + # simpler interface to conditional density. 
If reduce = TRUE, it does + # reduce_sum on component densities + d0 <- function(z, reduce = TRUE) { + # transpose z to a row vector, which the dag is expecting + t_z <- tf_transpose(z) + args <- c(list(t_z), dots, list(reduce = reduce)) + do.call(tf_conditional_density_fun, args) + } + + # get the vectors of first and second derivatives of the conditional density + # function, w.r.t. the variable being marginalised + if (other_args$diagonal_hessian) { + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$gradients(d1, z)[[1]] + list(d1, d2) + } + } else { + + stop("inference with non-diagonal hessians is not yet implemented", + call. = FALSE) + + derivs <- function(z) { + y <- d0(z, reduce = FALSE) + d1 <- tf$gradients(y, z)[[1]] + d2 <- tf$hessians(y, z)[[1]] # this won't work! + list(d1, d2) + } + } + + # negative log-posterior for the current value of z under MVN assumption + psi <- function(z, mean, sd) { + p1 <- tf_sum((z - mean) ^ 2 / sd) + fl(0.5) * tf$squeeze(p1, 1:2) - d0(z) + } + + # here z is a *column vector* to simplify later calculations, it needs to be + # transposed to a row vector before feeding into the likelihood function(s) + z <- tf$identity(mean) + + # Newton-Raphson parameters + tol <- tf$constant(other_args$tolerance, tf_float(), shape(1)) + obj_old <- tf$constant(Inf, tf_float(), shape(1)) + iter <- tf$constant(0L) + maxiter <- tf$constant(other_args$max_iterations) + + # match batches on everything going into the loop that will have a batch + # dimension later + objects <- list(mean, sd, z, obj_old, tol) + objects <- greta:::match_batches(objects) + mean <- objects[[1]] + sd <- objects[[2]] + z <- objects[[3]] + obj_old <- objects[[4]] + tol <- objects[[5]] + + obj <- -d0(z) + + # get the batch dim explicitly, for the step size optimisation + batch_dim <- tf$shape(mean)[[0]] + batch_dim <- tf$expand_dims(batch_dim, 0L) + + # tensorflow while loop to do Newton-Raphson iterations + body <- function(z, obj_old, obj, tol, iter, maxiter) { + + obj_old <- obj + + deriv <- derivs(z) + d1 <- deriv[[1]] + d2 <- deriv[[2]] + + # curvature of the likelihood + w <- -d2 + + # with univariate normal and diagonal hessian, iteration is: + z_diff <- mean + (w * (z - mean) + d1) / (fl(1) / sd + w) - z + + # use golden section search to find the optimum distance to step in this + # direction, for each batch simultaneously + psiline <- function(s) { + s <- tf$expand_dims(s, 1L) + s <- tf$expand_dims(s, 2L) + z_new <- z + s * z_diff + psi(z_new, mean, sd) + } + + ls_results <- gss(psiline, batch_dim, upper = 2) + stepsize <- ls_results$minimum + stepsize <- tf$expand_dims(stepsize, 1L) + stepsize <- tf$expand_dims(stepsize, 2L) + + # do the update and compute new z and objective + z_new = z + stepsize * z_diff + obj <- psi(z_new, mean, sd) + + list(z_new, obj_old, obj, tol, iter + 1L, maxiter) + + } + + cond <- function(z, obj_old, obj, tol, iter, maxiter) { + not_all_converged <- tf$reduce_any(tf$less(tol, obj_old - obj)) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + values <- list(z, + obj_old, + obj, + tol, + iter, + maxiter) + + # run the Newton-Raphson optimisation to find the posterior mode of z + out <- tf$while_loop(cond, body, values) + + # we might need to redefine z etc. 
if the backprop won't allow this errors + z <- out[[1]] + + # curvature of the likelihood at the mode + deriv <- derivs(z) + d2 <- deriv[[2]] + w <- -d2 + + # approximate posterior variance + var <- fl(1) / (fl(1) / sd + w) + + # log-determinant of l + logdet <- fl(0.5) * tf_sum(tf$math$log1p(w * sd)) + + # convergence information + iter <- out[[5]] + converged <- tf$less(iter, maxiter) + + # return a list of these things + list(z = z, + var = var, + logdet = logdet, + iterations = iter, + converged = converged) + + } + + # get greta arrays for parameters of distribution node + mean <- self$distribution$parameters$mean + sd <- self$distribution$parameters$sd + + # run the laplace approximation fitting and get a list of parameters in a + # mock greta array (just representing a tf list) + args <- c(operation = "marginalisation_parameters", + mean = mean, + sd = sd, + dots, + list( + operation_args = list( + tf_conditional_density_fun = conditional_density_fun, + other_args = self$other_args + ), + tf_operation = "tf_compute_laplace_parameters", + dim = 1 + )) + + parameter_list <- do.call(op, args) + + # extract the elements to operation greta arrays with the correct shapes + z <- op("z", + parameter_list, + dim = self$distribution$dim, + tf_operation = "get_element", + operation_args = list("z")) + + var <- op("var", + parameter_list, + dim = self$distribution$dim, + tf_operation = "get_element", + operation_args = list("var")) + + logdet <- op("log determinant", + parameter_list, + tf_operation = "get_element", + operation_args = list("logdet")) + + iterations <- op("iterations", + parameter_list, + tf_operation = "get_element", + operation_args = list("iterations")) + + converged <- op("converged", + parameter_list, + tf_operation = "get_element", + operation_args = list("converged")) + + # pull out the elements + list(z = z, + var = var, + mean = mean, + sd = sd, + logdet = logdet, + iterations = iterations, + converged = converged) + + }, + + tf_marginalisation_density = function(parameters, + tf_conditional_density_fun, + dots, + other_args) { + + z <- parameters$z + mean <- parameters$mean + logdet <- parameters$logdet + + # evaluate conditional density at posterior mode of z + args <- c(list(tf_transpose(z)), + dots, + list(reduce = TRUE)) + d0 <- do.call(tf_conditional_density_fun, args) + + # the approximate marginal conditional posterior + if (other_args$multivariate) { + + a <- parameters$a + mean <- tf_transpose(mean) + p1 <- tf$matmul(tf_transpose(a), z - mean) + p1 <- tf$squeeze(p1, 1:2) + + } else { + + sd <- parameters$sd + p1 <- tf_sum((z - mean) ^ 2 / sd) + + } + + nmcp <- fl(0.5) * p1 - d0 + logdet + -nmcp + + }, + + return_list = function(parameters) { + + if (self$multivariate) { + + result <- list(mean = t(parameters$z), + sigma = parameters$covar, + iterations = parameters$iterations, + converged = parameters$converged) + + } else { + + result <- list(mean = parameters$z, + sd = sqrt(parameters$var), + iterations = parameters$iterations, + converged = parameters$converged) + + } + + result + + } + + ) +) + +#' @name marginalisers +#' +#' @title marginalisation methods +#' @description Functions to set up marginalisers (which explicitly integrate +#' out random variables) and modify their behaviour, for use in +#' \code{\link{marginalise}()}. +#' +#' \code{discrete_marginalisation} marginalises discrete random variables via +#' a Riemann sum over a range of plausible values. 
+#'
+#' \code{laplace_approximation} marginalises random variables (with
+#'   multivariate normal priors) approximately, using Newton-Raphson
+#'   optimisation and assuming the posterior follows a multivariate normal
+#'   distribution.
+#'
+#' @return a \code{marginaliser} object that can be passed to
+#'   \code{marginalise}.
+NULL
+
+#' @rdname marginalisers
+#' @export
+#'
+#' @param values an R vector giving values at which to evaluate the function for
+#'   a discrete marginalisation
+#'
+#' @details \code{discrete_marginalisation} can only be used with discrete
+#'   probability distributions, e.g. those defined with \code{poisson()} and
+#'   \code{binomial()}. For discrete distributions with finite support (such as
+#'   \code{bernoulli()}) the marginalisation will be exact, so long as
+#'   \code{values} includes all possible values of the variable. For discrete
+#'   distributions with non-finite support (such as \code{poisson()}, which has
+#'   no upper bound), the marginalisation can only ever be approximate. However
+#'   if \code{values} cover a range of values with sufficiently high support in
+#'   the distribution, that approximation error will be minimal.
+discrete_marginalisation <- function(values) {
+
+  if (!is.vector(values) | !is.numeric(values)) {
+    msg <- "'values' must be an R numeric vector"
+
+    if (inherits(values, "greta_array")) {
+      msg <- paste0(msg, ", not a greta array")
+    }
+
+    stop(msg)
+  }
+
+  as_marginaliser(values = values,
+                  class = discrete_marginaliser)
+
+}
+
+#' @rdname marginalisers
+#' @export
+#'
+#' @param tolerance the (positive) numerical convergence tolerance (in units of
+#'   the log conditional posterior density) of the Newton-Raphson optimisation
+#'   algorithm
+#'
+#' @param max_iterations the (positive) integer-like maximum number of
+#'   iterations of the Newton-Raphson optimisation algorithm
+#'
+#' @param diagonal_hessian whether the Hessian matrix should be assumed to be
+#'   diagonal, to speed up computations. See Details.
+#'
+#' @details \code{laplace_approximation} can only be used to marginalise
+#'   variables following a multivariate normal distribution.
+#'
+#'   The argument \code{diagonal_hessian} can be used to state that the
+#'   conditional density factorises along the elements of the variable being
+#'   marginalised, and therefore the Hessian matrix of this function can be
+#'   assumed to be diagonal. A conditional density function factorises if each
+#'   observation in the conditional density depends only on the corresponding
+#'   element of the variable being marginalised. If this is not the case and you
+#'   set \code{diagonal_hessian = TRUE}, your inferences will be incorrect.
+#' +laplace_approximation <- function(tolerance = 1e-6, + max_iterations = 50, + diagonal_hessian = FALSE) { + + if (!(is.numeric(tolerance) && + is.vector(tolerance) && + length(tolerance) == 1 && + tolerance > 0)) { + stop("'tolerance' must be a positive, scalar numeric value") + } + + max_iterations <- as.integer(max_iterations) + if (!(is.vector(max_iterations) && + length(max_iterations) == 1 && + max_iterations > 0)) { + stop("'max_iterations' must be a positive, scalar integer value") + } + + as_marginaliser(tolerance = tolerance, + max_iterations = max_iterations, + diagonal_hessian = diagonal_hessian, + class = laplace_marginaliser) + +} + +# helper to contruct marginalisers +as_marginaliser <- function(..., class) { + + obj <- list(class = class, args = list(...)) + class_name <- class$classname + class(obj) <- c(class_name, "marginaliser") + obj + +} + +#' @export +#' @noRd +print.marginaliser <- function(x, ...) { + msg <- paste(x$class$classname, "object") + cat(msg) +} diff --git a/R/mixture.R b/R/mixture.R index 5c96de8f..23f12587 100644 --- a/R/mixture.R +++ b/R/mixture.R @@ -210,17 +210,16 @@ mixture_distribution <- R6Class( tf_distrib = function(parameters, dag) { - # get information from the *nodes* for component distributions, not the tf - # objects passed in here + # get tfp distributions and mixture weights + weights <- parameters$weights + tfp_distributions <- parameters[names(parameters) != "weights"] + names(tfp_distributions) <- NULL - # get tfp distributions, truncations, & bounds of component distributions + # get information on truncations, & bounds of component distributions from + # the *nodes* for component distributions distribution_nodes <- self$parameters[names(self$parameters) != "weights"] truncations <- lapply(distribution_nodes, member, "truncation") bounds <- lapply(distribution_nodes, member, "bounds") - tfp_distributions <- lapply(distribution_nodes, dag$get_tfp_distribution) - names(tfp_distributions) <- NULL - - weights <- parameters$weights # use log weights if available if (self$weights_is_log) { diff --git a/R/node_types.R b/R/node_types.R index 281b18e8..519ce3b2 100644 --- a/R/node_types.R +++ b/R/node_types.R @@ -56,8 +56,7 @@ data_node <- R6Class( } # expand up to batch size - tiling <- c(tfe$batch_size, rep(1L, ndim)) - batched_tensor <- tf$tile(unbatched_tensor, tiling) + batched_tensor <- tile_to_batch(unbatched_tensor, tfe$batch_size) # put unbatched tensor in environment so it can be set assign(unbatched_name, unbatched_tensor, envir = tfe) @@ -543,9 +542,13 @@ distribution_node <- R6Class( tf = function(dag) { - # assign the distribution object constructor function to the environment + # build a tfp distribution object + tf_parameter_list <- lapply(self$parameters, dag$get_tf_object) + tfp_distribution <- self$tf_distrib(tf_parameter_list, dag = dag) + + # assign it to the environment assign(dag$tf_name(self), - self$tf_distrib, + tfp_distribution, envir = dag$tf_environment) }, diff --git a/R/utils.R b/R/utils.R index bb5b8c40..08c75b48 100644 --- a/R/utils.R +++ b/R/utils.R @@ -164,6 +164,10 @@ check_tf_version <- function(alert = c("none", member <- function(x, method) eval(parse(text = paste0("x$", method))) +get_element <- function(x, element) { + x[[element]] +} + node_type <- function(node) { classes <- class(node) type <- grep("*_node", classes, value = TRUE) @@ -181,6 +185,12 @@ fl <- function(x) { tf$constant(x, dtype = tf_float()) } +# get the tensor for the batch size in the dag currently defining (since it's +# not 
alway possible to pass the dag in) +get_batch_size <- function() { + options()$greta_batch_size +} + # coerce an integer(ish) vector to a list as expected in tensorflow shape # arguments #' @noRd @@ -299,13 +309,19 @@ drop_column_dim <- function(x) { x } +# given a tensor (with batch dimension of size 1) and a batch size tensor, tile +# the tensor to match the batch size +tile_to_batch <- function(x, batch_size) { + ndim <- length(dim(x)) + tf$tile(x, c(batch_size, rep(1L, ndim - 1))) +} + # where x is a tensor with no batch dimension, and y is a tensor with a batch # dimension, tile x to have first dimension matching y (dimension determined at # run time) expand_to_batch <- function(x, y) { batch_size <- tf$shape(y)[[0]] - ndim <- length(dim(x)) - tf$tile(x, c(batch_size, rep(1L, ndim - 1))) + tile_to_batch(x, batch_size) } # does this tensor have a batch dimension (of unknown size) as its first @@ -383,6 +399,113 @@ disable_tensorflow_logging <- function(disable = TRUE) { reticulate::py_set_attr(logger, "disabled", disable) } +# vectorised golden section search algorithm for linesearch (1D constrained +# minimisation) function must accept and return a 1D tensor of values, and +# return a a tensor of the same shape returning objectives; this can therefore +# run linesearch for multiple chains simultaneously. The batch dimension must be +# known in advance however. +gss <- function(func, + batch_dim = 1, + lower = 0, + upper = 1, + max_iterations = 50, + tolerance = .Machine$double.eps ^ 0.25) { + + lower <- tf$constant(lower, tf_float(), shape(1)) + upper <- tf$constant(upper, tf_float(), shape(1)) + maxiter <- tf$constant(as.integer(max_iterations), tf$int32) + tol <- fl(tolerance) + iter <- tf$constant(0L, tf$int32) + golden_ratio <- fl(2 / (sqrt(5) + 1)) + + # expand out to batch dimensions + if (!inherits(batch_dim, "tensorflow.tensor")) { + batch_dim <- as.integer(batch_dim) + batch_dim <- tf$constant(batch_dim, tf$int32, shape(1)) + } + + # initial evaluation points + left <- tf$tile(lower, batch_dim) + right <- tf$tile(upper, batch_dim) + width <- right - left + d <- golden_ratio * width + x1 <- right - d + x2 <- left + d + f1 <- func(x1) + f2 <- func(x2) + + values <- list(left, right, x1, x2, f1, f2, width, iter) + + # start loop + body <- function(left, right, x1, x2, f1, f2, width, iter) { + + # prep lists of vectors for whether steps are above x1 or below x2 + # order: lower, upper, x1, x2, width + + # if the minimum is below x2, shift the bounds and reuse x1 as the new x2 + below_width <- x2 - left + below <- list( + left, + x2, + x2 - golden_ratio * below_width, + x1, + below_width + ) + + # if the minimum is above x1, shift the bounds and reuse x2 as the new x1 + above_width <- right - x1 + above <- list( + x1, + right, + x2, + x1 + golden_ratio * above_width, + above_width + ) + + # convert to matrices + below <- tf$stack(below, axis = 1L) + above <- tf$stack(above, axis = 1L) + + is_below <- tf$greater(f2, f1) + status <- tf$where(is_below, below, above) + + left <- status[, 0] + right <- status[, 1] + x1 <- status[, 2] + x2 <- status[, 3] + width <- status[, 4] + + # either recompute f1 (f2) at new location, or use f2 (f1) in its place + + # this is a bit convoluted, but ensures function calls don't need to be + # duplicated, whilst maintaining the vectorisation + x_to_evaluate <- tf$where(is_below, x1, x2) + new_f <- func(x_to_evaluate) + new_f1 <- tf$where(is_below, new_f, f2) + new_f2 <- tf$where(is_below, f1, new_f) + + list(left, right, x1, x2, new_f1, new_f2, 
width, iter + 1L) + } + + cond <- function(left, right, x1, x2, f1, f2, width, iter) { + not_converged <- tf$less(tol, tf$abs(width)) + not_all_converged <- tf$reduce_any(not_converged) + in_time <- tf$less(iter, maxiter) + in_time & not_all_converged + } + + out <- tf$while_loop(cond, body, values) + + # get minimum value + width <- out[[7]] + min <- out[[1]] + width / fl(2) + + list(minimum = min, + width = width, + iterations = out[[8]]) + +} + pad_vector <- function(x, to_length, with = 1) { pad_by <- to_length - length(x) @@ -416,12 +539,14 @@ misc_module <- module(module, drop_first_dim, tile_first_dim, drop_column_dim, + tile_to_batch, expand_to_batch, has_batch, match_batches, split_chains, hessian_dims, rhex, + gss, disable_tensorflow_logging, pad_vector) @@ -1315,7 +1440,8 @@ as_tf_function <- function(r_fun, ...) { if (!is.list(ga_out)) ga_out <- list(ga_out) targets <- c(ga_out, ga_dummies) - sub_dag <- dag_class$new(targets) + sub_dag <- dag_class$new(targets, + tf_float = options()$greta_tf_float) # use the default graph, so that it can be overwritten when this is called? # alternatively fetch from above, or put it in greta_stash? diff --git a/codemeta.json b/codemeta.json index d0cccf26..418086b3 100644 --- a/codemeta.json +++ b/codemeta.json @@ -405,7 +405,7 @@ ], "releaseNotes": "https://github.com/dill/greta/blob/master/NEWS.md", "readme": "https://github.com/dill/greta/blob/master/README.md", - "fileSize": "494.678KB", + "fileSize": "504.898KB", "contIntegration": [ "https://travis-ci.org/greta-dev/greta", "https://codecov.io/github/greta-dev/greta?branch=master" diff --git a/man/marginalisation.Rd b/man/marginalisation.Rd new file mode 100644 index 00000000..10eb2a0e --- /dev/null +++ b/man/marginalisation.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalise.R +\name{marginalisation} +\alias{marginalisation} +\alias{marginalise} +\title{direct marginalisation of random variables} +\usage{ +marginalise(fun, variable, method, ...) +} +\arguments{ +\item{fun}{an R function to integrate with respect to the random variable. +The first argument must be a greta array for the random variable, any +subsequent arguments must passed in via the \dots argument.} + +\item{variable}{a variable greta array with a distribution, representing the +random variable to marginalise} + +\item{method}{a \code{marginaliser} object giving the method for carrying out +the marginalisation} + +\item{\dots}{named greta arrays to be passed to \code{fun}} +} +\description{ +Inference on many statistical models requires marginalisation of + (ie. integration of functions over) random variables. In addition to + general purpose marginalisation methods like MCMC that act on an entire + model, it is often useful to directly marginalise a single random variable + within a larger model; e.g. for random variables where marginalisation by + MCMC is inefficient, impossible, or when an approximate marginalisation is + substantially more efficient. \code{marginalise()} performs direct + marginalisation within a greta model, using one of the available + marginalisation methods, detailed in \link{marginalisers}. +} +\details{ +The code in \code{fun} must define at least one distribution over + data (ie. a model likelihood), and cannot create any new variables. Any + variables must be created outide this function, and passed in via the \dots + argument. 
+} +\examples{ +\dontrun{ +# marginalise a discrete random variable and carry out HMC on the +# continouous variables: +y <- rnorm(100) + +lambda <- lognormal(0, 1) +theta <- normal(0, 1) +sd <- lognormal(0, 1) + +# if discrete variables could be sampled by MCMC in greta, +# we would be able to simply do: +# n <- poisson(lambda) +# mu <- theta ^ n +# distribution(y) <- normal(mu, sd) + +# the function to marginalise over - the variable to marginalise +# must be first, others must be passed to marginalise() +likelihood <- function (n, theta, sd) { + mu <- theta ^ n + distribution(y) <- normal(mu, sd) +} + +# integrate the function values w.r.t. the poisson distribution +marginalise(fun = likelihood, + variable = poisson(lambda), + method = discrete_marginalisation(values = 0:10), + theta = theta, + sd = sd) + +m <- model(lambda) +draws <- mcmc(m, hmc()) +} +} diff --git a/man/marginalisers.Rd b/man/marginalisers.Rd new file mode 100644 index 00000000..c9268f6e --- /dev/null +++ b/man/marginalisers.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/marginalisers.R +\name{marginalisers} +\alias{marginalisers} +\alias{discrete_marginalisation} +\alias{laplace_approximation} +\title{marginalisation methods} +\usage{ +discrete_marginalisation(values) + +laplace_approximation( + tolerance = 1e-06, + max_iterations = 50, + diagonal_hessian = FALSE +) +} +\arguments{ +\item{values}{an R vector giving values at which to evaluate the function for +a discrete marginalisation} + +\item{tolerance}{the (positive) numerical convergence tolerance (in units of +the log conditional posterior density) of the Newton-Raphson optimisation +algorithm} + +\item{max_iterations}{the (positive) integer-like maximum number of +iterations of the Newton-Raphson optimisation algorithm} + +\item{diagonal_hessian}{whether the Hessian matrix should be assumed to be +diagonal, to speed up computations. See Details.} +} +\value{ +a \code{marginaliser} object that can be passed to + \code{marginalise}. +} +\description{ +Functions to set up marginalisers (which explicitly integrate + out random variables) and modify their behaviour, for use in + \code{\link{marginalise}()}. + + \code{discrete_marginalisation} marginalises discrete random variables via + a Riemann sum over a range of plausible values. + + \code{laplace_approximation} marginalises random variables (with + multivariate normal priors) approximately, using by Newton-Raphson + optimisation and assuming the posterior follows a multivariate normal + distribution. +} +\details{ +\code{discrete_marginalisation} can only be used with discrete + probability distributions, e.g. those defined with \code{poisson()} and + \code{binomial()}. For discrete distributions with finite support (such as + \code{bernoulli()}) the marginalisation will be exact, so long as + \code{values} includes all possible values of the variable. For discrete + distributions with non-finite support (such as \code{poisson()}, which has + no upper bound), the marginalisation can only ever be approximate. However + if \code{values} cover a range of values with sufficiently high support in + the distribution, that approximation error will be minimal. + +\code{laplace_approximation} can only be used to marginalise + variables following a multivariate normal distribution. 
+ + The argument \code{diagonal_hessian} can be used to state that the + conditional density factorises along the elements of the variable being + marginalised, and therefore the Hessian matrix of this function can be + assumed to be diagonal. A conditional density function factorises if each + observation in the conditional density depends only on the corresponding per + element of the variable being marginalised. If this is not the case and you + set \code{diagonal_hessian = TRUE}, your inferences will be incorrect. +} diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index 0e5a47ed..e593b532 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) { new_samples <- function(draws, target_samples = 5000) { neff <- min(coda::effectiveSize(draws)) efficiency <- neff / coda::niter(draws) - 1.2 * (target_samples - neff) / efficiency + 1.5 * (target_samples - neff) / efficiency } not_timed_out <- function(start_time, time_limit = 300) { @@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) { } get_enough_draws <- function(model, - sampler = sampler, n_effective = 5000, + sampler = hmc(), time_limit = 300, - verbose = TRUE, + chains = 50, + verbose = FALSE, one_by_one = FALSE) { start_time <- Sys.time() draws <- mcmc(model, sampler = sampler, + chains = chains, verbose = verbose, one_by_one = one_by_one) @@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { m <- model(x, precision = "single") draws <- get_enough_draws(m, - sampler = sampler, n_effective = n_effective, + sampler = sampler, verbose = FALSE) # get MCMC samples for statistics of the samples (value, variance and @@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) { # get absolute errors between posterior means and true values, and scale them # by time-series Monte Carlo standard errors (the expected amount of # uncertainty in the MCMC estimate), to give the number of standard errors - # away from truth. There's a 1/100 chance of any one of these scaled errors - # being greater than qnorm(0.99) if the sampler is correct + # away from truth. 
diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R
index 0e5a47ed..e593b532 100644
--- a/tests/testthat/helpers.R
+++ b/tests/testthat/helpers.R
@@ -835,7 +835,7 @@ not_finished <- function(draws, target_samples = 5000) {
 new_samples <- function(draws, target_samples = 5000) {
   neff <- min(coda::effectiveSize(draws))
   efficiency <- neff / coda::niter(draws)
-  1.2 * (target_samples - neff) / efficiency
+  1.5 * (target_samples - neff) / efficiency
 }
 
 not_timed_out <- function(start_time, time_limit = 300) {
@@ -844,15 +844,17 @@ not_timed_out <- function(start_time, time_limit = 300) {
 }
 
 get_enough_draws <- function(model,
-                             sampler = sampler,
                              n_effective = 5000,
+                             sampler = hmc(),
                              time_limit = 300,
-                             verbose = TRUE,
+                             chains = 50,
+                             verbose = FALSE,
                              one_by_one = FALSE) {
 
   start_time <- Sys.time()
 
   draws <- mcmc(model,
                 sampler = sampler,
+                chains = chains,
                 verbose = verbose,
                 one_by_one = one_by_one)
 
@@ -917,8 +919,8 @@ check_mvn_samples <- function(sampler, n_effective = 3000) {
   m <- model(x, precision = "single")
   draws <- get_enough_draws(m,
-                            sampler = sampler,
                             n_effective = n_effective,
+                            sampler = sampler,
                             verbose = FALSE)
 
   # get MCMC samples for statistics of the samples (value, variance and
@@ -938,10 +940,10 @@ check_mvn_samples <- function(sampler, n_effective = 3000) {
   # get absolute errors between posterior means and true values, and scale them
   # by time-series Monte Carlo standard errors (the expected amount of
   # uncertainty in the MCMC estimate), to give the number of standard errors
-  # away from truth. There's a 1/100 chance of any one of these scaled errors
-  # being greater than qnorm(0.99) if the sampler is correct
+  # away from truth. There's a 1/200 chance of any one of these scaled errors
+  # being greater than qnorm(1 - 0.005) if the sampler is correct
   errors <- scaled_error(stat_draws, stat_truth)
-  expect_lte(max(errors), qnorm(0.99))
+  expect_lte(max(errors), qnorm(1 - 0.005))
 
 }
 
@@ -958,8 +960,8 @@ check_samples <- function(x,
   m <- model(x, precision = "single")
   draws <- get_enough_draws(m,
-                            sampler = sampler,
                             n_effective = n_effective,
+                            sampler = sampler,
                             verbose = FALSE,
                             one_by_one = one_by_one)
 
@@ -979,6 +981,6 @@ check_samples <- function(x,
   # do a formal hypothesis test
   suppressWarnings(stat <- ks.test(mcmc_samples, iid_samples))
-  testthat::expect_gte(stat$p.value, 0.01)
+  testthat::expect_gte(stat$p.value, 0.005)
 
 }
diff --git a/tests/testthat/test_marginalisation.R b/tests/testthat/test_marginalisation.R
new file mode 100644
index 00000000..7377c31f
--- /dev/null
+++ b/tests/testthat/test_marginalisation.R
@@ -0,0 +1,313 @@
+context("marginalisation")
+
+test_that("marginalise errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # not a function
+  expect_error(
+    marginalise("apple", poisson(0.1), discrete_marginalisation(0:2)),
+    "must be an R function"
+  )
+
+  # not a greta array
+  expect_error(
+    marginalise(I, 1:5, discrete_marginalisation(0:2)),
+    "must be a variable greta array with a distribution"
+  )
+
+  # greta array but no distribution
+  expect_error(
+    marginalise(I, variable(), discrete_marginalisation()),
+    "must be a variable greta array with a distribution"
+  )
+
+  # not a marginaliser
+  expect_error(
+    marginalise(I, poisson(0.1), mean),
+    "'method' must be a valid marginalisation method"
+  )
+
+  # the following can only be assessed on final model definition
+
+  # function adds variables
+  fun <- function(x) {
+    x + normal(0, 1)
+  }
+  lambda <- variable()
+  marginalise(fun, poisson(lambda), discrete_marginalisation(0:2))
+  expect_error(
+    mcmc(model(lambda)),
+    "must not create any new variables"
+  )
+
+  # function has no distribution
+  fun <- function(x) {
+    x * 2
+  }
+  lambda <- variable()
+  marginalise(fun, poisson(lambda), discrete_marginalisation(0:2))
+  expect_error(
+    mcmc(model(lambda)),
+    "must contain at least one distribution over data"
+  )
+
+})
+
+test_that("discrete_marginalisation errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # greta array, not a numeric
+  expect_error(
+    discrete_marginalisation(values = variable()),
+    "must be an R numeric vector, not a greta array"
+  )
+
+  # not a numeric
+  expect_error(
+    discrete_marginalisation(values = c("apple", "banana")),
+    "must be an R numeric vector$"
+  )
+
+  # mismatch with distribution
+  expect_error(
+    marginalise(I, normal(0, 1), discrete_marginalisation(values = 1:5)),
+    "can only be used with discrete distributions"
+  )
+
+})
+
+test_that("inference runs with discrete marginalisation", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # spike+slab poisson glm (in reality this is poorly identified)
+  n <- 100
+  x <- runif(n, -5, 5)
+  y <- rpois(n, exp(1))
+  p <- variable(lower = 0, upper = 1)
+  alpha <- variable()
+  beta <- variable()
+  likelihood <- function(z, alpha, beta) {
+    eta <- alpha + beta * x * z
+    lambda <- exp(eta)
+    distribution(y) <- poisson(lambda)
+  }
+  marginalise(likelihood,
+              bernoulli(p),
+              method = discrete_marginalisation(0:1),
+              alpha = alpha,
+              beta = beta)
+  m <- model(alpha, beta, p)
+
+  expect_ok(o <- opt(m))
+  expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20, verbose = FALSE))
+
+})
+
+test_that("discrete marginalisation gives correct densities", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # no constraints, so no adjustment to account for
+  lambda <- variable()
+  y <- rnorm(100)
+  likelihood <- function(z) {
+    distribution(y) <- normal(z, 1)
+  }
+  values <- 0:5
+  marginalise(likelihood,
+              poisson(lambda),
+              method = discrete_marginalisation(values))
+  m <- model(lambda)
+  o <- opt(m)
+
+  lambda_val <- runif(1, 0, 5)
+
+  # manually compute expected marginal likelihood given this value of lambda
+  wt <- dpois(values, lambda_val)
+  wt <- wt / sum(wt)
+  dens <- vapply(y, dnorm, values, 1,
+                 FUN.VALUE = rep(1, length(values)))
+  dens <- apply(dens, 1, prod)
+  expected <- log(sum(wt * dens))
+
+  # compute with greta
+  m$dag$send_parameters(lambda_val)
+  observed <- m$dag$log_density()
+
+  # compare results (within tolerance)
+  compare_op(expected, observed)
+
+})
+
+test_that("laplace_approximation errors nicely", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # bad tolerance
+  expect_error(
+    laplace_approximation(tolerance = -1),
+    "must be a positive, scalar numeric value"
+  )
+
+  # bad max iterations
+  expect_error(
+    laplace_approximation(max_iterations = 0),
+    "must be a positive, scalar integer value"
+  )
+
+  # mismatch with distribution
+  expect_error(
+    marginalise(I, beta(2, 2), laplace_approximation()),
+    "can only be used with a normal or multivariate normal distribution"
+  )
+
+})
+
+test_that("inference runs with laplace approximation", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # the eight schools model
+  treatment_effects <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  treatment_stddevs <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  int <- variable()
+  sd <- variable(lower = 0)
+  lik <- function(theta) {
+    distribution(treatment_effects) <- normal(t(theta), treatment_stddevs)
+  }
+
+  # mock up as a multivariate normal distribution
+  sigma <- diag(8) * sd ^ 2
+  mu <- ones(1, 8) * int
+  marginalise(lik,
+              multivariate_normal(mu, sigma),
+              laplace_approximation(diagonal_hessian = TRUE))
+
+  m <- model(int, sd)
+
+  expect_ok(o <- opt(m))
+  expect_ok(draws <- mcmc(m, warmup = 20, n_samples = 20,
+                          verbose = FALSE))
+
+})
+
+test_that("laplace approximation has correct posterior for univariate normal", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta_i ~ N(mu, sd ^ 2)
+  # the posterior for theta is normal, so laplace should be exact
+  # Bayes theorem gives:
+  # p(theta | y) \propto p(y | theta) p(theta)
+  # which with normal densities is:
+  # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) *
+  #   N(theta_i | mu, sd ^ 2)
+  # which is equivalent to:
+  # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i)
+  # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2)
+  # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # fixed prior mean and standard deviation for theta
+  mu <- rnorm(1)
+  sd <- abs(rnorm(1))
+
+  # analytic solution
+  obs_prec <- 1 / (obs_sd ^ 2)
+  prec <- 1 / (sd ^ 2)
+  theta_var <- 1 / (obs_prec + prec)
+  theta_mu <- (y * obs_prec + mu * prec) * theta_var
+  theta_sd <- sqrt(theta_var)
+  analytic <- cbind(mean = theta_mu, sd = theta_sd)
+
+  # analyse as univariate normal
+  lik <- function(theta) {
+    distribution(y) <- normal(theta, obs_sd)
+  }
+  out <- marginalise(lik,
+                     normal(mu, sd, dim = 8),
+                     laplace_approximation(diagonal_hessian = TRUE))
+  res <- do.call(calculate, out)
+  laplace_uni <- cbind(mean = res$mean, sd = res$sd)
+  compare_op(analytic, laplace_uni)
+
+  # analyse as multivariate normal
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  mean <- ones(1, 8) * mu
+  sigma <- diag(8) * sd ^ 2
+  out <- marginalise(lik,
+                     multivariate_normal(mean, sigma),
+                     laplace_approximation(diagonal_hessian = TRUE))
+  res <- do.call(calculate, out)
+  laplace_multi <- cbind(mean = res$mean[1, ], sd = sqrt(diag(res$sigma)))
+  compare_op(analytic, laplace_multi)
+
+})
+
+test_that("laplace approximation has correct posterior for multivariate normal", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta ~ MVN(mu, sigma)
+  # the posterior for theta is multivariate normal, so laplace should be exact
+  # p(theta | y) \propto MVN(theta_mu, theta_sigma)
+  # theta_sigma = (sigma^-1 + diag(1 / obs_sd^2))^-1
+  # theta_mu = theta_sigma (sigma^-1 mu + diag(1 / obs_sd^2) y)
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # prior parameters for mu and sigma
+  mu <- rnorm(8)
+  sigma <- rwish(1, 9, diag(8))[1, , ]
+
+  # analytic solution
+  obs_prec <- diag(1 / (obs_sd ^ 2))
+  prec <- solve(sigma)
+  theta_sigma <- solve(prec + obs_prec)
+  theta_mu <- theta_sigma %*% (prec %*% mu + obs_prec %*% y)
+  theta_sigma_flat <- theta_sigma[upper.tri(theta_sigma, diag = TRUE)]
+
+  lik <- function(theta) {
+    distribution(y) <- normal(t(theta), obs_sd)
+  }
+  out <- marginalise(lik,
+                     multivariate_normal(t(mu), sigma),
+                     laplace_approximation(diagonal_hessian = TRUE,
+                                           tolerance = 1e-32))
+  res <- calculate(mean = t(out$mean),
+                   sigma = out$sigma,
+                   iterations = out$iterations)
+  theta_mu_est <- res$mean
+  theta_sigma_est <- res$sigma
+  theta_sigma_est_flat <- theta_sigma_est[upper.tri(theta_sigma_est, diag = TRUE)]
+
+  # compare these to within a tolerance
+  compare_op(theta_mu, theta_mu_est)
+  compare_op(theta_sigma_flat, theta_sigma_est_flat)
+
+})
diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R
index 1bc2bcba..ef0e39e6 100644
--- a/tests/testthat/test_misc.R
+++ b/tests/testthat/test_misc.R
@@ -285,3 +285,25 @@ test_that("module works", {
   expect_identical(names(mod$functions), c("exp", "log", "sum"))
 
 })
+
+test_that("golden section search works", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  n_batch <- 4
+
+  # random optima
+  locs <- runif(n_batch)
+
+  # simple quadratic function with different optima for each batch
+  func <- function(x) {
+    (x - locs) ^ 2
+  }
+
+  # run the minimisation
+  ss <- gss(func, n_batch)
+  result <- tf$Session()$run(ss)
+  compare_op(result$minimum, locs)
+
+})
diff --git a/tests/testthat/test_posteriors.R b/tests/testthat/test_posteriors.R
index 68bae814..d2e91c5c 100644
--- a/tests/testthat/test_posteriors.R
+++ b/tests/testthat/test_posteriors.R
@@ -4,7 +4,6 @@ test_that("posterior is correct (binomial)", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   # analytic solution to the posterior of the paramter of a binomial
   # distribution, with uniform prior
@@ -15,7 +14,7 @@ test_that("posterior is correct (binomial)", {
   distribution(pos) <- binomial(n, theta)
   m <- model(theta)
 
-  draws <- get_enough_draws(m, hmc(), 2000, verbose = FALSE)
+  draws <- get_enough_draws(m, n_effective = 5000)
 
   samples <- as.matrix(draws)
@@ -33,7 +32,65 @@ test_that("posterior is correct (binomial)", {
   n_draws <- round(coda::effectiveSize(draws))
   comparison <- rbeta(n_draws, shape1, shape2)
   suppressWarnings(test <- ks.test(samples, comparison))
-  expect_gte(test$p.value, 0.01)
+  expect_gte(test$p.value, 0.05)
+
+})
+
+test_that("posterior is correct (normal)", {
+
+  skip_if_not(check_tf_version())
+  source("helpers.R")
+
+  # nolint start
+  # test vs analytic posterior on 8 schools data with no pooling:
+  # y_i ~ N(theta_i, obs_sd_i ^ 2)
+  # theta_i ~ N(mu, sd ^ 2)
+  # nolint end
+
+  # eight schools data
+  y <- c(28.39, 7.94, -2.75, 6.82, -0.64, 0.63, 18.01, 12.16)
+  obs_sd <- c(14.9, 10.2, 16.3, 11.0, 9.4, 11.4, 10.4, 17.6)
+
+  # fixed prior mean and standard deviation for theta
+  mu <- rnorm(1)
+  sd <- abs(rnorm(1))
+
+  # nolint start
+  # Bayes theorem gives:
+  # p(theta | y) \propto p(y | theta) p(theta)
+  # which with normal densities is:
+  # p(theta_i | y_i) \propto N(y_i | theta_i, obs_sd_i ^ 2) * N(theta_i | mu, sd ^ 2)
+  # which is equivalent to:
+  # p(theta_i | y_i) \propto N(theta_mu_i, theta_var_i)
+  # theta_var_i = 1 / (1 / sd ^ 2 + 1 / obs_sd_i ^ 2)
+  # theta_mu_i = (mu / sd ^ 2 + y_i / obs_sd_i ^ 2) * theta_var_i
+  # conjugate prior, see Wikipedia conjugate prior table
+  # nolint end
+
+  obs_prec <- 1 / (obs_sd ^ 2)
+  prec <- 1 / (sd ^ 2)
+  theta_var <- 1 / (obs_prec + prec)
+  theta_mu <- (y * obs_prec + mu * prec) * theta_var
+  analytic <- cbind(mean = theta_mu, sd = sqrt(theta_var))
+
+  # mcmc solution:
+  theta <- normal(mu, sd, dim = 8)
+  distribution(y) <- normal(theta, obs_sd)
+  m <- model(theta)
+
+  # draw at least 5000 effective samples of each parameter from the posterior
+  draws <- get_enough_draws(m, n_effective = 5000)
+
+  mcmc_stats <- summary(draws)$statistics
+  mcmc <- mcmc_stats[, 1:2]
+  error <- abs(mcmc - analytic)
+  mcmc_se <- mcmc_stats[, 4]
+
+  # ratio of error to that expected due to Monte Carlo noise
+  error_multiples <- error / cbind(mcmc_se, mcmc_se)
+
+  within_error_margin <- error_multiples < qnorm(1 - 0.005)
+  expect_true(all(within_error_margin))
 
 })
 
@@ -41,7 +98,6 @@ test_that("samplers are unbiased for bivariate normals", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   check_mvn_samples(hmc())
   check_mvn_samples(rwmh())
@@ -53,7 +109,6 @@ test_that("samplers are unbiased for chi-squared", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   df <- 5
   x <- chi_squared(df)
@@ -67,7 +122,6 @@ test_that("samplers are unbiased for standard uniform", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   x <- uniform(0, 1)
   iid <- runif
@@ -80,13 +134,12 @@ test_that("samplers are unbiased for LKJ", {
 
   skip_if_not(check_tf_version())
   source("helpers.R")
-  skip_if_not_release()
 
   x <- lkj_correlation(3, 2)[1, 2]
   iid <- function(n) rlkjcorr(n, 3, 2)[, 1, 2]
 
-  check_samples(x, iid, hmc(), one_by_one = TRUE)
+  check_samples(x, iid)
 
 })