Source code for pyvbmc.vbmc.variational_optimization

"""Variational optimization / training of variational posterior"""

import copy
import math

import gpyreg as gpr
import numpy as np
import scipy as sp

from pyvbmc.entropy import entlb_vbmc, entmc_vbmc
from pyvbmc.stats import get_hpd
from pyvbmc.variational_posterior import VariationalPosterior

from .iteration_history import IterationHistory
from .minimize_adam import minimize_adam
from .options import Options


def update_K(
    optim_state: dict,
    iteration_history: IterationHistory,
    options: Options,
):
    """
    Update number of variational mixture components.

    Parameters
    ==========
    optim_state : dict
        Optimization state from the VBMC instance we are calling this from.
    iteration_history : IterationHistory
        Iteration history from the VBMC instance we are calling this from.
    options : Options
        Options from the VBMC instance we are calling this from.

    Returns
    =======
    K_new : int
        The new number of variational mixture components.
    """
    K_new = optim_state["vp_K"]

    # Compute maximum number of components
    K_max = math.ceil(options.eval("k_fun_max", {"N": optim_state["n_eff"]}))

    # Evaluate bonus for stable solution.
    K_bonus = round(options.eval("adaptive_k", {"unkn": K_new}))

    # If not warming up, check if number of components gets to be increased.
    if not optim_state["warmup"] and optim_state["iter"] > 0:
        recent_iters = math.ceil(
            0.5 * options["tol_stable_count"] / options["fun_evals_per_iter"]
        )

        # Check if ELCBO has improved wrt recent iterations
        lower_end = max(0, optim_state["iter"] - recent_iters)
        elbos = iteration_history["elbo"][lower_end:]
        elboSDs = iteration_history["elbo_sd"][lower_end:]
        elcbos = elbos - options["elcbo_impro_weight"] * elboSDs
        warmups = iteration_history["warmup"][lower_end:].astype(bool)
        elcbos_after = elcbos[~warmups]
        # Ignore two iterations right after warmup.
        elcbos_after[0 : min(2, optim_state["iter"] + 1)] = -np.inf
        elcbo_max = np.max(elcbos_after)
        improving_flag = elcbos_after[-1] >= elcbo_max and np.isfinite(
            elcbos_after[-1]
        )

        # Add one component if ELCBO is improving and no pruning in
        # the last iteration
        if iteration_history["pruned"][-1] == 0 and improving_flag:
            K_new += 1

        # Bonus components for stable solution (speed up exploration)
        if (
            iteration_history["r_index"][-1] < 1
            and not optim_state["recompute_var_post"]
            and improving_flag
        ):
            # No bonus if any component was very recently pruned.
            new_lower_end = max(
                0, optim_state["iter"] - math.ceil(0.5 * recent_iters)
            )
            if np.all(iteration_history["pruned"][new_lower_end:] == 0):
                K_new += K_bonus

        K_new = max(optim_state["vp_K"], min(K_new, K_max))

    return K_new
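

# Illustrative sketch (not part of the original module): the clamp applied at
# the end of `update_K`, with made-up numbers. The candidate count never drops
# below the current number of components and never exceeds the cap derived
# from the "k_fun_max" option. The helper name below is hypothetical.
def _example_update_K_clamp():
    vp_K = 4  # current number of components (hypothetical)
    K_max = 6  # cap from options.eval("k_fun_max", ...) (hypothetical)
    K_candidate = 4 + 1 + 2  # one regular increment plus a stability bonus
    K_new = max(vp_K, min(K_candidate, K_max))
    # K_candidate = 7 exceeds the cap, so K_new == 6.
    return K_new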


def optimize_vp(
    options: Options,
    optim_state: dict,
    vp: VariationalPosterior,
    gp: gpr.GP,
    fast_opts_N: int,
    slow_opts_N: int,
    K: int = None,
):
    """
    Optimize variational posterior.

    Parameters
    ==========
    options : Options
        Options from the VBMC instance we are calling this from.
    optim_state : dict
        Optimization state from the VBMC instance we are calling this from.
    vp : VariationalPosterior
        The variational posterior we want to optimize.
    gp : gpyreg.GaussianProcess
        The Gaussian process surrogate of the log-posterior, against which
        to optimize the VP.
    fast_opts_N : int
        Number of fast optimizations.
    slow_opts_N : int
        Number of slow optimizations.
    K : int, optional
        Number of mixture components. If not given, defaults to the number
        of mixture components the given VP has.

    Returns
    =======
    vp : VariationalPosterior
        The optimized variational posterior.
    var_ss : float
        Estimated variance of the ELBO, due to variance of the expected
        log-joint, for each GP hyperparameter sample.
    pruned : int
        Number of pruned components.
    """
    if K is None:
        K = vp.K

    # Missing port: assigning default values to options if it is None,
    #               due to the new structure of the program

    # Missing port: assign default values to optim_state, since these are
    #               not really used

    # Turn off weight optimization in warm up if not already done.
    if optim_state["warmup"]:
        vp.optimize_weights = False

    # Quick sieve optimization to determine starting point(s)
    vp0_vec, vp0_type, elcbo_beta, compute_var, ns_ent_K, _ = _sieve(
        options,
        optim_state,
        vp,
        gp,
        K=K,
        init_N=fast_opts_N,
        best_N=slow_opts_N,
    )

    # Compute soft bounds for variational parameter optimization.
    theta_bnd = vp.get_bounds(gp.X, options, K)

    ## Perform optimization starting from one or few selected points.

    # Set up an empty stats struct for optimization
    theta_N = np.size(vp0_vec[0].get_parameters())
    Ns = np.size(gp.posteriors)
    elbo_stats = _initialize_full_elcbo(slow_opts_N * 2, theta_N, K, Ns)

    # For the moment no gradient available for variance
    gradient_available = compute_var == 0

    if gradient_available:
        # Set basic options for deterministic (?) optimizer
        compute_grad = True
    else:
        if ns_ent_K > 0:
            raise ValueError(
                "Gradients must be available when ns_ent_K is > 0."
            )
        compute_grad = False

    vp0_fine = {}
    for i in range(0, slow_opts_N):
        # Be careful with off-by-one errors here, Python is zero-based.
        i_mid = (i + 1) * 2 - 2
        i_end = (i + 1) * 2 - 1

        # Select points from best ones depending on subset
        if slow_opts_N == 1:
            idx = 0
        elif slow_opts_N == 2:
            if i == 0:
                idx = np.where(vp0_type == 1)[0][0]
            else:
                idx = np.where((vp0_type == 2) | (vp0_type == 3))[0][0]
        else:
            idx = np.where(vp0_type == (i % 3) + 1)[0][0]

        vp0 = vp0_vec[idx]
        vp0_vec = np.delete(vp0_vec, idx)
        vp0_type = np.delete(vp0_type, idx)

        theta0 = vp0.get_parameters()

        if ns_ent_K == 0:
            # Fast optimization via deterministic entropy approximation

            # Objective function
            def vb_train_fun(theta_):
                res = _neg_elcbo(
                    theta_,
                    gp,
                    vp0,
                    elcbo_beta,
                    0,
                    compute_grad=compute_grad,
                    compute_var=compute_var,
                    theta_bnd=theta_bnd,
                )
                if compute_grad:
                    return res[0], res[1]
                return res[0]

            res = sp.optimize.minimize(
                vb_train_fun,
                theta0,
                jac=compute_grad,
                tol=options["det_entropy_tol_opt"],
            )
            if not res.success:
                # SciPy minimize failed
                raise RuntimeError(
                    "Cannot optimize variational parameters with "
                    "scipy.optimize.minimize."
                )
            theta_opt = res.x
        else:
            # Objective function, should only return value and gradient.
            def vb_train_mc_fun(theta_):
                res = _neg_elcbo(
                    theta_,
                    gp,
                    vp0,
                    elcbo_beta,
                    ns_ent_K,
                    compute_grad=True,
                    compute_var=compute_var,
                    theta_bnd=theta_bnd,
                )
                return res[0], res[1]

            # Optimization via unbiased stochastic entropy approximation
            theta_opt = theta0

            if options["stochastic_optimizer"] == "adam":
                master_min = min(options["sgd_step_size"], 0.001)
                if optim_state["warmup"] or not vp.optimize_weights:
                    scaling_factor = min(0.1, options["sgd_step_size"] * 10)
                else:
                    scaling_factor = min(0.1, options["sgd_step_size"])

                # Fixed master stepsize
                master_max = scaling_factor

                # Note: we tried to adapt the stepsizes guided by the GP
                # hyperparameters, but this did not seem to help (the former
                # experimental option was "GPStochasticStepsize").

                master_max = max(master_min, master_max)
                master_decay = 200
                max_iter = min(10000, options["max_iter_stochastic"])

                theta_opt, _, theta_lst, f_val_lst, _ = minimize_adam(
                    vb_train_mc_fun,
                    theta_opt,
                    tol_fun=options["tol_fun_stochastic"],
                    max_iter=max_iter,
                    master_min=master_min,
                    master_max=master_max,
                    master_decay=master_decay,
                )

                if options["elcbo_midpoint"]:
                    # Recompute ELCBO at best midpoint with full variance
                    # and more precision.
                    idx_mid = np.argmin(f_val_lst)
                    elbo_stats = _eval_full_elcbo(
                        i_mid,
                        theta_lst[:, idx_mid],
                        vp0,
                        gp,
                        elbo_stats,
                        elcbo_beta,
                        options,
                    )
            else:
                raise ValueError("Unknown stochastic optimizer!")

        # Recompute ELCBO at endpoint with full variance and more precision
        elbo_stats = _eval_full_elcbo(
            i_end, theta_opt, vp0, gp, elbo_stats, elcbo_beta, options
        )

        vp0_fine[i_mid] = copy.deepcopy(vp0)
        vp0_fine[i_end] = copy.deepcopy(vp0)  # Parameters get assigned later

    ## Finalize optimization by taking variational parameters with best ELCBO

    idx = np.argmin(elbo_stats["nelcbo"])
    elbo = -elbo_stats["nelbo"][idx]
    elbo_sd = np.sqrt(elbo_stats["varF"][idx])
    G = elbo_stats["G"][idx]
    H = elbo_stats["H"][idx]
    var_ss = elbo_stats["var_ss"][idx]
    varG = elbo_stats["varG"][idx]
    varH = elbo_stats["varH"][idx]
    I_sk = np.zeros((Ns, K))
    J_sjk = np.zeros((Ns, K, K))
    I_sk[:, :] = elbo_stats["I_sk"][idx, :, :].copy()
    J_sjk[:, :, :] = elbo_stats["J_sjk"][idx, :, :, :].copy()
    vp = vp0_fine[idx]
    vp.set_parameters(elbo_stats["theta"][idx, :])

    ## Potentially prune mixture components
    pruned = 0
    if vp.optimize_weights:
        already_checked = np.full((vp.K,), False)
        while np.any((vp.w < options["tol_weight"]) & ~already_checked):
            vp_pruned = copy.deepcopy(vp)

            # Choose a random component below threshold
            idx = np.argwhere(
                (vp.w < options["tol_weight"]).ravel() & ~already_checked
            ).ravel()
            idx = idx[np.random.randint(0, np.size(idx))]
            vp_pruned.w = np.delete(vp_pruned.w, idx)
            vp_pruned.eta = np.delete(vp_pruned.eta, idx)
            vp_pruned.sigma = np.delete(vp_pruned.sigma, idx)
            vp_pruned.mu = np.delete(vp_pruned.mu, idx, axis=1)
            vp_pruned.K -= 1
            theta_pruned = vp_pruned.get_parameters()

            # Recompute ELCBO
            elbo_stats = _eval_full_elcbo(
                0,
                theta_pruned,
                vp_pruned,
                gp,
                elbo_stats,
                elcbo_beta,
                options,
            )
            elbo_pruned = -elbo_stats["nelbo"][0]
            elbo_pruned_sd = np.sqrt(elbo_stats["varF"][0])

            # Difference in ELCBO (before and after pruning)
            delta_elcbo = np.abs(
                (elbo_pruned - options["elcbo_impro_weight"] * elbo_pruned_sd)
                - (elbo - options["elcbo_impro_weight"] * elbo_sd)
            )

            # Prune component if it has negligible influence on ELCBO
            pruning_threshold = options["tol_improvement"] * options.eval(
                "pruning_threshold_multiplier", {"K": K}
            )

            if delta_elcbo < pruning_threshold:
                vp = vp_pruned
                elbo = elbo_pruned
                elbo_sd = elbo_pruned_sd
                G = elbo_stats["G"][0]
                H = elbo_stats["H"][0]
                var_ss = elbo_stats["var_ss"][0]
                varG = elbo_stats["varG"][0]
                varH = elbo_stats["varH"][0]
                pruned += 1
                already_checked = np.delete(already_checked, idx)
                I_sk = np.delete(I_sk, idx, axis=1)
                J_sjk = np.delete(J_sjk, idx, axis=2)
            else:
                already_checked[idx] = True

    vp.stats = {}
    vp.stats["elbo"] = elbo  # ELBO
    vp.stats["elbo_sd"] = elbo_sd  # Error on the ELBO
    vp.stats["e_log_joint"] = G  # Expected log joint
    vp.stats["e_log_joint_sd"] = np.sqrt(varG)  # Error on expected log joint
    vp.stats["entropy"] = H  # Entropy
    vp.stats["entropy_sd"] = np.sqrt(varH)  # Error on the entropy
    vp.stats["stable"] = False  # Unstable until proven otherwise
    vp.stats["I_sk"] = I_sk  # Expected log joint per component
    vp.stats["J_sjk"] = J_sjk  # Covariance of expected log joint

    return vp, var_ss, pruned


def _initialize_full_elcbo(max_idx: int, D: int, K: int, Ns: int):
    """Initialize a dictionary for keeping track of full ELCBO output.

    Parameters
    ==========
    max_idx : int
        Maximum number of full ELCBO evaluations.
    D : int
        The dimension.
    K : int
        Number of mixture components.
    Ns : int
        Number of samples for entropy approximation.

    Returns
    =======
    elbo_stats : dict
        A dictionary with entries for all output variables of full ELCBO.
    """
    elbo_stats = {}
    elbo_stats["nelbo"] = np.full((max_idx,), np.inf)
    elbo_stats["G"] = np.full((max_idx,), np.nan)
    elbo_stats["H"] = np.full((max_idx,), np.nan)
    elbo_stats["varF"] = np.full((max_idx,), np.nan)
    elbo_stats["varG"] = np.full((max_idx,), np.nan)
    elbo_stats["varH"] = np.full((max_idx,), np.nan)
    elbo_stats["var_ss"] = np.full((max_idx,), np.nan)
    elbo_stats["nelcbo"] = np.full((max_idx,), np.inf)
    elbo_stats["theta"] = np.full((max_idx, D), np.nan)
    elbo_stats["I_sk"] = np.full((max_idx, Ns, K), np.nan)
    elbo_stats["J_sjk"] = np.full((max_idx, Ns, K, K), np.nan)
    return elbo_stats


def _eval_full_elcbo(
    idx: int,
    theta: np.ndarray,
    vp: VariationalPosterior,
    gp: gpr.GP,
    elbo_stats: dict,
    beta: float,
    options: Options,
    entropy_alpha: float = 0.0,
):
    """Evaluate full ELCBO and store the results in a dictionary.

    Parameters
    ==========
    idx : int
        Index in the dictionary at which to store the evaluated values.
    theta : np.ndarray
        VP parameters for which to evaluate full ELCBO.
    vp : VariationalPosterior
        The variational posterior in question.
    gp : GP
        Gaussian process from VBMC main loop.
    elbo_stats : dict
        The dictionary for storing full ELCBO stats.
    beta : float
        Confidence weight.
    options : Options
        Options from the VBMC instance we are calling from.
    entropy_alpha : float, defaults to 0.0 (currently unused)
        Parameter for lower/upper deterministic entropy interpolation.

    Returns
    =======
    elbo_stats : dict
        The updated dictionary.
    """
    # Number of samples per component for MC approximation of the entropy.
    K = vp.K
    ns_ent_fine_K = math.ceil(options.eval("ns_ent_fine", {"K": K}) / K)

    if "skip_elbo_variance" in options and options["skip_elbo_variance"]:
        compute_var = False
    else:
        compute_var = True

    nelbo, _, G, H, varF, _, var_ss, varG, varH, I_sk, J_sjk = _neg_elcbo(
        theta,
        gp,
        vp,
        0,
        ns_ent_fine_K,
        False,
        compute_var,
        None,
        entropy_alpha,
        True,
    )
    nelcbo = nelbo + beta * np.sqrt(varF)

    elbo_stats["nelbo"][idx] = nelbo
    elbo_stats["G"][idx] = G
    elbo_stats["H"][idx] = H
    elbo_stats["varF"][idx] = varF
    elbo_stats["varG"][idx] = varG
    elbo_stats["varH"][idx] = varH
    elbo_stats["var_ss"][idx] = var_ss
    elbo_stats["nelcbo"][idx] = nelcbo
    elbo_stats["theta"][idx, 0 : np.size(theta)] = theta
    elbo_stats["I_sk"][idx, :, 0:K] = I_sk
    elbo_stats["J_sjk"][idx, :, 0:K, 0:K] = J_sjk

    return elbo_stats


def _vp_bound_loss(
    vp: VariationalPosterior,
    theta: np.ndarray,
    theta_bnd: dict,
    tol_con: float = 1e-3,
    compute_grad: bool = True,
):
    """
    Variational parameter loss function for soft optimization bounds.

    Parameters
    ==========
    vp : VariationalPosterior
        The variational posterior for which we want to compute the loss
        function.
    theta : np.ndarray, shape (N,)
        The parameters at which we want to compute the loss function.
    theta_bnd : dict
        Variational posterior soft bounds.
    tol_con : float, defaults to 1e-3
        Penalization relative scale.
    compute_grad : bool, defaults to True
        Whether to compute gradients.

    Returns
    =======
    L : float
        The value of the loss function.
    dL : np.ndarray, shape (N,), optional
        The gradient of the loss function.
    """
    if vp.optimize_mu:
        mu = theta[: vp.D * vp.K]
        start_idx = vp.D * vp.K
    else:
        mu = vp.mu.ravel(order="F")
        start_idx = 0

    if vp.optimize_sigma:
        ln_sigma = theta[start_idx : start_idx + vp.K]
        start_idx += vp.K
    else:
        ln_sigma = np.log(vp.sigma.ravel())

    if vp.optimize_lambd:
        ln_lambd = theta[start_idx : start_idx + vp.D].T
    else:
        ln_lambd = np.log(vp.lambd.ravel())

    if vp.optimize_weights:
        eta = theta[-vp.K :]

    ln_scale = np.reshape(ln_lambd, (-1, 1)) + np.reshape(ln_sigma, (1, -1))
    theta_ext = []
    if vp.optimize_mu:
        theta_ext.append(mu.ravel())
    if vp.optimize_sigma or vp.optimize_lambd:
        theta_ext.append(ln_scale.ravel(order="F"))
    if vp.optimize_weights:
        theta_ext.append(eta.ravel())
    theta_ext = np.concatenate(theta_ext)

    if compute_grad:
        L, dL = _soft_bound_loss(
            theta_ext,
            theta_bnd["lb"].ravel(),
            theta_bnd["ub"].ravel(),
            tol_con,
            compute_grad=True,
        )

        dL_new = np.array([])
        if vp.optimize_mu:
            dL_new = np.concatenate((dL_new, dL[0 : vp.D * vp.K].ravel()))
            start_idx = vp.D * vp.K
        else:
            start_idx = 0
        if vp.optimize_sigma or vp.optimize_lambd:
            dlnscale = np.reshape(
                dL[start_idx : start_idx + vp.D * vp.K], (vp.D, vp.K)
            )
            if vp.optimize_sigma:
                dL_new = np.concatenate((dL_new, np.sum(dlnscale, axis=0)))
            if vp.optimize_lambd:
                dL_new = np.concatenate((dL_new, np.sum(dlnscale, axis=1)))
        if vp.optimize_weights:
            dL_new = np.concatenate((dL_new, dL[-vp.K :].ravel()))

        return L, dL_new

    L = _soft_bound_loss(
        theta_ext,
        theta_bnd["lb"].ravel(),
        theta_bnd["ub"].ravel(),
        tol_con,
    )

    return L


def _soft_bound_loss(
    x: np.ndarray,
    slb: np.ndarray,
    sub: np.ndarray,
    tol_con: float = 1e-3,
    compute_grad: bool = False,
):
    """
    Loss function for soft bounds for function minimization.

    Parameters
    ==========
    x : np.ndarray, shape (D,)
        Point for which we want to know the loss function value.
    slb : np.ndarray, shape (D,)
        Soft lower bounds.
    sub : np.ndarray, shape (D,)
        Soft upper bounds.
    tol_con : float, defaults to 1e-3
        Penalization relative scale.
    compute_grad : bool, defaults to False
        Whether to compute gradients.

    Returns
    =======
    y : float
        The value of the loss function.
    dy : np.ndarray, shape (D,), optional
        The gradient of the loss function.
    """
    ell = (sub - slb) * tol_con
    y = 0.0
    dy = np.zeros(x.shape)

    idx = x < slb
    if np.any(idx):
        y += 0.5 * np.sum(((slb[idx] - x[idx]) / ell[idx]) ** 2)
        if compute_grad:
            dy[idx] = (x[idx] - slb[idx]) / ell[idx] ** 2

    idx = x > sub
    if np.any(idx):
        y += 0.5 * np.sum(((x[idx] - sub[idx]) / ell[idx]) ** 2)
        if compute_grad:
            dy[idx] = (x[idx] - sub[idx]) / ell[idx] ** 2

    if compute_grad:
        return y, dy
    return y


def _sieve(
    options: Options,
    optim_state: dict,
    vp: VariationalPosterior,
    gp: gpr.GP,
    init_N: int = None,
    best_N: int = 1,
    K: int = None,
):
    """
    Preliminary 'sieve' method for fitting variational posterior.

    Parameters
    ==========
    options : Options
        Options from the VBMC instance we are calling this from.
    optim_state : dict
        Optimization state from the VBMC instance we are calling this from.
    vp : VariationalPosterior
        The variational posterior to use as a basis for new candidates.
    gp : GP
        Current GP from optimization.
    init_N : int, optional
        Number of initial starting points.
    best_N : int, defaults to 1
        Specifies the design pattern for new starting parameters.
        ``best_N == 1`` means use the old variational parameters as a
        starting point for new candidate VPs. Any other value will use an
        even mix of:

        - the old variational parameters,
        - the highest posterior density training points, and
        - random starting points

        for new candidate VPs.
    K : int, optional
        Number of mixture components. If not given, defaults to the number
        of mixture components the given VP has.

    Returns
    =======
    vp0_vec : np.ndarray, shape (init_N,)
        Vector of candidate variational posteriors.
    vp0_type : np.ndarray, shape (init_N,)
        Vector of types of candidate variational posteriors.
    elcbo_beta : float
        Confidence weight.
    compute_var : bool
        Whether to compute variance in later optimization.
    ns_ent_K : int
        Number of samples per component for MC approximation of the entropy.
    ns_ent_K_fast : int
        Number of samples per component for preliminary MC approximation
        of the entropy.
    """
    if K is None:
        K = vp.K

    # Missing port: assign default values to optim_state (since
    #               this doesn't seem to be necessary)

    ## Set up optimization variables and options.

    # Number of initial starting points
    if init_N is None:
        init_N = math.ceil(options.eval("ns_elbo", {"K": K}))
    nelcbo_fill = np.zeros((init_N,))

    # Number of samples per component for MC approximation of the entropy.
    ns_ent_K = math.ceil(options.eval("ns_ent", {"K": K}) / K)

    # Number of samples per component for preliminary MC approximation
    # of the entropy.
    ns_ent_K_fast = math.ceil(options.eval("ns_ent_fast", {"K": K}) / K)

    # Deterministic entropy if entropy switch is on or only one component
    if optim_state["entropy_switch"] or K == 1:
        ns_ent_K = 0
        ns_ent_K_fast = 0

    # Confidence weight
    # Missing port: elcboweight does not exist
    # elcbo_beta = self._eval_option(self.options["elcboweight"],
    #                                self.optim_state["n_eff"])
    elcbo_beta = 0
    compute_var = elcbo_beta != 0

    # Compute soft bounds for variational parameter optimization
    theta_bnd = vp.get_bounds(gp.X, options, K)

    ## Perform quick shotgun evaluation of many candidate parameters

    if init_N > 0:
        # Get high-posterior density points
        X_star, y_star, _, _ = get_hpd(gp.X, gp.y, options["hpd_frac"])

        # Generate a bunch of random candidate variational parameters.
        if best_N == 1:
            vp0_vec, vp0_type = _vb_init(vp, 1, init_N, K, X_star, y_star)
        else:
            # Fix random seed here if trying to reproduce MATLAB numbers
            vp0_vec1, vp0_type1 = _vb_init(
                vp, 1, math.ceil(init_N / 3), K, X_star, y_star
            )
            vp0_vec2, vp0_type2 = _vb_init(
                vp, 2, math.ceil(init_N / 3), K, X_star, y_star
            )
            vp0_vec3, vp0_type3 = _vb_init(
                vp, 3, init_N - 2 * math.ceil(init_N / 3), K, X_star, y_star
            )
            vp0_vec = np.concatenate([vp0_vec1, vp0_vec2, vp0_vec3])
            vp0_type = np.concatenate([vp0_type1, vp0_type2, vp0_type3])

        # In MATLAB the vp_repo is used here.

        # Quickly estimate ELCBO at each candidate variational posterior.
        for i, vp0 in enumerate(vp0_vec):
            theta = vp0.get_parameters()
            nelbo_tmp, _, _, _, varF_tmp = _neg_elcbo(
                theta,
                gp,
                vp0,
                0,
                ns_ent_K_fast,
                0,
                compute_var,
                theta_bnd,
            )
            nelcbo_fill[i] = nelbo_tmp + elcbo_beta * np.sqrt(varF_tmp)

        # Sort by negative ELCBO
        order = np.argsort(nelcbo_fill)
        vp0_vec = vp0_vec[order]
        vp0_type = vp0_type[order]

        return (
            vp0_vec,
            vp0_type,
            elcbo_beta,
            compute_var,
            ns_ent_K,
            ns_ent_K_fast,
        )

    return (
        copy.deepcopy(vp),
        1,
        elcbo_beta,
        compute_var,
        ns_ent_K,
        ns_ent_K_fast,
    )
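

# Illustrative sketch (not part of the original module): how the sieve splits
# `init_N` candidates across the three initialization types when
# `best_N != 1`. The helper name and the example value of init_N are
# hypothetical.
def _example_sieve_split(init_N=10):
    n1 = math.ceil(init_N / 3)  # from old variational parameters
    n2 = math.ceil(init_N / 3)  # from highest posterior density points
    n3 = init_N - 2 * math.ceil(init_N / 3)  # from random training points
    # For init_N = 10 this gives (4, 4, 2), which always sums to init_N.
    return n1, n2, n3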


def _vb_init(
    vp: VariationalPosterior,
    vb_type: int,
    opts_N: int,
    K_new: int,
    X_star: np.ndarray,
    y_star: np.ndarray,
):
    """
    Generate array of random starting parameters for variational posterior.

    Parameters
    ==========
    vp : VariationalPosterior
        Variational posterior to use as base.
    vb_type : {1, 2, 3}
        Type of method to create new starting parameters. Here 1 means
        starting from old variational parameters, 2 means starting from
        highest-posterior density training points, and 3 means starting
        from random provided training points.
    opts_N : int
        Number of random starting parameters.
    K_new : int
        New number of mixture components.
    X_star : np.ndarray, shape (N, D)
        Training inputs, usually HPD regions.
    y_star : np.ndarray, shape (N, 1)
        Training targets, usually HPD regions.

    Returns
    =======
    vp0_vec : np.ndarray, shape (opts_N,)
        The array of random starting parameters.
    type_vec : np.ndarray, shape (opts_N,)
        The array of types of each random starting parameter.
    """
    D = vp.D
    K = vp.K
    N_star = X_star.shape[0]

    type_vec = vb_type * np.ones((opts_N))
    lambd0 = vp.lambd.copy()
    mu0 = vp.mu.copy()
    w0 = vp.w.copy()

    if vb_type == 1:
        # Start from old variational parameters
        sigma0 = vp.sigma.copy()
    elif vb_type == 2:
        # Start from highest-posterior density training points
        if vp.optimize_mu:
            order = np.argsort(y_star, axis=None)[::-1]
            idx_order = np.tile(
                range(0, min(K_new, N_star)), (math.ceil(K_new / N_star),)
            )
            mu0 = X_star[order[idx_order[0:K_new]], :].T
        if K > 1:
            V = np.var(mu0, axis=1, ddof=1)
        else:
            V = np.var(X_star, axis=0, ddof=1)
        sigma0 = np.sqrt(np.mean(V / lambd0**2) / K_new) * np.exp(
            0.2 * np.random.randn(1, K_new)
        )
    else:
        # Start from random provided training points.
        if vp.optimize_mu:
            mu0 = np.zeros((D, K))
        sigma0 = np.zeros((1, K))

    vp0_list = []
    for i in range(0, opts_N):
        add_jitter = True
        mu = mu0.copy()
        sigma = sigma0.copy()
        lambd = lambd0.copy()
        if vp.optimize_weights:
            w = w0.copy()

        if vb_type == 1:
            # Start from old variational parameters
            # Copy previous parameters verbatim.
            if i == 0:
                add_jitter = False
            if K_new > vp.K:
                # Spawn a new component near an existing one
                for i_new in range(K, K_new):
                    idx = np.random.randint(0, K)
                    mu = np.hstack((mu, mu[:, idx : idx + 1]))
                    sigma = np.hstack((sigma, sigma[0:1, idx : idx + 1]))
                    mu[:, i_new : i_new + 1] += (
                        0.5
                        * sigma[0, i_new]
                        * lambd
                        * np.random.randn(D, 1)
                    )
                    if vp.optimize_sigma:
                        sigma[0, i_new] *= np.exp(0.2 * np.random.randn())
                    if vp.optimize_weights:
                        xi = 0.25 + 0.25 * np.random.rand()
                        w = np.hstack((w, xi * w[0:1, idx : idx + 1]))
                        w[0, idx] *= 1 - xi
        elif vb_type == 2:
            # Start from highest-posterior density training points
            if i == 0:
                add_jitter = False
            if vp.optimize_lambd:
                lambd = np.reshape(np.std(X_star, axis=0, ddof=1), (-1, 1))
                lambd *= np.sqrt(D / np.sum(lambd**2))
            if vp.optimize_weights:
                w = np.ones((1, K_new)) / K_new
        elif vb_type == 3:
            # Start from random provided training points
            if vp.optimize_mu:
                order = np.random.permutation(N_star)
                idx_order = np.tile(
                    range(0, min(K_new, N_star)),
                    (math.ceil(K_new / N_star),),
                )
                mu = X_star[order[idx_order[0:K_new]], :].T
            else:
                mu = mu0.copy()

            if vp.optimize_sigma:
                if K > 1:
                    V = np.var(mu, axis=1, ddof=1)
                else:
                    V = np.var(X_star, axis=0, ddof=1)
                sigma = np.sqrt(np.mean(V) / K_new) * np.exp(
                    0.2 * np.random.randn(1, K_new)
                )

            if vp.optimize_lambd:
                lambd = np.reshape(np.std(X_star, axis=0, ddof=1), (-1, 1))
                lambd *= np.sqrt(D / np.sum(lambd**2))

            if vp.optimize_weights:
                w = np.ones((1, K_new)) / K_new
        else:
            raise ValueError(
                "Unknown type for initialization of variational posteriors."
            )

        if add_jitter:
            if vp.optimize_mu:
                # When reproducing MATLAB numbers we need to use Fortran
                # order here; adding .T works with square shapes.
                mu += sigma * lambd * np.random.randn(
                    mu.shape[0], mu.shape[1]
                )

            if vp.optimize_sigma:
                sigma *= np.exp(0.2 * np.random.randn(1, K_new))

            if vp.optimize_lambd:
                lambd *= np.exp(0.2 * np.random.randn(D, 1))

            if vp.optimize_weights:
                w *= np.exp(0.2 * np.random.randn(1, K_new))
                w /= np.sum(w)

        new_vp = copy.deepcopy(vp)
        new_vp.K = K_new
        if vp.optimize_weights:
            new_vp.w = w
        else:
            new_vp.w = np.ones((1, K_new)) / K_new
        if vp.optimize_mu:
            new_vp.mu = mu
        else:
            new_vp.mu = mu0.copy()
        new_vp.sigma = sigma
        new_vp.lambd = lambd
        # TODO: just set to None?
        new_vp.eta = np.ones((1, K_new)) / K_new
        new_vp.bounds = None
        new_vp.stats = None
        vp0_list.append(new_vp)

    return np.array(vp0_list), type_vec


def _neg_elcbo(
    theta: np.ndarray,
    gp: gpr.GP,
    vp: VariationalPosterior,
    beta: float = 0.0,
    Ns: int = 0,
    compute_grad: bool = True,
    compute_var: int = None,
    theta_bnd: dict = None,
    _entropy_alpha: float = 0.0,
    separate_K: bool = False,
):
    """
    Negative evidence lower confidence bound (ELCBO) objective.

    Parameters
    ==========
    theta : np.ndarray
        Vector of variational parameters at which to evaluate NELCBO.
        Note that these should be transformed parameters.
    gp : GP
        Gaussian process from optimization.
    vp : VariationalPosterior
        Variational posterior for which to evaluate NELCBO.
    beta : float, defaults to 0.0
        Confidence weight.
    Ns : int, defaults to 0
        Number of samples for entropy.
    compute_grad : bool, defaults to True
        Whether to compute the gradient.
    compute_var : bool, optional
        Whether to compute the variance. If not given this is determined
        automatically.
    theta_bnd : dict, optional
        Soft bounds for theta.
    _entropy_alpha : float, defaults to 0.0 (currently unused)
        Parameter for lower/upper deterministic entropy interpolation.
    separate_K : bool, defaults to False
        Whether to return the expected log joint per component.

    Returns
    =======
    F : float
        Negative evidence lower confidence bound objective.
    dF : np.ndarray
        Gradient of NELCBO.
    G : object
        The expected variational log joint probability.
    H : float
        Entropy term.
    varF : float
        Variance of NELCBO.
    dH : np.ndarray
        Gradient of entropy term.
    varG_ss :
        Variance of the expected variational log joint, for each GP
        hyperparameter sample.
    varG :
        Variance of the expected variational log joint probability.
    varH : float
        Variance of entropy term.
    I_sk : np.ndarray
        The contribution to ``G`` per GP hyperparameter sample and per VP
        component.
    J_sjk : np.ndarray
        The contribution to ``varG`` per GP hyperparameter sample and per
        pair of VP components.
    """
    if not np.isfinite(beta):
        beta = 0
    if compute_var is None:
        compute_var = beta != 0

    if compute_grad and beta != 0 and compute_var != 2:
        raise NotImplementedError(
            "Computation of the gradient of ELBO with full variance not "
            "supported"
        )

    K = vp.K

    # Average over multiple GP hyperparameters if provided
    avg_flag = 1
    # Variational parameters are transformed
    jacobian_flag = 1

    # Reformat variational parameters from theta.
    vp.set_parameters(theta)
    if vp.optimize_weights:
        vp.eta = theta[-K:]
        vp.eta -= np.amax(vp.eta)
        vp.eta = np.reshape(vp.eta, (1, -1))
        # Doing the above is more numerically robust than the line below,
        # but it might cause slightly different results to MATLAB in some
        # cases.
        # vp.eta = np.reshape(theta[-K:], (1, -1))

    # Which gradients should be computed, if any?
    if compute_grad:
        grad_flags = (
            vp.optimize_mu,
            vp.optimize_sigma,
            vp.optimize_lambd,
            vp.optimize_weights,
        )
    else:
        grad_flags = (False, False, False, False)

    # Only weight optimization?
    # Not currently used, since it is only a speed optimization.
    # onlyweights_flag = (
    #     vp.optimize_weights
    #     and not vp.optimize_mu
    #     and not vp.optimize_sigma
    #     and not vp.optimize_lambd
    # )

    # Missing port: block below does not have branches for only weight
    #               optimization

    if separate_K:
        if compute_grad:
            raise ValueError(
                "Computing the gradient of variational parameters and "
                "requesting per-component results at the same time."
            )

        if compute_var:
            G, _, varG, _, varG_ss, I_sk, J_sjk = _gp_log_joint(
                vp,
                gp,
                grad_flags,
                avg_flag,
                jacobian_flag,
                compute_var,
                True,
            )
        else:
            G, dG, _, _, _, I_sk, _ = _gp_log_joint(
                vp, gp, grad_flags, avg_flag, jacobian_flag, 0, True
            )
            varG = varG_ss = 0
            J_sjk = None
    else:
        if compute_var:
            if compute_grad:
                G, dG, varG, dvarG, varG_ss = _gp_log_joint(
                    vp,
                    gp,
                    grad_flags,
                    avg_flag,
                    jacobian_flag,
                    compute_var,
                )
            else:
                G, _, varG, _, varG_ss = _gp_log_joint(
                    vp,
                    gp,
                    grad_flags,
                    avg_flag,
                    jacobian_flag,
                    compute_var,
                )
        else:
            G, dG, _, _, _ = _gp_log_joint(
                vp, gp, grad_flags, avg_flag, jacobian_flag, 0
            )
            varG = varG_ss = 0

    # Entropy term
    if Ns > 0:
        # Monte Carlo approximation
        H, dH = entmc_vbmc(vp, Ns, grad_flags, jacobian_flag)
    else:
        # Deterministic approximation via lower bound on the entropy
        H, dH = entlb_vbmc(vp, grad_flags, jacobian_flag)

    # Negative ELBO and its gradient
    F = -G - H
    if compute_grad:
        dF = -dG - dH
    else:
        dF = None
        dH = None

    # For the moment use zero variance for the entropy
    varH = 0
    if compute_var:
        varF = varG + varH
    else:
        varF = 0

    # Negative ELCBO (add confidence bound)
    if beta != 0:
        F += beta * np.sqrt(varF)
        if compute_grad:
            dF += 0.5 * beta * dvarG / np.sqrt(varF)

    # Additional loss for variational parameter bound violation (soft bounds)
    # and for weight size (if optimizing mixture weights).
    # Only done when optimizing the variational parameters, but not when
    # computing the EL(C)BO at each iteration.
    if theta_bnd is not None:
        if compute_grad:
            L, dL = _vp_bound_loss(
                vp, theta, theta_bnd, tol_con=theta_bnd["tol_con"]
            )
            dF += dL
        else:
            L = _vp_bound_loss(
                vp,
                theta,
                theta_bnd,
                tol_con=theta_bnd["tol_con"],
                compute_grad=False,
            )
        F += L

        # Penalty to reduce weight size.
        if vp.optimize_weights:
            thresh = theta_bnd["weight_threshold"]
            L = (
                np.sum(vp.w * (vp.w < thresh) + thresh * (vp.w >= thresh))
                * theta_bnd["weight_penalty"]
            )
            F += L
            if compute_grad:
                w_grad = theta_bnd["weight_penalty"] * (vp.w.ravel() < thresh)
                eta_sum = np.sum(np.exp(vp.eta))
                J_w = (
                    -np.exp(vp.eta).T * np.exp(vp.eta) / eta_sum**2
                ) + np.diag(np.exp(vp.eta.ravel()) / eta_sum)
                w_grad = np.dot(J_w, w_grad)
                dL = np.zeros(dF.shape)
                dL[-vp.K :] = w_grad
                dF += dL

    # Missing port: the way to return stuff here is not that good,
    #               though it works currently.
    if separate_K:
        return F, dF, G, H, varF, dH, varG_ss, varG, varH, I_sk, J_sjk

    return F, dF, G, H, varF
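

# Illustrative sketch (not part of the original module): the scalar
# relationship computed by `_neg_elcbo`, with made-up numbers. The negative
# ELBO is -(G + H); the confidence bound adds beta standard deviations of the
# ELBO estimate. The helper name and all values are hypothetical.
def _example_nelcbo_arithmetic():
    G = -12.5  # expected log joint (hypothetical)
    H = 3.0  # entropy (hypothetical)
    varF = 0.04  # variance of the ELBO estimate (hypothetical)
    beta = 3.0  # confidence weight (hypothetical)
    nelbo = -G - H  # = 9.5
    nelcbo = nelbo + beta * np.sqrt(varF)  # = 9.5 + 3 * 0.2 = 10.1
    return nelbo, nelcbo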


def _gp_log_joint(
    vp: VariationalPosterior,
    gp: gpr.GP,
    grad_flags,
    avg_flag: bool = True,
    jacobian_flag: bool = True,
    compute_var: bool = False,
    separate_K: bool = False,
):
    """
    Expected variational log joint probability via GP approximation.

    Parameters
    ==========
    vp : VariationalPosterior
        Variational posterior.
    gp : GP
        Gaussian process from optimization.
    grad_flags : object
        Flags on which gradients to compute. If a boolean then this sets
        all flags to the boolean value, and if a 4-tuple then each entry
        specifies which gradients to compute, in the order mu, sigma,
        lambd, w.
    avg_flag : bool, defaults to True
        Whether to average over multiple GP hyperparameters if provided.
    jacobian_flag : bool, defaults to True
        Whether variational parameters are transformed.
    compute_var : bool, defaults to False
        Whether to compute the variance.
    separate_K : bool, defaults to False
        Whether to return the expected log joint per component.

    Returns
    =======
    G : object
        The expected variational log joint probability.
    dG : np.ndarray
        The gradient.
    varG : np.ndarray, optional
        The variance.
    dvarG : np.ndarray, optional
        The gradient of the variance.
    var_ss : float
        Variance for each GP hyperparameter sample.
    I_sk : np.ndarray
        The contribution to ``G`` per GP hyperparameter sample and per VP
        component.
    J_sjk : np.ndarray
        The contribution to ``varG`` per GP hyperparameter sample and per
        pair of VP components.

    Raises
    ------
    NotImplementedError
        If the diagonal approximation of the variance is requested
        (``compute_var == 2``) or if the gradient of the variance is
        requested without the diagonal approximation.
    """
    if np.isscalar(grad_flags):
        if grad_flags:
            grad_flags = (True, True, True, True)
        else:
            grad_flags = (False, False, False, False)

    compute_vargrad = compute_var and np.any(grad_flags)
    if compute_vargrad and compute_var != 2:
        raise NotImplementedError(
            "Computation of gradient of log joint variance is currently "
            "available only for diagonal approximation of the variance."
        )

    D = vp.D
    K = vp.K
    N = gp.X.shape[0]
    mu = vp.mu.copy()
    sigma = vp.sigma.copy()
    lambd = vp.lambd.copy().reshape(-1, 1)

    w = vp.w.copy()[0, :]
    Ns = len(gp.posteriors)

    # TODO: once we get more mean functions, add a check here
    # if all(gp.meanfun ~= [0,1,4,6,8,10,12,14,16,18,20,22])
    #     error('gp_log_joint:UnsupportedMeanFun', ...
    #     'Log joint computation currently only supports zero, constant,
    #     negative quadratic, negative quadratic (fixed/isotropic),
    #     negative quadratic-only, or squared exponential mean functions.');
    # end

    # Which mean function is being used?
    quadratic_meanfun = isinstance(
        gp.mean, gpr.mean_functions.NegativeQuadratic
    )

    G = np.zeros((Ns,))
    # Check which gradients are computed
    if grad_flags[0]:
        mu_grad = np.zeros((D, K, Ns))
    if grad_flags[1]:
        sigma_grad = np.zeros((K, Ns))
    if grad_flags[2]:
        lambd_grad = np.zeros((D, Ns))
    if grad_flags[3]:
        w_grad = np.zeros((K, Ns))
    if compute_var:
        varG = np.zeros((Ns,))
    # Compute gradient of variance?
    if compute_vargrad:
        # TODO: compute vargrad is untested
        if grad_flags[0]:
            mu_vargrad = np.zeros((D, K, Ns))
        if grad_flags[1]:
            sigma_vargrad = np.zeros((K, Ns))
        if grad_flags[2]:
            lambd_vargrad = np.zeros((D, Ns))
        if grad_flags[3]:
            w_vargrad = np.zeros((K, Ns))

    # Store contribution to the log joint separately for each component?
    if separate_K:
        I_sk = np.zeros((Ns, K))
        if compute_var:
            J_sjk = np.zeros((Ns, K, K))

    Xt = np.zeros((K, D, N))
    for k in range(0, K):
        Xt[k, :, :] = np.reshape(mu[:, k], (-1, 1)) - gp.X.T

    # Number of GP hyperparameters
    cov_N = gp.covariance.hyperparameter_count(D)
    # mean_N = gp.mean.hyperparameter_count(D)
    noise_N = gp.noise.hyperparameter_count()

    # Loop over hyperparameter samples.
    # Missing port: below loop does not have code related to mean functions
    #               we haven't implemented in gpyreg
    for s in range(0, Ns):
        hyp = gp.posteriors[s].hyp

        # Extract GP hyperparameters from hyperparameter array.
        ell = np.exp(hyp[0:D]).reshape(-1, 1)
        ln_sf2 = 2 * hyp[D]
        sum_lnell = np.sum(hyp[0:D])

        # GP mean function hyperparameters
        if isinstance(gp.mean, gpr.mean_functions.ZeroMean):
            m0 = 0
        else:
            m0 = hyp[cov_N + noise_N]

        if quadratic_meanfun:
            xm = hyp[cov_N + noise_N + 1 : cov_N + noise_N + D + 1].reshape(
                -1, 1
            )
            omega = np.exp(hyp[cov_N + noise_N + D + 1 :]).reshape(-1, 1)

        # GP posterior parameters
        alpha = gp.posteriors[s].alpha
        L = gp.posteriors[s].L
        L_chol = gp.posteriors[s].L_chol
        sn2_eff = 1 / gp.posteriors[s].sW[0] ** 2

        for k in range(0, K):
            tau_k = np.sqrt(sigma[:, k] ** 2 * lambd**2 + ell**2)
            lnnf_k = (
                ln_sf2 + sum_lnell - np.sum(np.log(tau_k), axis=0)
            )  # Covariance normalization factor
            delta_k = Xt[k, :, :] / tau_k
            z_k = np.exp(lnnf_k - 0.5 * np.sum(delta_k**2, axis=0))
            I_k = np.dot(z_k, alpha).item() + m0

            if quadratic_meanfun:
                nu_k = (
                    -0.5
                    * np.sum(
                        1
                        / omega**2
                        * (
                            mu[:, k : k + 1] ** 2
                            + sigma[:, k] ** 2 * lambd**2
                            - 2 * mu[:, k : k + 1] * xm
                            + xm**2
                        ),
                        axis=0,
                    ).item()
                )
                I_k += nu_k

            G[s] += w[k] * I_k
            if separate_K:
                I_sk[s, k] = I_k

            if grad_flags[0]:
                dz_dmu = -(delta_k / tau_k) * z_k
                mu_grad[:, k, s : s + 1] = w[k] * np.dot(dz_dmu, alpha)
                if quadratic_meanfun:
                    mu_grad[:, k, s : s + 1] -= (
                        w[k] / omega**2 * (mu[:, k : k + 1] - xm)
                    )

            if grad_flags[1]:
                dz_dsigma = (
                    np.sum((lambd / tau_k) ** 2 * (delta_k**2 - 1), axis=0)
                    * sigma[:, k]
                    * z_k
                )
                sigma_grad[k, s] = w[k] * np.dot(dz_dsigma, alpha).item()
                if quadratic_meanfun:
                    sigma_grad[k, s] -= (
                        w[k]
                        * sigma[0, k]
                        * np.sum(1 / omega**2 * lambd**2, axis=0).item()
                    )

            if grad_flags[2]:
                dz_dlambd = (
                    (sigma[:, k] / tau_k) ** 2
                    * (delta_k**2 - 1)
                    * (lambd * z_k)
                )
                lambd_grad[:, s : s + 1] += w[k] * np.dot(dz_dlambd, alpha)
                if quadratic_meanfun:
                    lambd_grad[:, s : s + 1] -= (
                        w[k] * sigma[:, k] ** 2 / omega**2 * lambd
                    )

            if grad_flags[3]:
                w_grad[k, s] = I_k

            if compute_var == 2:
                # Missing port: compute_var == 2 skipped since it is not used
                raise NotImplementedError(
                    "Diagonal approximation of GP log-joint variance "
                    "not implemented."
                )
            elif compute_var:
                for j in range(0, k + 1):
                    tau_j = np.sqrt(sigma[:, j] ** 2 * lambd**2 + ell**2)
                    lnnf_j = (
                        ln_sf2 + sum_lnell - np.sum(np.log(tau_j), axis=0)
                    )
                    delta_j = (mu[:, j : j + 1] - gp.X.T) / tau_j
                    z_j = np.exp(lnnf_j - 0.5 * np.sum(delta_j**2, axis=0))

                    tau_jk = np.sqrt(
                        (sigma[:, j] ** 2 + sigma[:, k] ** 2) * lambd**2
                        + ell**2
                    )
                    lnnf_jk = ln_sf2 + sum_lnell - np.sum(np.log(tau_jk))
                    delta_jk = (mu[:, j : j + 1] - mu[:, k : k + 1]) / tau_jk

                    J_jk = np.exp(
                        lnnf_jk - 0.5 * np.sum(delta_jk**2, axis=0).item()
                    )
                    if L_chol:
                        J_jk -= np.dot(
                            z_k,
                            sp.linalg.solve_triangular(
                                L,
                                sp.linalg.solve_triangular(
                                    L, z_j, trans=1, check_finite=False
                                ),
                                trans=0,
                                check_finite=False,
                            )
                            / sn2_eff,
                        )
                    else:
                        J_jk += np.dot(z_k, np.dot(L, z_j.T))

                    # Off-diagonal elements are symmetric (count twice)
                    if j == k:
                        varG[s] += w[k] ** 2 * np.maximum(
                            np.spacing(1), J_jk
                        )
                        if separate_K:
                            J_sjk[s, k, k] = J_jk
                    else:
                        varG[s] += 2 * w[j] * w[k] * J_jk
                        if separate_K:
                            J_sjk[s, j, k] = J_jk
                            J_sjk[s, k, j] = J_jk

    # Correct for numerical error
    if compute_var:
        varG = np.maximum(varG, np.spacing(1))
    else:
        varG = None

    if np.any(grad_flags):
        grad_list = []
        if grad_flags[0]:
            mu_grad = np.reshape(mu_grad, (D * K, Ns), order="F")
            grad_list.append(mu_grad)

        # Correct for standard log reparametrization of sigma
        if jacobian_flag and grad_flags[1]:
            sigma_grad *= np.reshape(sigma, (-1, 1))
            grad_list.append(sigma_grad)

        # Correct for standard log reparametrization of lambd
        if jacobian_flag and grad_flags[2]:
            lambd_grad *= lambd
            grad_list.append(lambd_grad)

        # Correct for standard softmax reparametrization of w
        if jacobian_flag and grad_flags[3]:
            eta_sum = np.sum(np.exp(vp.eta))
            J_w = (
                -np.exp(vp.eta).T * np.exp(vp.eta) / eta_sum**2
                + np.diag(np.exp(vp.eta.ravel())) / eta_sum
            )
            w_grad = np.dot(J_w, w_grad)
            grad_list.append(w_grad)

        dG = np.concatenate(grad_list, axis=0)
    else:
        dG = None

    if compute_vargrad:
        # TODO: compute vargrad is untested
        vargrad_list = []
        if grad_flags[0]:
            mu_vargrad = np.reshape(mu_vargrad, (D * K, Ns))
            vargrad_list.append(mu_vargrad)

        # Correct for standard log reparametrization of sigma
        if jacobian_flag and grad_flags[1]:
            sigma_vargrad *= np.reshape(sigma_vargrad, (-1, 1))
            vargrad_list.append(sigma_vargrad)

        # Correct for standard log reparametrization of lambd
        if jacobian_flag and grad_flags[2]:
            lambd_vargrad *= lambd
            vargrad_list.append(lambd_vargrad)

        # Correct for standard softmax reparametrization of w
        if jacobian_flag and grad_flags[3]:
            w_vargrad = np.dot(J_w, w_vargrad)
            vargrad_list.append(w_vargrad)

        dvarG = np.concatenate(vargrad_list, axis=0)
    else:
        dvarG = None

    # Average over multiple hyperparameter samples
    var_ss = 0
    if Ns > 1 and avg_flag:
        G_bar = np.sum(G) / Ns
        if compute_var:
            # Estimated variance of the samples
            varG_ss = np.sum((G - G_bar) ** 2) / (Ns - 1)
            # Variability due to sampling
            var_ss = varG_ss + np.std(varG, ddof=1)
            varG = np.sum(varG, axis=0) / Ns + varG_ss
        if compute_vargrad:
            # TODO: compute vargrad is untested
            dvv = 2 * np.sum(G * dG, axis=1) / (Ns - 1) - 2 * G_bar * np.sum(
                dG, axis=1
            ) / (Ns - 1)
            dvarG = np.sum(dvarG, axis=1) / Ns + dvv
        G = G_bar
        if np.any(grad_flags):
            dG = np.sum(dG, axis=1) / Ns

    # Drop extra dims if Ns == 1
    if Ns == 1:
        G = G[0]
        if np.any(grad_flags):
            dG = dG[:, 0]

    if separate_K:
        return G, dG, varG, dvarG, var_ss, I_sk, J_sjk

    return G, dG, varG, dvarG, var_ss
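

# Illustrative sketch (not part of the original module): the softmax Jacobian
# used in `_neg_elcbo` and `_gp_log_joint` for the weight reparametrization,
# with made-up eta values. The matrix J_w = diag(w) - w w^T maps gradients
# with respect to the weights w into gradients with respect to the
# unconstrained parameters eta. The helper name is hypothetical.
def _example_softmax_jacobian():
    eta = np.array([[0.3, -0.1, 0.0]])  # hypothetical (1, K) eta
    eta_sum = np.sum(np.exp(eta))
    w = np.exp(eta.ravel()) / eta_sum  # softmax weights, sum to 1
    J_w = np.diag(w) - np.outer(w, w)  # same matrix as in the code above
    # Rows of J_w sum to zero, since the weights are constrained to sum to 1.
    assert np.allclose(J_w.sum(axis=1), 0.0)
    return w, J_w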