diff --git a/README.md b/README.md index f3bb745f..9bd000c7 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,10 @@ [![Coverage](https://codecov.io/gh/TuringLang/AdvancedVI.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/TuringLang/AdvancedVI.jl) # AdvancedVI.jl + [AdvancedVI](https://github.com/TuringLang/AdvancedVI.jl) provides implementations of variational inference (VI), a family of algorithms that aim for scalable approximate Bayesian inference by leveraging optimization. `AdvancedVI` is part of the [Turing](https://turinglang.org/stable/) probabilistic programming ecosystem. -The purpose of this package is to provide a common accessible interface for various VI algorithms and utilities so that other packages, e.g. `Turing`, only need to write a light wrapper for integration. +The purpose of this package is to provide a common, accessible interface for various VI algorithms and utilities so that other packages, e.g. `Turing`, only need to write a light wrapper for integration. For example, integrating `Turing` with `AdvancedVI.ADVI` only involves converting a `Turing.Model` into a [`LogDensityProblem`](https://github.com/tpapp/LogDensityProblems.jl) and extracting a corresponding `Bijectors.bijector`. ## Examples @@ -21,7 +22,8 @@ y &\sim \mathcal{N}\left(\mu_y, \sigma_y^2\right), \end{aligned} $$ -a `LogDensityProblem` can be implemented as +a `LogDensityProblem` can be implemented as + ```julia using LogDensityProblems using SimpleUnPack @@ -35,46 +37,50 @@ end function LogDensityProblems.logdensity(model::NormalLogNormal, θ) (; μ_x, σ_x, μ_y, Σ_y) = model - logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) + return logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) end function LogDensityProblems.dimension(model::NormalLogNormal) - length(model.μ_y) + 1 + return length(model.μ_y) + 1 end function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - LogDensityProblems.LogDensityOrder{0}() + return LogDensityProblems.LogDensityOrder{0}() end ``` -Since the support of `x` is constrained to be positive and VI is best done in the unconstrained Euclidean space, we need to use a *bijector* to transform `x` into unconstrained Euclidean space. We will use the [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) package for this purpose. +Since the support of `x` is constrained to be positive and VI is best done in the unconstrained Euclidean space, we need to use a *bijector* to transform `x` into unconstrained Euclidean space. We will use the [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) package for this purpose. This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. + ```julia using Bijectors function Bijectors.bijector(model::NormalLogNormal) (; μ_x, σ_x, μ_y, Σ_y) = model - Bijectors.Stacked( + return Bijectors.Stacked( Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:1+length(μ_y)]) + [1:1, 2:(1 + length(μ_y))], + ) end ``` A simpler approach is to use `Turing`, where a `Turing.Model` can automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. Let us instantiate a random normal-log-normal model.
+ ```julia using LinearAlgebra n_dims = 10 -μ_x = randn() -σ_x = exp.(randn()) -μ_y = randn(n_dims) -σ_y = exp.(randn(n_dims)) -model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)) +μ_x = randn() +σ_x = exp.(randn()) +μ_y = randn(n_dims) +σ_y = exp.(randn(n_dims)) +model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y .^ 2)) ``` We can perform VI with stochastic gradient descent (SGD) using reparameterization gradient estimates of the ELBO[^TL2014][^RMW2014][^KW2014] as follows: + ```julia using Optimisers using ADTypes, ForwardDiff @@ -82,7 +88,7 @@ using AdvancedVI # ELBO objective with the reparameterization gradient n_montecarlo = 10 -elbo = AdvancedVI.RepGradELBO(n_montecarlo) +elbo = AdvancedVI.RepGradELBO(n_montecarlo) # Mean-field Gaussian variational family d = LogDensityProblems.dimension(model) @@ -91,11 +97,10 @@ L = Diagonal(ones(d)) q = AdvancedVI.MeanFieldGaussian(μ, L) # Match support by applying the `model`'s inverse bijector -b = Bijectors.bijector(model) -binv = inverse(b) +b = Bijectors.bijector(model) +binv = inverse(b) q_transformed = Bijectors.TransformedDistribution(q, binv) - # Run inference max_iter = 10^3 q_avg, _, stats, _ = AdvancedVI.optimize( @@ -103,8 +108,8 @@ q_avg, _, stats, _ = AdvancedVI.optimize( elbo, q_transformed, max_iter; - adtype = ADTypes.AutoForwardDiff(), - optimizer = Optimisers.Adam(1e-3) + adtype=ADTypes.AutoForwardDiff(), + optimizer=Optimisers.Adam(1e-3), ) # Evaluate final ELBO with 10^3 Monte Carlo samples diff --git a/bench/README.md b/bench/README.md index 5c5214d8..8a8f5163 100644 --- a/bench/README.md +++ b/bench/README.md @@ -1,7 +1,5 @@ - # AdvancedVI.jl Continuous Benchmarking This subdirectory contains code for continuous benchmarking of the performance of `AdvancedVI.jl`. The initial version was heavily inspired by the setup of [Lux.jl](https://github.com/LuxDL/Lux.jl/tree/main). The Github action and pages integration is provided by https://github.com/benchmark-action/github-action-benchmark/ and [BenchmarkTools.jl](https://github.com/JuliaCI/BenchmarkTools.jl). 
- diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 8585fa8c..551e12b2 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -33,25 +33,21 @@ const SUITES = BenchmarkGroup() # n_montecarlo = 4, # ) -SUITES["normal + bijector"]["meanfield"]["ReverseDiff"] = - @benchmarkable normallognormal( - ; - fptype = Float64, - adtype = AutoReverseDiff(), - family = :meanfield, - objective = :RepGradELBO, - n_montecarlo = 4, - ) - -SUITES["normal + bijector"]["meanfield"]["ForwardDiff"] = - @benchmarkable normallognormal( - ; - fptype = Float64, - adtype = AutoForwardDiff(), - family = :meanfield, - objective = :RepGradELBO, - n_montecarlo = 4, - ) +SUITES["normal + bijector"]["meanfield"]["ReverseDiff"] = @benchmarkable normallognormal(; + fptype=Float64, + adtype=AutoReverseDiff(), + family=:meanfield, + objective=:RepGradELBO, + n_montecarlo=4, +) + +SUITES["normal + bijector"]["meanfield"]["ForwardDiff"] = @benchmarkable normallognormal(; + fptype=Float64, + adtype=AutoForwardDiff(), + family=:meanfield, + objective=:RepGradELBO, + n_montecarlo=4, +) BenchmarkTools.tune!(SUITES; verbose=true) results = BenchmarkTools.run(SUITES; verbose=true) diff --git a/bench/normallognormal.jl b/bench/normallognormal.jl index 15d5a5a0..075bf3dc 100644 --- a/bench/normallognormal.jl +++ b/bench/normallognormal.jl @@ -8,49 +8,49 @@ end function LogDensityProblems.logdensity(model::NormalLogNormal, θ) (; μ_x, σ_x, μ_y, Σ_y) = model - logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) + return logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) end function LogDensityProblems.dimension(model::NormalLogNormal) - length(model.μ_y) + 1 + return length(model.μ_y) + 1 end function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - LogDensityProblems.LogDensityOrder{0}() + return LogDensityProblems.LogDensityOrder{0}() end function Bijectors.bijector(model::NormalLogNormal) (; μ_x, σ_x, μ_y, Σ_y) = model - Bijectors.Stacked( + return Bijectors.Stacked( Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:1+length(μ_y)]) + [1:1, 2:(1 + length(μ_y))], + ) end -function normallognormal(; fptype, adtype, family, objective, kwargs...) +function normallognormal(; fptype, adtype, family, objective, max_iter=10^3, kwargs...) n_dims = 10 - μ_x = fptype(5.0) - σ_x = fptype(0.3) - μ_y = Fill(fptype(5.0), n_dims) - σ_y = Fill(fptype(0.3), n_dims) - model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)) + μ_x = fptype(5.0) + σ_x = fptype(0.3) + μ_y = Fill(fptype(5.0), n_dims) + σ_y = Fill(fptype(0.3), n_dims) + model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y .^ 2)) obj = variational_objective(objective; kwargs...) 
d = LogDensityProblems.dimension(model) - q = variational_standard_mvnormal(fptype, d, family) + q = variational_standard_mvnormal(fptype, d, family) - b = Bijectors.bijector(model) - binv = inverse(b) + b = Bijectors.bijector(model) + binv = inverse(b) q_transformed = Bijectors.TransformedDistribution(q, binv) - max_iter = 10^3 - AdvancedVI.optimize( + return AdvancedVI.optimize( model, obj, q_transformed, max_iter; adtype, - optimizer = Optimisers.Adam(fptype(1e-3)), - show_progress = false, + optimizer=Optimisers.Adam(fptype(1e-3)), + show_progress=false, ) end diff --git a/bench/utils.jl b/bench/utils.jl index 31c87c3d..d95741cd 100644 --- a/bench/utils.jl +++ b/bench/utils.jl @@ -1,13 +1,9 @@ function variational_standard_mvnormal(type::Type, n_dims::Int, family::Symbol) if family == :meanfield - AdvancedVI.MeanFieldGaussian( - zeros(type, n_dims), Diagonal(ones(type, n_dims)) - ) + AdvancedVI.MeanFieldGaussian(zeros(type, n_dims), Diagonal(ones(type, n_dims))) else - AdvancedVI.FullRankGaussian( - zeros(type, n_dims), Matrix(type, I, n_dims, n_dims) - ) + AdvancedVI.FullRankGaussian(zeros(type, n_dims), Matrix(type, I, n_dims, n_dims)) end end @@ -15,6 +11,10 @@ function variational_objective(objective::Symbol; kwargs...) if objective == :RepGradELBO AdvancedVI.RepGradELBO(kwargs[:n_montecarlo]) elseif objective == :RepGradELBOSTL - AdvancedVI.RepGradELBO(kwargs[:n_montecarlo], entropy=StickingTheLandingEntropy()) + AdvancedVI.RepGradELBO(kwargs[:n_montecarlo]; entropy=StickingTheLandingEntropy()) + elseif objective == :ScoreGradELBO + throw("ScoreGradELBO not supported yet. Please use ScoreGradELBOSTL instead.") + elseif objective == :ScoreGradELBOSTL + AdvancedVI.ScoreGradELBO(kwargs[:n_montecarlo]; entropy=StickingTheLandingEntropy()) end end diff --git a/docs/src/elbo/overview.md b/docs/src/elbo/overview.md index 4afac4db..db9b598e 100644 --- a/docs/src/elbo/overview.md +++ b/docs/src/elbo/overview.md @@ -1,5 +1,5 @@ - # [Evidence Lower Bound Maximization](@id elbomax) + ## Introduction Evidence lower bound (ELBO) maximization[^JGJS1999] is a general family of algorithms that minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence between the target distribution ``\pi`` and a variational approximation ``q_{\lambda}``. @@ -8,15 +8,19 @@ More generally, they aim to solve the following problem: ```math \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right), ``` + where $$\mathcal{Q}$$ is some family of distributions, often called the variational family. Since the target distribution ``\pi`` is intractable in general, the KL divergence is also intractable. Instead, the ELBO maximization strategy maximizes a surrogate objective, the *ELBO*: + ```math \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), ``` + which serves as a lower bound to the KL. The ELBO and its gradient can be readily estimated through various strategies. Overall, ELBO maximization algorithms aim to solve the problem: + ```math \mathrm{maximize}_{q \in \mathcal{Q}}\quad \mathrm{ELBO}\left(q\right). ``` @@ -24,13 +28,15 @@ Overall, ELBO maximization algorithms aim to solve the problem: Multiple ways to solve this problem exist, each leading to a different variational inference algorithm. ## Algorithms + Currently, `AdvancedVI` only provides the approach known as black-box variational inference (also known as Monte Carlo VI, Stochastic Gradient VI). 
(Introduced independently by two groups [^RGB2014][^TL2014] in 2014.) In particular, `AdvancedVI` focuses on the reparameterization gradient estimator[^TL2014][^RMW2014][^KW2014], which is generally superior to alternative strategies[^XQKS2019], discussed in the following section: -* [RepGradELBO](@ref repgradelbo) + + - [RepGradELBO](@ref repgradelbo) [^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. -[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. +[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. [^XQKS2019]: Xu, M., Quiroz, M., Kohn, R., & Sisson, S. A. (2019). Variance reduction properties of the reparameterization trick. In *The International Conference on Artificial Intelligence and Statistics*. diff --git a/docs/src/general.md b/docs/src/general.md index 07a240e1..f4a8281d 100644 --- a/docs/src/general.md +++ b/docs/src/general.md @@ -1,13 +1,14 @@ - # [General Usage](@id general) Each VI algorithm provides the following: -1. Variational families supported by each VI algorithm. -2. A variational objective corresponding to the VI algorithm. -Note that each variational family is subject to its own constraints. -Thus, please refer to the documentation of the variational inference algorithm of interest. + + 1. Variational families supported by each VI algorithm. + 2. A variational objective corresponding to the VI algorithm. + Note that each variational family is subject to its own constraints. + Thus, please refer to the documentation of the variational inference algorithm of interest. ## Optimizing a Variational Objective + After constructing a *variational objective* `objective` and initializing a *variational approximation*, one can optimize `objective` by calling `optimize`: ```@docs optimize ``` ## Estimating the Objective + In some cases, it is useful to directly estimate the objective value. This can be done by the following function: + ```@docs estimate_objective ``` !!! info - Note that `estimate_objective` is not expected to be differentiated through, and may not result in optimal statistical performance. + + Note that `estimate_objective` is not expected to be differentiated through, and may not result in optimal statistical performance. ## Advanced Usage + Each variational objective is a subtype of the following abstract type: + ```@docs AdvancedVI.AbstractVariationalObjective ``` Furthermore, `AdvancedVI` only interacts with each variational objective by querying gradient estimates. Therefore, to create a new custom objective to be optimized through `AdvancedVI`, it suffices to implement the following function: + ```@docs AdvancedVI.estimate_gradient! ``` If an objective needs to be stateful, one can implement the following function to initialize the state.
+ ```@docs AdvancedVI.init ``` diff --git a/docs/src/index.md b/docs/src/index.md index feb7adff..f177cd72 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -5,10 +5,13 @@ CurrentModule = AdvancedVI # AdvancedVI ## Introduction + [AdvancedVI](https://github.com/TuringLang/AdvancedVI.jl) provides implementations of variational Bayesian inference (VI) algorithms. VI algorithms perform scalable and computationally efficient Bayesian inference at the cost of asymptotic exactness. `AdvancedVI` is part of the [Turing](https://turinglang.org/stable/) probabilistic programming ecosystem. ## Provided Algorithms + `AdvancedVI` currently provides the following algorithm for evidence lower bound maximization: -- [Evidence Lower-Bound Maximization](@ref elbomax) + + - [Evidence Lower-Bound Maximization](@ref elbomax) diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index 5402e075..4c3c39cc 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -170,10 +170,12 @@ Estimate the entropy of `q`. """ function estimate_entropy end -export RepGradELBO, ClosedFormEntropy, StickingTheLandingEntropy, MonteCarloEntropy +export RepGradELBO, + ScoreGradELBO, ClosedFormEntropy, StickingTheLandingEntropy, MonteCarloEntropy include("objectives/elbo/entropy.jl") include("objectives/elbo/repgradelbo.jl") +include("objectives/elbo/scoregradelbo.jl") # Variational Families export MvLocationScale, MeanFieldGaussian, FullRankGaussian diff --git a/src/objectives/elbo/entropy.jl b/src/objectives/elbo/entropy.jl index 210b49ca..fa34022a 100644 --- a/src/objectives/elbo/entropy.jl +++ b/src/objectives/elbo/entropy.jl @@ -37,3 +37,10 @@ function estimate_entropy( -logpdf(q, mc_sample) end end + +function estimate_entropy_maybe_stl( + entropy_estimator::AbstractEntropyEstimator, samples, q, q_stop +) + q_maybe_stop = maybe_stop_entropy_score(entropy_estimator, q, q_stop) + return estimate_entropy(entropy_estimator, samples, q_maybe_stop) +end diff --git a/src/objectives/elbo/repgradelbo.jl b/src/objectives/elbo/repgradelbo.jl index e6f04ae8..b8bf63fa 100644 --- a/src/objectives/elbo/repgradelbo.jl +++ b/src/objectives/elbo/repgradelbo.jl @@ -45,13 +45,6 @@ function Base.show(io::IO, obj::RepGradELBO) return print(io, ")") end -function estimate_entropy_maybe_stl( - entropy_estimator::AbstractEntropyEstimator, samples, q, q_stop -) - q_maybe_stop = maybe_stop_entropy_score(entropy_estimator, q, q_stop) - return estimate_entropy(entropy_estimator, samples, q_maybe_stop) -end - function estimate_energy_with_samples(prob, samples) return mean(Base.Fix1(LogDensityProblems.logdensity, prob), eachsample(samples)) end diff --git a/src/objectives/elbo/scoregradelbo.jl b/src/objectives/elbo/scoregradelbo.jl new file mode 100644 index 00000000..053c6b3f --- /dev/null +++ b/src/objectives/elbo/scoregradelbo.jl @@ -0,0 +1,139 @@ +""" + ScoreGradELBO(n_samples; kwargs...) + +Evidence lower-bound objective computed with score function gradients. +```math +\\begin{aligned} +\\nabla_{\\lambda} \\mathrm{ELBO}\\left(\\lambda\\right) +&= +\\mathbb{E}_{z \\sim q_{\\lambda}}\\left[ + \\log \\pi\\left(z\\right) \\nabla_{\\lambda} \\log q_{\\lambda}(z) +\\right] ++ \\nabla_{\\lambda} \\mathbb{H}\\left(q_{\\lambda}\\right), +\\end{aligned} +``` + +To reduce the variance of the gradient estimator, we use a baseline computed from a running average of the previous ELBO values and subtract it from the objective.
+ +```math +\\mathbb{E}_{z \\sim q_{\\lambda}}\\left[ + \\nabla_{\\lambda} \\log q_{\\lambda}(z) \\left(\\pi\\left(z\\right) - \\beta\\right) +\\right] +``` + +# Arguments +- `n_samples::Int`: Number of Monte Carlo samples used to estimate the ELBO. + +# Keyword Arguments +- `entropy`: The estimator for the entropy term. (Type `<: AbstractEntropyEstimator`; Default: `ClosedFormEntropy()`) +- `baseline_window_size::Int`: The window size to use to compute the baseline. (Default: `10`) +- `baseline_history::Vector{Float64}`: The history of the baseline. (Default: `Float64[]`) + +# Requirements +- The variational approximation ``q_{\\lambda}`` implements `rand` and `logpdf`. +- `logpdf(q, x)` must be differentiable with respect to `q` by the selected AD backend. +- The target distribution and the variational approximation have the same support. + +Depending on the options, additional requirements on ``q_{\\lambda}`` may apply. +""" +struct ScoreGradELBO{EntropyEst<:AbstractEntropyEstimator} <: + AdvancedVI.AbstractVariationalObjective + entropy::EntropyEst + n_samples::Int + baseline_window_size::Int + baseline_history::Vector{Float64} +end + +function ScoreGradELBO( + n_samples::Int; + entropy::AbstractEntropyEstimator=ClosedFormEntropy(), + baseline_window_size::Int=10, + baseline_history::Vector{Float64}=Float64[], +) + return ScoreGradELBO(entropy, n_samples, baseline_window_size, baseline_history) +end + +function Base.show(io::IO, obj::ScoreGradELBO) + print(io, "ScoreGradELBO(entropy=") + print(io, obj.entropy) + print(io, ", n_samples=") + print(io, obj.n_samples) + print(io, ", baseline_window_size=") + print(io, obj.baseline_window_size) + return print(io, ")") +end + +function compute_control_variate_baseline(history, window_size) + if length(history) == 0 + return 1.0 + end + min_index = max(1, length(history) - window_size) + return mean(history[min_index:end]) +end + +function estimate_energy_with_samples( + prob, samples_stop, samples_logprob, samples_logprob_stop, baseline +) + fv = Base.Fix1(LogDensityProblems.logdensity, prob).(eachsample(samples_stop)) + fv_mean = mean(fv) + score_grad = mean(@. samples_logprob * (fv - baseline)) + score_grad_stop = mean(@. 
samples_logprob_stop * (fv - baseline)) + return fv_mean + (score_grad - score_grad_stop) +end + +function estimate_objective( + rng::Random.AbstractRNG, obj::ScoreGradELBO, q, prob; n_samples::Int=obj.n_samples +) + samples, entropy = reparam_with_entropy(rng, q, q, obj.n_samples, obj.entropy) + energy = map(Base.Fix1(LogDensityProblems.logdensity, prob), eachsample(samples)) + return mean(energy) + entropy +end + +function estimate_objective(obj::ScoreGradELBO, q, prob; n_samples::Int=obj.n_samples) + return estimate_objective(Random.default_rng(), obj, q, prob; n_samples) +end + +function estimate_scoregradelbo_ad_forward(params′, aux) + @unpack rng, obj, problem, adtype, restructure, q_stop = aux + baseline = compute_control_variate_baseline( + obj.baseline_history, obj.baseline_window_size + ) + q = restructure_ad_forward(adtype, restructure, params′) + samples_stop = rand(rng, q_stop, obj.n_samples) + entropy = estimate_entropy_maybe_stl(obj.entropy, samples_stop, q, q_stop) + samples_logprob = logpdf.(Ref(q), AdvancedVI.eachsample(samples_stop)) + samples_logprob_stop = logpdf.(Ref(q_stop), AdvancedVI.eachsample(samples_stop)) + energy = estimate_energy_with_samples( + problem, samples_stop, samples_logprob, samples_logprob_stop, baseline + ) + elbo = energy + entropy + return -elbo +end + +function AdvancedVI.estimate_gradient!( + rng::Random.AbstractRNG, + obj::ScoreGradELBO, + adtype::ADTypes.AbstractADType, + out::DiffResults.MutableDiffResult, + prob, + params, + restructure, + state, +) + q_stop = restructure(params) + aux = ( + rng=rng, + adtype=adtype, + obj=obj, + problem=prob, + restructure=restructure, + q_stop=q_stop, + ) + AdvancedVI.value_and_gradient!( + adtype, estimate_scoregradelbo_ad_forward, params, aux, out + ) + nelbo = DiffResults.value(out) + stat = (elbo=-nelbo,) + push!(obj.baseline_history, -nelbo) + return out, nothing, stat +end diff --git a/src/optimize.jl b/src/optimize.jl index eb462ff5..9a748907 100644 --- a/src/optimize.jl +++ b/src/optimize.jl @@ -73,7 +73,6 @@ function optimize( for t in 1:max_iter stat = (iteration=t,) - grad_buf, obj_st, stat′ = estimate_gradient!( rng, objective, diff --git a/test/inference/scoregradelbo_distributionsad.jl b/test/inference/scoregradelbo_distributionsad.jl new file mode 100644 index 00000000..700dda6d --- /dev/null +++ b/test/inference/scoregradelbo_distributionsad.jl @@ -0,0 +1,101 @@ + +AD_distributionsad = Dict( + :ForwarDiff => AutoForwardDiff(), + #:ReverseDiff => AutoReverseDiff(), # DistributionsAD doesn't support ReverseDiff at the moment + :Zygote => AutoZygote(), +) + +if @isdefined(Tapir) + AD_distributionsad[:Tapir] = AutoTapir(; safe_mode=false) +end + +#if @isdefined(Enzyme) +# AD_distributionsad[:Enzyme] = AutoEnzyme() +#end + +@testset "inference RepGradELBO DistributionsAD" begin + @testset "$(modelname) $(objname) $(realtype) $(adbackname)" for realtype in + [Float64, Float32], + (modelname, modelconstr) in Dict(:Normal => normal_meanfield), + n_montecarlo in [1, 10], + (objname, objective) in Dict( + :ScoreGradELBOClosedFormEntropy => ScoreGradELBO(n_montecarlo), + :ScoreGradELBOStickingTheLanding => + ScoreGradELBO(n_montecarlo; entropy=StickingTheLandingEntropy()), + ), + (adbackname, adtype) in AD_distributionsad + + seed = (0x38bef07cf9cc549d) + rng = StableRNG(seed) + + modelstats = modelconstr(rng, realtype) + @unpack model, μ_true, L_true, n_dims, strong_convexity, is_meanfield = modelstats + + T = 1000 + η = 1e-5 + opt = Optimisers.Descent(realtype(η)) + + # For small enough η, the 
error of SGD, Δλ, is bounded as + # Δλ ≤ ρ^T Δλ0 + O(η), + # where ρ = 1 - ημ, μ is the strong convexity constant. + contraction_rate = 1 - η * strong_convexity + + μ0 = zeros(realtype, n_dims) + L0 = Diagonal(ones(realtype, n_dims)) + q0 = TuringDiagMvNormal(μ0, diag(L0)) + + @testset "convergence" begin + Δλ0 = sum(abs2, μ0 - μ_true) + sum(abs2, L0 - L_true) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + + μ = mean(q_avg) + L = sqrt(cov(q_avg)) + Δλ = sum(abs2, μ - μ_true) + sum(abs2, L - L_true) + + @test Δλ ≤ contraction_rate^(T / 2) * Δλ0 + @test eltype(μ) == eltype(μ_true) + @test eltype(L) == eltype(L_true) + end + + @testset "determinism" begin + rng = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ = mean(q_avg) + L = sqrt(cov(q_avg)) + + rng_repl = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng_repl, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ_repl = mean(q_avg) + L_repl = sqrt(cov(q_avg)) + @test μ ≈ μ_repl rtol = 1e-5 + @test L ≈ L_repl rtol = 1e-5 + end + end +end diff --git a/test/inference/scoregradelbo_locationscale.jl b/test/inference/scoregradelbo_locationscale.jl new file mode 100644 index 00000000..ef49713b --- /dev/null +++ b/test/inference/scoregradelbo_locationscale.jl @@ -0,0 +1,105 @@ + +AD_locationscale = Dict( + :ForwarDiff => AutoForwardDiff(), + :ReverseDiff => AutoReverseDiff(), + :Zygote => AutoZygote(), +) + +if @isdefined(Tapir) + AD_locationscale[:Tapir] = AutoTapir(; safe_mode=false) +end + +if @isdefined(Enzyme) + AD_locationscale[:Enzyme] = AutoEnzyme() +end + +@testset "inference ScoreGradELBO VILocationScale" begin + @testset "$(modelname) $(objname) $(realtype) $(adbackname)" for realtype in + [Float64, Float32], + (modelname, modelconstr) in + Dict(:Normal => normal_meanfield, :Normal => normal_fullrank), + n_montecarlo in [1, 10], + (objname, objective) in Dict( + :ScoreGradELBOClosedFormEntropy => ScoreGradELBO(n_montecarlo), + :ScoreGradELBOStickingTheLanding => + ScoreGradELBO(n_montecarlo; entropy=StickingTheLandingEntropy()), + ), + (adbackname, adtype) in AD_locationscale + + seed = (0x38bef07cf9cc549d) + rng = StableRNG(seed) + + modelstats = modelconstr(rng, realtype) + @unpack model, μ_true, L_true, n_dims, strong_convexity, is_meanfield = modelstats + + T = 1000 + η = 1e-5 + opt = Optimisers.Descent(realtype(η)) + + # For small enough η, the error of SGD, Δλ, is bounded as + # Δλ ≤ ρ^T Δλ0 + O(η), + # where ρ = 1 - ημ, μ is the strong convexity constant. 
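+ # The O(η) term is not tracked explicitly, so the convergence check below uses the looser threshold contraction_rate^(T / 2) * Δλ0 to leave slack for it.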
+ contraction_rate = 1 - η * strong_convexity + + q0 = if is_meanfield + MeanFieldGaussian(zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims))) + else + L0 = LowerTriangular(Matrix{realtype}(I, n_dims, n_dims)) + FullRankGaussian(zeros(realtype, n_dims), L0) + end + + @testset "convergence" begin + Δλ0 = sum(abs2, q0.location - μ_true) + sum(abs2, q0.scale - L_true) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + + μ = q_avg.location + L = q_avg.scale + Δλ = sum(abs2, μ - μ_true) + sum(abs2, L - L_true) + + @test Δλ ≤ contraction_rate^(T / 2) * Δλ0 + @test eltype(μ) == eltype(μ_true) + @test eltype(L) == eltype(L_true) + end + + @testset "determinism" begin + rng = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ = q_avg.location + L = q_avg.scale + + rng_repl = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng_repl, + model, + objective, + q0, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ_repl = q_avg.location + L_repl = q_avg.scale + @test μ ≈ μ_repl rtol = 1e-3 + @test L ≈ L_repl rtol = 1e-3 + end + end +end diff --git a/test/inference/scoregradelbo_locationscale_bijectors.jl b/test/inference/scoregradelbo_locationscale_bijectors.jl new file mode 100644 index 00000000..088130aa --- /dev/null +++ b/test/inference/scoregradelbo_locationscale_bijectors.jl @@ -0,0 +1,111 @@ + +AD_locationscale_bijectors = Dict( + :ForwarDiff => AutoForwardDiff(), + :ReverseDiff => AutoReverseDiff(), + #:Zygote => AutoZygote(), +) + +#if @isdefined(Tapir) +# AD_locationscale_bijectors[:Tapir] = AutoTapir(; safe_mode=false) +#end + +if @isdefined(Enzyme) + AD_locationscale_bijectors[:Enzyme] = AutoEnzyme() +end + +@testset "inference ScoreGradELBO VILocationScale Bijectors" begin + @testset "$(modelname) $(objname) $(realtype) $(adbackname)" for realtype in + [Float64, Float32], + (modelname, modelconstr) in + Dict(:NormalLogNormalMeanField => normallognormal_meanfield), + n_montecarlo in [1, 10], + (objname, objective) in Dict( + #:ScoreGradELBOClosedFormEntropy => ScoreGradELBO(n_montecarlo), # not supported yet. + :ScoreGradELBOStickingTheLanding => + ScoreGradELBO(n_montecarlo; entropy=StickingTheLandingEntropy()), + ), + (adbackname, adtype) in AD_locationscale_bijectors + + seed = (0x38bef07cf9cc549d) + rng = StableRNG(seed) + + modelstats = modelconstr(rng, realtype) + @unpack model, μ_true, L_true, n_dims, strong_convexity, is_meanfield = modelstats + + T = 1000 + η = 1e-5 + opt = Optimisers.Descent(realtype(η)) + + b = Bijectors.bijector(model) + b⁻¹ = inverse(b) + μ0 = Zeros(realtype, n_dims) + L0 = Diagonal(Ones(realtype, n_dims)) + + q0_η = if is_meanfield + MeanFieldGaussian(zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims))) + else + L0 = LowerTriangular(Matrix{realtype}(I, n_dims, n_dims)) + FullRankGaussian(zeros(realtype, n_dims), L0) + end + q0_z = Bijectors.transformed(q0_η, b⁻¹) + + # For small enough η, the error of SGD, Δλ, is bounded as + # Δλ ≤ ρ^T Δλ0 + O(η), + # where ρ = 1 - ημ, μ is the strong convexity constant. 
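+ # As in the other inference tests, the O(η) term is absorbed by testing against the looser threshold contraction_rate^(T / 2) * Δλ0.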
+ contraction_rate = 1 - η * strong_convexity + + @testset "convergence" begin + Δλ0 = sum(abs2, μ0 - μ_true) + sum(abs2, L0 - L_true) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0_z, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + + μ = q_avg.dist.location + L = q_avg.dist.scale + Δλ = sum(abs2, μ - μ_true) + sum(abs2, L - L_true) + + @test Δλ ≤ contraction_rate^(T / 2) * Δλ0 + @test eltype(μ) == eltype(μ_true) + @test eltype(L) == eltype(L_true) + end + + @testset "determinism" begin + rng = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng, + model, + objective, + q0_z, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ = q_avg.dist.location + L = q_avg.dist.scale + + rng_repl = StableRNG(seed) + q_avg, _, stats, _ = optimize( + rng_repl, + model, + objective, + q0_z, + T; + optimizer=opt, + show_progress=PROGRESS, + adtype=adtype, + ) + μ_repl = q_avg.dist.location + L_repl = q_avg.dist.scale + @test μ ≈ μ_repl rtol = 1e-3 + @test L ≈ L_repl rtol = 1e-3 + end + end +end diff --git a/test/interface/scoregradelbo.jl b/test/interface/scoregradelbo.jl new file mode 100644 index 00000000..a800f744 --- /dev/null +++ b/test/interface/scoregradelbo.jl @@ -0,0 +1,57 @@ + +using Test + +@testset "interface ScoreGradELBO" begin + seed = (0x38bef07cf9cc549d) + rng = StableRNG(seed) + + modelstats = normal_meanfield(rng, Float64) + + @unpack model, μ_true, L_true, n_dims, is_meanfield = modelstats + + q0 = TuringDiagMvNormal(zeros(Float64, n_dims), ones(Float64, n_dims)) + + obj = ScoreGradELBO(10) + rng = StableRNG(seed) + elbo_ref = estimate_objective(rng, obj, q0, model; n_samples=10^4) + + @testset "determinism" begin + rng = StableRNG(seed) + elbo = estimate_objective(rng, obj, q0, model; n_samples=10^4) + @test elbo == elbo_ref + end + + @testset "default_rng" begin + elbo = estimate_objective(obj, q0, model; n_samples=10^4) + @test elbo ≈ elbo_ref rtol = 0.2 + end +end + +@testset "interface ScoreGradELBO STL variance reduction" begin + seed = (0x38bef07cf9cc549d) + rng = StableRNG(seed) + + modelstats = normal_meanfield(rng, Float64) + @unpack model, μ_true, L_true, n_dims, is_meanfield = modelstats + + @testset for ad in [ + ADTypes.AutoForwardDiff(), ADTypes.AutoReverseDiff(), ADTypes.AutoZygote() + ] + q_true = MeanFieldGaussian( + Vector{eltype(μ_true)}(μ_true), Diagonal(Vector{eltype(L_true)}(diag(L_true))) + ) + params, re = Optimisers.destructure(q_true) + obj = ScoreGradELBO( + 1000; entropy=StickingTheLandingEntropy(), baseline_history=[0.0] + ) + out = DiffResults.DiffResult(zero(eltype(params)), similar(params)) + + aux = (rng=rng, obj=obj, problem=model, restructure=re, q_stop=q_true, adtype=ad) + AdvancedVI.value_and_gradient!( + ad, AdvancedVI.estimate_scoregradelbo_ad_forward, params, aux, out + ) + value = DiffResults.value(out) + grad = DiffResults.gradient(out) + @test norm(grad) ≈ 0 atol = 10 # high tolerance required. 
+ end +end diff --git a/test/runtests.jl b/test/runtests.jl index 43958e8e..85bec3a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -50,6 +50,7 @@ if GROUP == "All" || GROUP == "Interface" include("interface/ad.jl") include("interface/optimize.jl") include("interface/repgradelbo.jl") + include("interface/scoregradelbo.jl") include("interface/rules.jl") include("interface/averaging.jl") end @@ -65,4 +66,7 @@ if GROUP == "All" || GROUP == "Inference" include("inference/repgradelbo_distributionsad.jl") include("inference/repgradelbo_locationscale.jl") include("inference/repgradelbo_locationscale_bijectors.jl") + include("inference/scoregradelbo_distributionsad.jl") + include("inference/scoregradelbo_locationscale.jl") + include("inference/scoregradelbo_locationscale_bijectors.jl") end
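For context, here is a minimal sketch of how the `ScoreGradELBO` objective introduced above could be used end to end. It mirrors the `RepGradELBO` example in the README and the configuration exercised in the new bijector tests (sticking-the-landing entropy with a mean-field Gaussian family); the `NormalLogNormal` model instance, step size, and iteration count are illustrative assumptions rather than part of this change.

```julia
using ADTypes, ForwardDiff
using AdvancedVI, Bijectors, Distributions, LinearAlgebra, LogDensityProblems, Optimisers

# Assumes the NormalLogNormal model and its Bijectors.bijector method from the
# README example have been defined; the parameter values here are arbitrary.
model = NormalLogNormal(randn(), exp(randn()), randn(10), Diagonal(exp.(randn(10)) .^ 2))

# Score-function gradient estimator of the ELBO with the sticking-the-landing
# entropy estimator, as exercised in the new ScoreGradELBO bijector tests.
n_montecarlo = 10
objective = ScoreGradELBO(n_montecarlo; entropy=StickingTheLandingEntropy())

# Mean-field Gaussian variational family, transformed to match the model's support.
d = LogDensityProblems.dimension(model)
q = MeanFieldGaussian(zeros(d), Diagonal(ones(d)))
binv = inverse(Bijectors.bijector(model))
q_transformed = Bijectors.TransformedDistribution(q, binv)

# Run inference; the call is the same as for RepGradELBO in the README example.
max_iter = 10^3
q_avg, _, stats, _ = AdvancedVI.optimize(
    model,
    objective,
    q_transformed,
    max_iter;
    adtype=ADTypes.AutoForwardDiff(),
    optimizer=Optimisers.Adam(1e-3),
)
```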