From 9fb699cb8bd974c61bf8d3b921786660eb004553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Fri, 15 Mar 2024 20:04:07 +0000 Subject: [PATCH] Add Hamiltonian Monte Carlo example Example included in talk given at EnzymeCon 2024. --- README.md | 3 +- docs/src/index.md | 3 +- examples/hmc.jl | 109 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 examples/hmc.jl diff --git a/README.md b/README.md index e8004a4..841af1d 100644 --- a/README.md +++ b/README.md @@ -25,5 +25,4 @@ Here is some material that you may find useful for learning more about Julia on * [Pluto notebook](https://giordano.github.io/blog/2023-07-20-julia-ipu/) of presentation given at Graphcore and at JuliaCon in July 2023 * Talk "[Julia meets the Intelligence Processing Unit](https://www.youtube.com/watch?v=-fxB0kmcCVE)" at JuliaCon 2023 -* [JuliaIpuDemo](https://github.com/JuliaIPU/JuliaIpuDemo), repository with instructions for running some Jupyter notebooks on Paperspace cloud. - This service offers also IPU time for free, these sessions are limited to 6 hours each and one at the time, but you can run as many as you want. 
+* Talk "[Automatic differentiation on the IPU with Enzyme.jl](https://giordano.github.io/talks/2024-03-27-julia-ipu-enzymecon/)" at [EnzymeCon 2024](https://enzyme.mit.edu/conference)
diff --git a/docs/src/index.md b/docs/src/index.md
index 4404aa1..f3be239 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -73,5 +73,4 @@ Here is some material that you may find useful for learning more about Julia on
 * [Pluto notebook](https://giordano.github.io/blog/2023-07-20-julia-ipu/) of presentation given at Graphcore and at JuliaCon in July 2023
 * Talk "[Julia meets the Intelligence Processing Unit](https://www.youtube.com/watch?v=-fxB0kmcCVE)" at JuliaCon 2023
-* [JuliaIpuDemo](https://github.com/JuliaIPU/JuliaIpuDemo), repository with instructions for running some Jupyter notebooks on Paperspace cloud.
-  This service offers also IPU time for free, these sessions are limited to 6 hours each and one at the time, but you can run as many as you want.
+* Talk "[Automatic differentiation on the IPU with Enzyme.jl](https://giordano.github.io/talks/2024-03-27-julia-ipu-enzymecon/)" at [EnzymeCon 2024](https://enzyme.mit.edu/conference)
diff --git a/examples/hmc.jl b/examples/hmc.jl
new file mode 100644
index 0000000..341f0f6
--- /dev/null
+++ b/examples/hmc.jl
@@ -0,0 +1,109 @@
+# Hamiltonian Monte Carlo sampling on the Graphcore IPU, with gradients
+# computed on-device by Enzyme.jl (example from the EnzymeCon 2024 talk).
+using IPUToolkit.IPUCompiler, IPUToolkit.Poplar
+using Enzyme
+
+# Keep the intermediate LLVM IR files produced by the IPU compiler, for inspection.
+IPUCompiler.KEEP_LLVM_FILES[] = true
+# Raise the host-sync timeout so a long-running codelet does not trip the Poplar runtime.
+ENV["POPLAR_RUNTIME_OPTIONS"] = """{"target.hostSyncTimeout":"30"}"""
+
+# Attach to an IPU device and build a Poplar graph for its target.
+device = Poplar.get_ipu_device()
+target = Poplar.DeviceGetTarget(device)
+graph = Poplar.Graph(target)
+
+num_tiles = Int(Poplar.TargetGetNumTiles(target))
+
+# Reverse-mode AD helper: accumulates the gradient of `f` at `x` into the
+# shadow vector `f′` (Enzyme `Duplicated` semantics: derivatives are *added*).
+∂!(f, x, f′) = autodiff_deferred(Reverse, f, Duplicated(x, f′))
+
+# Negative log-density of the 2-D target distribution (Rosenbrock-like shape).
+neg_log_density(q::AbstractVector{T}) where {T} = (q[1]^2 - q[2])^2 + (q[1]- one(T))^2 / T(100)
+
+# Note: both input and output must have exactly same type (including *all* parameters).
+# Compute ∇(-log π)(x) into f′ and return f′.
+function grad_neg_log_density!(f′::V, x::V) where {T,V<:AbstractVector{T}}
+    # The derivative is *added* to duplicated arguments by Enzyme, so we need
+    # to zero f′ before going on.
+    f′ .= zero(T)
+    ∂!(neg_log_density, x, f′)
+    return f′
+end
+
+# One leapfrog (velocity-Verlet) integration step of size dt for position q and
+# momentum p; f′ is gradient scratch space.
+function leapfrog!(q::AbstractVector{T}, p::AbstractVector{T}, f′::AbstractVector{T}, dt::T) where {T}
+    grad_neg_log_density!(f′, q)
+    p .-= (dt / 2) .* f′   # half-step momentum (was `dt ./ 2`: dt is a scalar, use `/` consistently)
+    q .+= dt .* p          # full-step position
+    grad_neg_log_density!(f′, q)
+    p .-= (dt / 2) .* f′   # half-step momentum
+end
+
+# One HMC transition starting from q: resample momentum, integrate n_step
+# leapfrog steps into q_proposed, then Metropolis accept/reject. On rejection
+# q_proposed is reset to q. Returns the acceptance probability.
+function sample_transition!(q_proposed::AbstractVector{T}, p::AbstractVector{T}, f′::AbstractVector{T}, q::AbstractVector{T}, dt::T, n_step) where {T}
+    # randn2! presumably fills p with standard-normal draws (IPU-side helper) — confirm.
+    randn2!(p)
+    h_init = sum(abs2, p) / 2 + neg_log_density(q)
+    q_proposed .= q
+    for step in UInt32(1):n_step
+        leapfrog!(q_proposed, p, f′, dt)
+    end
+    h_diff = h_init - (sum(abs2, p) / 2 + neg_log_density(q_proposed))
+    # Divergent trajectories produce NaN energy: always reject those.
+    # `zero(T)` instead of the `Int` literal `0` keeps the arithmetic in T.
+    accept_prob = isnan(h_diff) ? zero(T) : exp(min(zero(T), h_diff))
+    if rand(T) >= accept_prob
+        q_proposed .= q
+    end
+    return accept_prob
+end
+
+# Run n_sample HMC transitions, storing each state flattened into q_chain, and
+# return the mean acceptance probability.
+function sample_chain!(q_chain::AbstractVector{T}, buffer_q::AbstractVector{T}, p::AbstractVector{T}, f′::AbstractVector{T}, orig_q::AbstractVector{T}, n_sample, n_step, dt::T) where {T}
+    sum_accept_prob = zero(T)
+    buffer_q .= orig_q
+    for sample in UInt32(1):n_sample
+        accept_prob = sample_transition!(buffer_q, p, f′, orig_q, dt, n_step)
+        for idx in eachindex(buffer_q)
+            @inbounds q_chain[length(buffer_q) * (sample - 1) + idx] = buffer_q[idx]
+        end
+        # Fix: advance the chain state. Previously every transition restarted
+        # from the initial point, so no Markov chain was actually formed. On
+        # rejection buffer_q was reset to orig_q, so this copy is a no-op then.
+        orig_q .= buffer_q
+        sum_accept_prob += accept_prob
+    end
+    return sum_accept_prob / n_sample
+end
+
+# Sampler configuration, baked into the codelet as compile-time constants.
+n_sample = UInt32(10)  # samples per tile
+n_step = UInt32(10)    # leapfrog steps per transition
+dt = Float32(0.1)      # leapfrog step size
+
+@eval @codelet graph function HamiltonianMonteCarlo(
+        q_chain::VertexVector{Float32, InOut},
+        buffer_q::VertexVector{Float32, InOut},
+        p::VertexVector{Float32, InOut},
+        gradient::VertexVector{Float32, InOut},
+        orig_q::VertexVector{Float32, InOut},
+    )
+    sample_chain!(q_chain, buffer_q, p, gradient, orig_q, $(n_sample), $(n_step), $(dt))
+end
+
+# Random initial state: one independent 2-D chain per tile.
+orig_q = randn(Float32, 2 * num_tiles)
+
+orig_q_ipu =
Poplar.GraphAddVariable(graph, Poplar.FLOAT(), UInt64[length(orig_q)], "orig_q")
+# Upload the host-generated initial state and allocate the per-tile working
+# buffers (proposal, momentum, gradient scratch) on the IPU.
+copyto!(graph, orig_q_ipu, orig_q)
+buffer_q_ipu = similar(graph, orig_q, "buffer_q")
+p_ipu = similar(graph, orig_q, "p")
+gradient_ipu = similar(graph, orig_q, "gradient")
+# q_chain stores n_sample flattened states per tile; the host-side matrix has
+# one column per sample (2 coordinates per tile along each column).
+q_chain_ipu = Poplar.GraphAddVariable(graph, Poplar.FLOAT(), UInt64[length(orig_q) * n_sample], "q_chain")
+q_chain = Matrix{Float32}(undef, length(orig_q), n_sample)
+
+prog = Poplar.ProgramSequence()
+
+# Wire one HamiltonianMonteCarlo vertex onto every tile of the device.
+add_vertex(graph, prog, 0:(num_tiles - 1), HamiltonianMonteCarlo,
+           q_chain_ipu, buffer_q_ipu, p_ipu, gradient_ipu, orig_q_ipu)
+
+# Register a named host-read handle so the chain can be copied back after the run.
+Poplar.GraphCreateHostRead(graph, "q-chain-read", q_chain_ipu)
+
+# Disable instrumentation: this is a plain run, not a profiling session.
+flags = Poplar.OptionFlags()
+Poplar.OptionFlagsSet(flags, "debug.instrument", "false")
+
+# Compile the graph, execute it on the device, then read the samples back.
+engine = Poplar.Engine(graph, prog, flags)
+Poplar.EngineLoadAndRun(engine, device)
+Poplar.EngineReadTensor(engine, "q-chain-read", q_chain)
+
+# Release the IPU so other processes can attach to it.
+Poplar.detach_devices()
+
+# Optional host-side visualisation of one sample across all tiles (kept as a
+# comment so the example does not depend on Plots).
+#=
+
+using Plots
+
+sample = 10
+scatter(q_chain[1:2:end, sample], q_chain[2:2:end, sample]; xlims=(-3, 3), ylims=(-3, 6))
+
+=#