From bfba441acb9b710b45e67911255f40d9d1e3853e Mon Sep 17 00:00:00 2001
From: Marius Meyer <marius.meyer@uni-paderborn.de>
Date: Fri, 6 Sep 2024 13:59:42 +0200
Subject: [PATCH] Implement prepare_bitstream macro

---
 CHANGELOG                              |  1 +
 README.md                              | 16 +++--
 docs/src/examples/high_level_basics.md | 15 ++--
 docs/src/high_level.md                 |  9 +--
 examples/stream/README.md              |  2 +-
 examples/stream/stream_fpga.jl         | 11 +--
 src/XRT.jl                             |  2 +-
 src/hl_execution.jl                    | 99 +++++++++++++++-----------
 8 files changed, 93 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 056bea3..96912de 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Convert prepare_bitstream function to macro and fix signature generation
 - Minor changes in method names (adding ! to some methods because they change their input parameters)
 - Move wrapped API to XRTWrap submodule to support easier extension of core functionality
 - Use CMake module instead of OS cmake, full CMake workflow including install, add uuid link library
diff --git a/README.md b/README.md
index 2593b69..cdc7bb5 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ C++ API to allow kernel scheduling, bitstream analysis, and more via XRT directl
 
 This example executes a kernel on the FPGA that takes one buffer as output and
 two scalar values as input.
-The `prepare_bitstream` function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
+The `@prepare_bitstream` macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
 Buffer synchronization is handled automatically by XRT.jl.
 An example code for the execution of a kernel `dummyKernel` in the bitstream is given below:
 
@@ -19,12 +19,16 @@ using ArrayAllocators
 # Allocate an output array
 a = Array{UInt8}(MemAlign(4096),1)
 
-# Load the bitstream to the FPGA and generate functions 
-# for each kernel
-bs = XRT.prepare_bitstream("communication_PCIE.xclbin")
+# Generate functions for each kernel in
+# the bitstream. Do this in a separate module
+# for convenience
+module Bitstream
+    using XRT
+    @prepare_bitstream("communication_PCIE.xclbin")
+end
 
-# execute the dummyKernel kernel
-bs.dummyKernel!(a, UInt8(1),1)
+# Program the bitstream to a specific device and execute the kernel
+Bitstream.dummyKernel!(a, UInt8(1),1; device=XRT.Device(0))
 
 # validate the execution results
 @assert all(a .== UInt8(1))
diff --git a/docs/src/examples/high_level_basics.md b/docs/src/examples/high_level_basics.md
index 1db9663..dc6c43a 100644
--- a/docs/src/examples/high_level_basics.md
+++ b/docs/src/examples/high_level_basics.md
@@ -2,8 +2,10 @@
 
 This example executes a kernel on the FPGA that takes one buffer as output and
 two scalar values as input.
-The `prepare_bitstream` function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
+The `@prepare_bitstream` macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
 Buffer synchronization is handled automatically by XRT.jl.
+For more information refer to [High Level Abstractions for Kernel Executions](@ref).
+
 An example code for the execution of a kernel `dummyKernel` like this:
 
 ```C++
@@ -23,12 +25,15 @@ using ArrayAllocators
 # Allocate an output array
 a = Array{UInt8}(MemAlign(4096),1)
 
-# Load the bitstream to the FPGA and generate functions 
-# for each kernel
-bs = XRT.prepare_bitstream("communication_PCIE.xclbin")
+# Create a module that should contain the generated functions 
+# of the bitstream
+module Bitstream
+    using XRT
+    @prepare_bitstream("communication_PCIE.xclbin")
+end
 
 # execute the dummyKernel kernel
-bs.dummyKernel!(a, UInt8(1),1)
+Bitstream.dummyKernel!(a, UInt8(1),1)
 
 # validate the execution results
 @assert all(a .== UInt8(1))
diff --git a/docs/src/high_level.md b/docs/src/high_level.md
index 49dfe66..bebe999 100644
--- a/docs/src/high_level.md
+++ b/docs/src/high_level.md
@@ -1,12 +1,13 @@
 # High Level Abstractions for Kernel Executions
 
-Based on our [Custom XCLBIN Parser](@ref), XRT.jl provides a method to generate ready-to-use functions for
-the execution of individual kernels in the bitstream: `prepare_bitstream(path; device)`.
+Based on our [Custom XCLBIN Parser](@ref), XRT.jl provides a macro to generate ready-to-use functions for
+the execution of individual kernels in the bitstream: `@prepare_bitstream`.
 
-This function will create a new module with a function for each kernel in the provided bitstream and load the bitstream on
-an FPGA device.
+This macro will generate a function for each kernel in the provided bitstream in the module where it is invoked.
 Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an `AbstractArray`,
 it will be automatically copied to the FPGA memory before execution and back after execution.
+All generated functions come with a keyword parameter `device`, which can be used to specify the device the kernel should be executed on.
+If the bitstream is not already programmed on the device, this will be done automatically before executing the kernel.
 
 See [Example: Auto-generated Kernel Interfaces](@ref) and [Example: STREAM Benchmark](@ref) for examples, how this approach can be
 used to execute compute kernels on the FPGA.
diff --git a/examples/stream/README.md b/examples/stream/README.md
index 086e949..547da50 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -51,4 +51,4 @@ The output should look similar to this:
     Kernel Name: k2, CU Number: 1, Status: Shutdown
 
 Note, that the measured bandwidth is relatively low because software emulation is used.
-To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly by updating the `bitstream()` function.
\ No newline at end of file
+To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly in the `@prepare_bitstream` line.
\ No newline at end of file
diff --git a/examples/stream/stream_fpga.jl b/examples/stream/stream_fpga.jl
index 9408181..c5c5e8a 100644
--- a/examples/stream/stream_fpga.jl
+++ b/examples/stream/stream_fpga.jl
@@ -25,16 +25,19 @@ end
 
 # Load the bitstream to the FPGA and generate functions 
 # for each kernel
-@info "Upload bitstream and generate kernel functions"
-bs = XRT.prepare_bitstream(bitstream())
+@info "Generate kernel functions"
+module STREAMBitstream
+    using XRT
+    @prepare_bitstream("build_sw_emu/stream.xclbin")
+end
 
 # execute the stream kernel
 @info "Execute kernel test run" 
-bs.stream_calc!(a, b, c, 2.0, 16, 1)
+STREAMBitstream.stream_calc!(a, b, c, 2.0, 16, 1)
 c .= 0.0
 
 @info "Execute full kernel run TRIAD" 
-execution_time = @elapsed bs.stream_calc!(a, b, c, 2.0, array_size, 1)
+execution_time = @elapsed STREAMBitstream.stream_calc!(a, b, c, 2.0, array_size, 1)
 
 @info "Execution time: $execution_time seconds"
 total_data_moved_fpga = 3 * array_size * sizeof(eltype(a))
diff --git a/src/XRT.jl b/src/XRT.jl
index a733cb2..481be25 100644
--- a/src/XRT.jl
+++ b/src/XRT.jl
@@ -37,6 +37,6 @@ include("hl_execution.jl")
 
 export size, length, setindex!, getindex, convert, wait
 export sync!, group_id, set_arg!, start, stop, load_xclbin!
-export prepare_bitstream
+export @prepare_bitstream
 
 end
diff --git a/src/hl_execution.jl b/src/hl_execution.jl
index 7c2d11d..368eb9c 100644
--- a/src/hl_execution.jl
+++ b/src/hl_execution.jl
@@ -2,32 +2,41 @@
 """
 $(SIGNATURES)
 
- Load a bitstream to an FPGA and generate 
- interfaces for the included kernels.
- Returns a module with functions representing the 
- kernels of the bitstream.
+ Parse a bitstream and generate functions for the included kernels.
+ The functions will automatically copy all relevant buffers to the 
+ FPGA memory, and execute the Kernel.
 
- path: Path to the bitstream file (xclbin)
- device: XRT device to write the bitstream to
+ It is recommended to generate the kernel functions in a separate module
+ like this:
 
-"""
-function prepare_bitstream(path::String; device::XRT.Device=XRT.Device(1))
-    uuid = load_xclbin!(device, path)
-    j_data = XRT.get_kernel_info(path)
-    return eval(:(
-        module $(Symbol(first(split(basename(path),"."))))
+ ```Julia
+ module DummyBitstream
+    using XRT
+    @prepare_bitstream("my_bitstream.xclbin")
+ end
+ ```
+
+ Afterwards, you will find the functions for each kernel in the module.
+ To execute the kernel on a specific device, use the `device` keyword parameter:
 
-        using XRT
-        for jk in $j_data
-            eval(quote
+ ```Julia
+ DummyBitstream.kernel_name!(args...; device=XRT.Device(0))
+ ```
 
-            args = [a["name"] for a in $(jk["arguments"])]
-            arg_vector = [parse(Int, a["address_qualifier"]) == 1 for a in $(jk["arguments"])]
-            arg_ids = [parse(Int, a["id"]) for a in $(jk["arguments"])]
+"""
+macro prepare_bitstream(path::String)
+    j_data = XRT.get_kernel_info(path)
+    mod_funcs = Expr[]
+    for jk in j_data 
+        args = [Symbol(a["name"]) for a in jk["arguments"]]
+        arg_vector = [parse(Int, a["address_qualifier"]) == 1 for a in jk["arguments"]]
+        arg_ids = [parse(Int, a["id"]) for a in jk["arguments"]] 
+        fname = esc(Symbol(jk["name"],"!"))
+        f = quote
 
             """
             ```Julia
-            $($(jk["name"]))!($($(join([a["name"] for a in jk["arguments"]],", "))))
+            $($(jk["name"]))!($($(join([a["name"] for a in jk["arguments"]],", "))); device=XRT.Device(0))
             ```
 
             Execute a kernel on the FPGA using the provided arguments and HLS data types:
@@ -36,31 +45,39 @@ function prepare_bitstream(path::String; device::XRT.Device=XRT.Device(1))
 
             The provided data types are C data types. Matching Julia data types have to be used as inputs!
             """
-            function $(Symbol(jk["name"] * "!"))(args...)
-                final_args = []
+            function $(fname)($([esc(a) for a in args]...); device=XRT.Device(0))
+                uuid = load_xclbin!(device, $path)
+                kernel = XRT.Kernel(device, uuid, $(String(jk["name"])))
+                # Generate the code for buffer synchronization
+                # and the actual kernel execution
+                Expr(:block, $(begin                
+                    final_args = []
+                    exp = Expr[]
 
-                kernel = XRT.Kernel($($device), $($uuid), $(String(jk["name"])))
-                for (a, v, i) in zip(args, arg_vector, arg_ids)
-                    if v
-                        bo_array =  XRT.BOArray($($(device)), a, XRT.group_id(kernel, i))
-                        XRT.sync!(bo_array, XRT.XCL_BO_SYNC_BO_TO_DEVICE) 
-                        push!(final_args, bo_array)
-                    else
-                        push!(final_args, a)
+                    for (a, v, i) in zip(args, arg_vector, arg_ids)
+                        if v
+                            sym_bo = esc(Symbol("bo_array",i))
+                            push!(exp, :($sym_bo =  XRT.BOArray(device, $a, XRT.group_id(kernel, $i))))
+                            push!(exp, :(XRT.sync!($sym_bo, XRT.XCL_BO_SYNC_BO_TO_DEVICE)))
+                            push!(final_args, sym_bo)
+                        else
+                            push!(final_args, a)
+                        end
                     end
-                end
-                XRT.wait(XRT.Run(kernel, final_args...))
-                current_bo_id = 1
-                for (a, v, fa) in zip(args, arg_vector, final_args)
-                    if v
-                        XRT.sync!(fa, XRT.XCL_BO_SYNC_BO_FROM_DEVICE) 
-                        a[:] .= fa[:]
+                    push!(exp, :(XRT.wait(XRT.Run(kernel, $(final_args...)))))
+                    for (a, v, fa) in zip(args, arg_vector, final_args)
+                        if v
+                            push!(exp, :(XRT.sync!($fa, XRT.XCL_BO_SYNC_BO_FROM_DEVICE)))
+                            push!(exp, :($a[:] .= $fa[:]))
+                        end
                     end
-                end
-            end
-        end)
+                    exp
+                end...))
+                nothing
+            end 
+        end
+        push!(mod_funcs, f)
     end
-end
-    ))
+    Expr(:block, mod_funcs...)
 end