Implement prepare_bitstream macro

pc2 · Sep 6, 2024 · bfba441 · bfba441
1 parent d341808
commit bfba441
Showing 8 changed files with 93 additions and 62 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Convert prepare_bitstream function to macro and fix signature generation
 - Minor changes in method names (adding ! to some methods because they change their input parameters)
 - Move wrapped API to XRTWrap submodule to support easier extension of core functionality
 - Use CMake module instead of OS cmake, full CMake workflow including install, add uuid link library

diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ C++ API to allow kernel scheduling, bitstream analysis, and more via XRT directl
 
 This example executes a kernel on the FPGA that takes one buffer as output and
 two scalar values as input.
-The `prepare_bitstream` function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
+The `@prepare_bitstream` macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
 Buffer synchronization is handled automatically by XRT.jl.
 An example code for the execution of a kernel `dummyKernel` in the bitstream is given below:
 
@@ -19,12 +19,16 @@ using ArrayAllocators
 # Allocate an output array
 a = Array{UInt8}(MemAlign(4096),1)
 
-# Load the bitstream to the FPGA and generate functions 
-# for each kernel
-bs = XRT.prepare_bitstream("communication_PCIE.xclbin")
+# Generate functions for each kernel in
+# the bitstream. Do this in a separate module
+# for convenience
+module Bitstream
+    using XRT
+    @prepare_bitstream("communication_PCIE.xclbin")
+end
 
-# execute the dummyKernel kernel
-bs.dummyKernel!(a, UInt8(1),1)
+# Program the bitstream to a specific device and execute the kernel
+Bitstream.dummyKernel!(a, UInt8(1),1; device=XRT.Device(0))
 
 # validate the execution results
 @assert all(a .== UInt8(1))

diff --git a/docs/src/examples/high_level_basics.md b/docs/src/examples/high_level_basics.md
@@ -2,8 +2,10 @@
 
 This example executes a kernel on the FPGA that takes one buffer as output and
 two scalar values as input.
-The `prepare_bitstream` function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
+The `@prepare_bitstream` macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data.
 Buffer synchronization is handled automatically by XRT.jl.
+For more information refer to [High Level Abstractions for Kernel Executions](@ref).
+
 An example code for the execution of a kernel `dummyKernel` like this:
 
 ```C++
@@ -23,12 +25,15 @@ using ArrayAllocators
 # Allocate an output array
 a = Array{UInt8}(MemAlign(4096),1)
 
-# Load the bitstream to the FPGA and generate functions 
-# for each kernel
-bs = XRT.prepare_bitstream("communication_PCIE.xclbin")
+# Create a module that should contain the generated functions 
+# of the bitstream
+module Bitstream
+    using XRT
+    @prepare_bitstream("communication_PCIE.xclbin")
+end
 
 # execute the dummyKernel kernel
-bs.dummyKernel!(a, UInt8(1),1)
+Bitstream.dummyKernel!(a, UInt8(1),1)
 
 # validate the execution results
 @assert all(a .== UInt8(1))

diff --git a/docs/src/high_level.md b/docs/src/high_level.md
@@ -1,12 +1,13 @@
 # High Level Abstractions for Kernel Executions
 
-Based on our [Custom XCLBIN Parser](@ref), XRT.jl provides a method to generate ready-to-use functions for
-the execution of individual kernels in the bitstream: `prepare_bitstream(path; device)`.
+Based on our [Custom XCLBIN Parser](@ref), XRT.jl provides a macro to generate ready-to-use functions for
+the execution of individual kernels in the bitstream: `@prepare_bitstream`.
 
-This function will create a new module with a function for each kernel in the provided bitstream and load the bitstream on
-an FPGA device.
+This macro will generate a function for each kernel in the provided bitstream inside the module in which it is invoked.
 Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an `AbstractArray`,
 it will be automatically copied to the FPGA memory before execution and back after execution.
+All generated functions come with a keyword parameter `device`, which can be used to specify the device the kernel should be executed on.
+If the bitstream is not already programmed on the device, this will be done automatically before executing the kernel.
 
 See [Example: Auto-generated Kernel Interfaces](@ref) and [Example: STREAM Benchmark](@ref) for examples, how this approach can be
 used to execute compute kernels on the FPGA.

diff --git a/examples/stream/README.md b/examples/stream/README.md
@@ -51,4 +51,4 @@ The output should look similar to this:
     Kernel Name: k2, CU Number: 1, Status: Shutdown
 
 Note, that the measured bandwidth is relatively low because software emulation is used.
-To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly by updating the `bitstream()` function.
+To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly in the `@prepare_bitstream` line.
diff --git a/examples/stream/stream_fpga.jl b/examples/stream/stream_fpga.jl
@@ -25,16 +25,19 @@ end
 
 # Load the bitstream to the FPGA and generate functions 
 # for each kernel
-@info "Upload bitstream and generate kernel functions"
-bs = XRT.prepare_bitstream(bitstream())
+@info "Generate kernel functions"
+module STREAMBitstream
+    using XRT
+    @prepare_bitstream("build_sw_emu/stream.xclbin")
+end
 
 # execute the stream kernel
 @info "Execute kernel test run" 
-bs.stream_calc!(a, b, c, 2.0, 16, 1)
+STREAMBitstream.stream_calc!(a, b, c, 2.0, 16, 1)
 c .= 0.0
 
 @info "Execute full kernel run TRIAD" 
-execution_time = @elapsed bs.stream_calc!(a, b, c, 2.0, array_size, 1)
+execution_time = @elapsed STREAMBitstream.stream_calc!(a, b, c, 2.0, array_size, 1)
 
 @info "Execution time: $execution_time seconds"
 total_data_moved_fpga = 3 * array_size * sizeof(eltype(a))

diff --git a/src/XRT.jl b/src/XRT.jl
@@ -37,6 +37,6 @@ include("hl_execution.jl")
 
 export size, length, setindex!, getindex, convert, wait
 export sync!, group_id, set_arg!, start, stop, load_xclbin!
-export prepare_bitstream
+export @prepare_bitstream
 
 end
diff --git a/src/hl_execution.jl b/src/hl_execution.jl
@@ -2,32 +2,41 @@
 """
 $(SIGNATURES)
 
- Load a bitstream to an FPGA and generate 
- interfaces for the included kernels.
- Returns a module with functions representing the 
- kernels of the bitstream.
+ Parse a bitstream and generate functions for the included kernels.
+ The functions will automatically copy all relevant buffers to the 
+ FPGA memory, and execute the Kernel.
 
- path: Path to the bitstream file (xclbin)
- device: XRT device to write the bitstream to
+ It is recommended to generate the kernel functions in a separate module
+ like this:
 
-"""
-function prepare_bitstream(path::String; device::XRT.Device=XRT.Device(1))
-    uuid = load_xclbin!(device, path)
-    j_data = XRT.get_kernel_info(path)
-    return eval(:(
-        module $(Symbol(first(split(basename(path),"."))))
+ ```Julia
+ module DummyBitstream
+    using XRT
+    @prepare_bitstream("my_bitstream.xclbin")
+ end
+ ```
+
+ Afterwards, you find the functions for each kernel in the module.
+ To execute the kernel on a specific device, use the `device` keyword parameter:
 
-        using XRT
-        for jk in $j_data
-            eval(quote
+ ```Julia
+ DummyBitstream.kernel_name!(args...; device=XRT.Device(0))
+ ```
 
-            args = [a["name"] for a in $(jk["arguments"])]
-            arg_vector = [parse(Int, a["address_qualifier"]) == 1 for a in $(jk["arguments"])]
-            arg_ids = [parse(Int, a["id"]) for a in $(jk["arguments"])]
+"""
+macro prepare_bitstream(path::String)
+    j_data = XRT.get_kernel_info(path)
+    mod_funcs = Expr[]
+    for jk in j_data 
+        args = [Symbol(a["name"]) for a in jk["arguments"]]
+        arg_vector = [parse(Int, a["address_qualifier"]) == 1 for a in jk["arguments"]]
+        arg_ids = [parse(Int, a["id"]) for a in jk["arguments"]] 
+        fname = esc(Symbol(jk["name"],"!"))
+        f = quote
 
             """
             ```Julia
-            $($(jk["name"]))!($($(join([a["name"] for a in jk["arguments"]],", "))))
+            $($(jk["name"]))!($($(join([a["name"] for a in jk["arguments"]],", "))); device=XRT.Device(0))
             ```
 
             Execute a kernel on the FPGA using the provided arguments and HLS data types:
@@ -36,31 +45,39 @@ function prepare_bitstream(path::String; device::XRT.Device=XRT.Device(1))
 
             The provided data types are C data types. Matching Julia data types have to be used as inputs!
             """
-            function $(Symbol(jk["name"] * "!"))(args...)
-                final_args = []
+            function $(fname)($([esc(a) for a in args]...); device=XRT.Device(0))
+                uuid = load_xclbin!(device, $path)
+                kernel = XRT.Kernel(device, uuid, $(String(jk["name"])))
+                # Generate the code for buffer synchronization
+                # and the actual kernel execution
+                Expr(:block, $(begin                
+                    final_args = []
+                    exp = Expr[]
 
-                kernel = XRT.Kernel($($device), $($uuid), $(String(jk["name"])))
-                for (a, v, i) in zip(args, arg_vector, arg_ids)
-                    if v
-                        bo_array =  XRT.BOArray($($(device)), a, XRT.group_id(kernel, i))
-                        XRT.sync!(bo_array, XRT.XCL_BO_SYNC_BO_TO_DEVICE) 
-                        push!(final_args, bo_array)
-                    else
-                        push!(final_args, a)
+                    for (a, v, i) in zip(args, arg_vector, arg_ids)
+                        if v
+                            sym_bo = esc(Symbol("bo_array",i))
+                            push!(exp, :($sym_bo =  XRT.BOArray(device, $a, XRT.group_id(kernel, $i))))
+                            push!(exp, :(XRT.sync!($sym_bo, XRT.XCL_BO_SYNC_BO_TO_DEVICE)))
+                            push!(final_args, sym_bo)
+                        else
+                            push!(final_args, a)
+                        end
                     end
-                end
-                XRT.wait(XRT.Run(kernel, final_args...))
-                current_bo_id = 1
-                for (a, v, fa) in zip(args, arg_vector, final_args)
-                    if v
-                        XRT.sync!(fa, XRT.XCL_BO_SYNC_BO_FROM_DEVICE) 
-                        a[:] .= fa[:]
+                    push!(exp, :(XRT.wait(XRT.Run(kernel, $(final_args...)))))
+                    for (a, v, fa) in zip(args, arg_vector, final_args)
+                        if v
+                            push!(exp, :(XRT.sync!($fa, XRT.XCL_BO_SYNC_BO_FROM_DEVICE)))
+                            push!(exp, :($a[:] .= $fa[:]))
+                        end
                     end
-                end
-            end
-        end)
+                    exp
+                end...))
+                nothing
+            end 
+        end
+        push!(mod_funcs, f)
     end
-end
-    ))
+    Expr(:block, mod_funcs...)
 end