From f0d8f58d52b15e896da07f8f6e6a616694b248fc Mon Sep 17 00:00:00 2001 From: Soren Soe <2106410+stsoe@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:05:31 -0800 Subject: [PATCH] WIP: Add runner for run-recipe (#8608) A run-recipe defines how to execute a graph model using XRT. These changes contain a stand-alone runner class that reads and executes a run-recipe json file. The idea is to have tools generate the run-recipe along with the xclbin and control code for kernels. The format (schema) of the recipe json is loosely defined. The implementation of the runner drove some of the definition of the json format. A run-recipe is associated with exactly one xclbin which, when loaded into a region (partition) on the device, can run the recipe. This is work-in-progress and is base-lined and validated for today's NPU TXN control code. The runner will change shortly when we obsolete xclbin for NPU. The recipe will not work with Alveo as the runner uses xrt::runlist for execution and xrt::runlist is not supported on Alveo. Signed-off-by: Soren Soe <2106410+stsoe@users.noreply.github.com> --- src/.clang-tidy | 3 +- src/runtime_src/core/common/CMakeLists.txt | 3 + .../core/common/runner/CMakeLists.txt | 12 + src/runtime_src/core/common/runner/README.md | 561 +++++++++ src/runtime_src/core/common/runner/cpu.cpp | 193 +++ src/runtime_src/core/common/runner/cpu.h | 64 + src/runtime_src/core/common/runner/runner.cpp | 1086 +++++++++++++++++ src/runtime_src/core/common/runner/runner.h | 153 +++ .../core/common/runner/test/.gitignore | 2 + .../core/common/runner/test/CMakeLists.txt | 32 + .../core/common/runner/test/README.md | 74 ++ .../core/common/runner/test/cpulib.cpp | 95 ++ .../core/common/runner/test/recipe.cpp | 38 + .../core/common/runner/test/recipe.json | 83 ++ .../core/common/runner/test/runner.cpp | 208 ++++ .../core/common/runner/test/tcpu.cpp | 42 + 16 files changed, 2648 insertions(+), 1 deletion(-) create mode 100644 src/runtime_src/core/common/runner/CMakeLists.txt create mode 100644 src/runtime_src/core/common/runner/README.md create mode 100644 src/runtime_src/core/common/runner/cpu.cpp create mode 100644 src/runtime_src/core/common/runner/cpu.h create mode 100644 src/runtime_src/core/common/runner/runner.cpp create mode 100644 src/runtime_src/core/common/runner/runner.h create mode 100644 src/runtime_src/core/common/runner/test/.gitignore create mode 100644 src/runtime_src/core/common/runner/test/CMakeLists.txt create mode 100644 src/runtime_src/core/common/runner/test/README.md create mode 100644 src/runtime_src/core/common/runner/test/cpulib.cpp create mode 100644 src/runtime_src/core/common/runner/test/recipe.cpp create mode 100644 src/runtime_src/core/common/runner/test/recipe.json create mode 100644 src/runtime_src/core/common/runner/test/runner.cpp create mode 100644 src/runtime_src/core/common/runner/test/tcpu.cpp diff --git a/src/.clang-tidy b/src/.clang-tidy index c9a72490838..5729706e1d3 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -30,7 +30,6 @@ modernize-*, WarningsAsErrors: '' HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false FormatStyle: none User: sonals CheckOptions: @@ -124,6 +123,8 @@ CheckOptions: value: llvm - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField value: '0' + - key: cppcoreguidelines-avoid-non-const-global-variables.AllowInternalLinkage + value: '1' - key: cppcoreguidelines-avoid-magic-numbers.IgnoredFloatingPointValues value: '1.0;100.0;' - key: cppcoreguidelines-avoid-magic-numbers.IgnoredIntegerValues diff --git
a/src/runtime_src/core/common/CMakeLists.txt b/src/runtime_src/core/common/CMakeLists.txt index 242e6e19efa..2455180f1e9 100644 --- a/src/runtime_src/core/common/CMakeLists.txt +++ b/src/runtime_src/core/common/CMakeLists.txt @@ -3,6 +3,7 @@ # Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. add_subdirectory(api) add_subdirectory(xdp) +add_subdirectory(runner) if(CMAKE_VERSION VERSION_LESS "3.18.0") message(WARNING "CMake version is less than 3.18.0, build of submodule aiebu disabled") @@ -62,12 +63,14 @@ target_include_directories(core_common_objects add_library(xrt_coreutil SHARED $ + $ $ $ ) add_library(xrt_coreutil_static STATIC $ + $ $ $ ) diff --git a/src/runtime_src/core/common/runner/CMakeLists.txt b/src/runtime_src/core/common/runner/CMakeLists.txt new file mode 100644 index 00000000000..68d0197b31c --- /dev/null +++ b/src/runtime_src/core/common/runner/CMakeLists.txt @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +add_library(core_common_runner_objects OBJECT + runner.cpp + cpu.cpp + ) + +target_include_directories(core_common_runner_objects + PRIVATE + ${XRT_SOURCE_DIR}/runtime_src + ) + diff --git a/src/runtime_src/core/common/runner/README.md b/src/runtime_src/core/common/runner/README.md new file mode 100644 index 00000000000..27392798503 --- /dev/null +++ b/src/runtime_src/core/common/runner/README.md @@ -0,0 +1,561 @@ + + +# Run recipe for XRT + +A run-recipe defines how to execute a graph model using XRT. + +This directory contains a stand-alone `xrt::runner` class that reads and +executes a run-recipe json file. The idea is to have tools, e.g. VAIML, +generate the run-recipe along with the xclbin and control code for kernels. + +The format (schema) of the recipe json is loosely defined. The +implementation of the runner drove some of the definition of the json +format. + +A run-recipe is associated with exactly one xclbin which, when loaded into +a region (partition) on the device, can run the recipe. + +# JSON format + +There are three sections in the run-recipe. + +1. [header](#header) +2. [resources](#resources) +3. [execution](#execution) + +The `header` trivially contains the path (full name) of the xclbin that should +be loaded before resources can be created or the recipe can be executed. + +The `resources` section defines all buffer objects, kernel objects, +and cpu function objects used to execute the recipe. The resources are +created as the run recipe is loaded. External input and output buffers +may be bound later during the execution stage of the recipe. + +The `execution` section defines how the resources are connected +together during execution. It simply executes kernels and cpu +functions that were previously defined in the resource section with +arguments that were also defined in the resource section. Execution +of kernels can consume partial buffer input and produce partial buffer +output per the `size` and `offset` fields defined as part of specifying the +kernel arguments. + +## Header + +For the time being, the header stores nothing but the path to the +xclbin. The xclbin contains the kernel meta data used by XRT when +xrt::kernel objects are created. The xclbin contains PDIs for each +kernel; the PDIs are loaded by firmware prior to running a kernel. + +The header section can be amended with other meta data as needed. + +``` +{ + "header": { + "xclbin_path": "design.xclbin" + }, + + ...
+} +``` + +The runner will use the xclbin from the `header` section to create an +xrt::hw_context, which is subsequently used to create xrt::kernel +objects. + +## Resources + +The resources section is a complete list of all objects that are used +when the recipe is executed. Each kernel used in the `execution` +section must be listed in the resources section. All kernel argument +buffers used by kernels in the `execution` section must be listed in +the resources section. All functions executed on the CPU must also +be listed in the resources section. + +### Kernel functions + +Kernels listed in the resources section result in the runner creating +`xrt::kernel` objects. In XRT, the kernel objects are identified by +name, which must match a kernel name in the xclbin. + +Kernels are constructed from the xclbin kernel name by specifying which +xrt::hw_context should execute the kernel and what control code the +kernel should execute. The hardware context is created by the runner +from the xclbin specified in the recipe `header` section, so kernels +in the resources section must contain just the xclbin kernel name +and the full path to an ELF with the control code. + +``` + "resources": { + "kernels": [ + { + "name": "k1", + "xclbin_kernel_name": "DPU", + "ctrlcode": "no-ctrl-packet.elf" + } + ] + }, +``` + +The name of the kernel in the resources section must be unique in the list +of kernel instances; the name is used in the `execution` section to refer +to which instance should be executed. + +If a kernel is instantiated from the same xclbin kernel name and same +control code, then only one such kernel instance needs to be listed in +the resources section. Listing multiple kernel instances referring to +the same xclbin kernel and using the same control code is not an error, +but it is not necessary. + +### CPU functions + +Functions to be executed on the CPU are listed in the resource section +along with a path to a library containing the individual function. +The library is loaded at run time (dlopen); it exposes its functions +through a lookup method, which is itself obtained through a library +entry (extern "C") function. + +CPU function arguments are expected to be `xrt::bo` objects; for +example, a format converting function will take an input buffer and +populate an output buffer, and both buffers must be specified in the +resource buffer section of the recipe. + +A library path is relative to the install location of XRT, determined +from the environment variable `XILINX_XRT` or from its inferred location if +not set. On Windows, the inferred location is the driver store. + +``` + "resources": { + "cpus": [ + { + "name": "convert_ifm", + "library_path": "umd/convert.dll" + }, + { + "name": "convert_ofm", + "library_path": "umd/convert.dll" + }, + { + "name": "average_pool", + "library_path": "umd/operators.dll" + } + ] + }, +``` + +### Buffers + +The buffer instances listed in the resources section refer to +`xrt::bo` objects that are used during execution of kernels. The +buffers can be graph inputs or outputs, which refer to application-created +input and output tensors, or they can be internal buffers used +during execution of the compiled graph at the discretion of the +compiler (VAIML). + +#### External buffers (graph input and output) + +External buffers (input and output) are created by the framework / +application outside of the runner and bound to the recipe during +execution.
The runner itself does not create `xrt::bo` objects for +external buffers, but relies on the framework to bind these buffers +to the runner object created from the recipe. The external buffers must +still be listed in the resources section and specify a name that can +be used when execution sets kernel arguments. + +``` + "resources": { + "buffers": [ + { + "name": "wts", + "type": "input" + }, + { + "name": "ifm", + "type": "input" + }, + { + "name": "ofm", + "type": "output" + } + ] + } + +``` + +The `name` of the buffers in the resources section must be unique. +The name is used in the `execution` section to refer to kernel or cpu +buffer arguments. + + + +#### Internal buffers + +Internal buffers are created and managed by the runner. These are +buffers that are used internally within a graph to carry data from one +kernel or cpu execution to another. + +Because these buffers are created and managed by the runner, unlike the +external buffers, the size of an internal buffer must be specified +in the recipe. + +``` + "resources": { + "buffers": [ + { + "name": "ifm_int", + "type": "internal", + "size": "1024" + }, + { + "name": "ofm_int", + "type": "internal", + "size": "1024" + }, + { + "name": "b0", + "type": "internal", + "size": "1024" + }, + { + "name": "b1", + "type": "internal", + "size": "1024" + }, + { + "name": "b2", + "type": "internal", + "size": "1024" + } + ] + } + +``` +The `size` is currently specified in bytes; support for +K/M suffixes could be added, e.g. `1048576 = 1024K = 1M` + +## Execution + +The execution section is an ordered list of xrt::kernel or cpu runs +with arguments from the resources section. + +Before the runner can execute the recipe in the execution section, all +graph inputs and outputs must be bound to the recipe. As mentioned +earlier, external inputs and outputs are defined by the framework that +uses the runner. Typically these external inputs and outputs are not +available at the time when the runner is initialized from the recipe +json. In other words, the runner can be created even before the +framework has created input and output tensors, but of course it cannot +be executed until the inputs and outputs are defined. The runner +API has methods that must be called to bind the external inputs and +outputs. + +Arguments to a run can be a sub-buffer of the corresponding +resource. A buffer in the resources section refers to the full buffer, +but a run can use just a portion of the resource. By default +a run argument will use the full buffer, but optional attributes in +the json for a buffer can specify the size and an offset into the +resource buffer. + +In the example below, the kernel resource `k1` is executed twice with +3 arguments. The 3rd argument is a sub-buffer of the `ifm_int` resource, the +4th is the full resource `wts`, and the 5th is a +sub-buffer of `ofm_int`. + +The example also illustrates calling a CPU function from the `cpus` +resources section. The CPU function calls are passed buffers from the +resources section and scalar values as needed.
+ +``` + "execution": { + "runs": [ + { + "name": "convert_ifm", + "where": "cpu", + "arguments" : [ + { "name": "ifm", "argidx": 0 }, + { "name": "ifm_int", "argidx": 1 } + ], + "constants" : [ + { "value": "nchw2nchw4c", "type": "string", "argidx": 2 } + ] + }, + { + "name": "k1", + "arguments" : [ + { "name": "ifm_int", "size": 512, "offset": 0, "argidx": 3 }, + { "name": "wts", "argidx": 4 }, + { "name": "ofm_int", "size": 512, "offset": 512, "argidx": 5 } + ] + }, + { + "name": "k1", + "arguments" : [ + { "name": "ifm_int", "size": 512, "offset": 512, "argidx": 3 }, + { "name": "wts", "argidx": 4 }, + { "name": "ofm_int", "size": 512, "offset": 0, "argidx": 5 } + ] + }, + { + "name": "convert_ofm", + "where": "cpu", + "arguments" : [ + { "name": "ofm_int", "argidx": 0 }, + { "name": "ofm", "argidx": 1 } + ], + "constants" : [ + { "value": "nchw4c2nchw", "type": "string", "argidx": 2 } + ] + }, + ... + ] + } +``` + +The runner internally creates sub-buffers out of the specified +resource buffers for each run. Both external and internal +resource buffers can be sliced and diced as required. + +The runner creates `xrt::run` or `xrt_core::cpu::run` objects out of +the specified execution runs. The runner creates a CPU or NPU runlist +for each contiguous sequence of CPU runs or NPU runs specified in the +run recipe. The runlist is inserted into a vector of runlists where +each individual runlist will be executed in sequence when the +framework calls the runner API execute method. + +In addition to the buffer arguments referring to resource buffers, the +xclbin kernels and cpu functions may have additional arguments that +need to be set. For example, the current DPU kernel has 8 arguments, +and some of these must be set to a sentinel value. Here the +argument with index 0 represents the kernel opcode, which specifies +the type of control packet used for the kernel resource object. The +value `3` implies a transaction buffer. + +``` + "execution": { + "runs": [ + { + "name": "k1", + "arguments" : [ + { "name": "wts", "argidx": 4 }, + { "name": "ifm", "argidx": 3 }, + { "name": "ofm", "argidx": 5 } + ], + "constants" : [ + { "value": "3", "type": "int", "argidx": 0 }, + { "value": "0", "type": "int", "argidx": 1 }, + { "value": "0", "type": "int", "argidx": 2 }, + { "value": "0", "type": "int", "argidx": 6 }, + { "value": "0", "type": "int", "argidx": 7 } + ] + } + ] + } +``` + +# Complete run recipe + +For illustration, here is a simple complete run-recipe.json file that +has been validated on NPU. There are no internal buffers, and the external +input and output are consumed during one kernel execution. See +`runner/test/recipe.json` for an example leveraging cpu functions.
+ +``` +{ + "header": { + "xclbin_path": "design.xclbin" + }, + "resources": { + "buffers": [ + { + "name": "wts", + "type": "input" + }, + { + "name": "ifm", + "type": "input" + }, + { + "name": "ofm", + "type": "output" + } + ], + "kernels": [ + { + "name": "k1", + "xclbin_kernel_name": "DPU", + "ctrlcode": "no-ctrl-packet.elf" + } + ] + }, + "execution": { + "runs": [ + { + "name": "k1", + "arguments" : [ + { "name": "wts", "argidx": 4 }, + { "name": "ifm", "argidx": 3 }, + { "name": "ofm", "argidx": 5 } + ], + "constants": [ + { "value": "3", "type": "int", "argidx": 0 }, + { "value": "0", "type": "int", "argidx": 1 }, + { "value": "0", "type": "int", "argidx": 2 }, + { "value": "0", "type": "int", "argidx": 6 }, + { "value": "0", "type": "int", "argidx": 7 } + ] + } + ] + } +} +``` + +# Runner API + +The runner is constructed from a recipe json file and a device object. +The runner is a standard XRT C++ first-class object with the following +API. Inline documentation will be beefed up when the runner code is +moved to public XRT. + +``` +class runner_impl; +class runner +{ + std::shared_ptr<runner_impl> m_impl; // probably unique_ptr is enough +public: + // ctor - Create runner from a recipe json + runner(const xrt::device& device, const std::string& recipe); + + // bind_input() - Bind a buffer object to an input tensor + void + bind_input(const std::string& name, const xrt::bo& bo); + + // bind_output() - Bind a buffer object to an output tensor + void + bind_output(const std::string& name, const xrt::bo& bo); + + // execute() - Execute the runner + void + execute(); + + // wait() - Wait for the execution to complete + void + wait(); +}; +``` + +# CPU library requirements + +The run recipe can refer to functions executed on the CPU. These +functions should be implemented in a shared library that can be +loaded at runtime by the runner based on the `resources/cpus` section. + +A referenced library is loaded by the runner, which subsequently looks +for an exported entry point (symbol) called `library_init` to initialize the +shared library. The `library_init()` function returns function objects for +callback functions within the library. At present, only one callback function +is required: the `lookup()` function, which the runner +uses to look up functions referenced in the recipe resources section. + +The `lookup()` function must return the callable function that the +runner is requesting along with the number of arguments this function +expects. If the function the runner is looking for is not available, +then the `lookup()` function should throw an exception (TODO: define +the exact exception to throw). The reason the `lookup()` function is +not itself an exported "extern C" function like `library_init()` is that the +call semantics must be C++ with the bells and whistles that follow +(exceptions). + +The signature of the `extern "C"` exported `library_init()` function and the +C++ signature of the `lookup()` function are defined in `xrt_runner.h` +under `namespace xrt::cpu { ... }`. + +``` +/** + * The xrt::runner supports execution of CPU functions as well + * as xrt::kernel objects. + * + * The CPU functions are implemented in runtime loaded dynamic + * libraries. A library must define and export a function that + * initializes a callback structure with a lookup function.
+ * + * The signature of the lookup function must be + * @code + * void lookup_fn(const std::string& name, xrt::cpu::lookup_args* args) + * @endcode + * where the name is the name of the function to lookup and args is a + * structure that the lookup function must populate with the function + * information. + * + * The arguments to the CPU functions are type erased via std::any and + * the signature of the CPU functions is fixed to + * @code + * void cpu_function(std::vector<std::any>& args) + * @endcode + * Internally, the CPU library unwraps the arguments and calls the + * actual function. + */ +namespace xrt::cpu { +/** + * struct lookup_args - argument structure for the lookup function + * + * The lookup function takes as arguments the name of the function + * to lookup along with lookup_args to be populated with information + * about the function. + * + * @num_args - number of arguments to function + * @callable - a C++ function object wrapping the function + * + * The callable library functions use type erasure on their arguments + * through a std::vector of std::any objects. The callable must + * unwrap the std::any objects to their expected types, which is + * cumbersome, but type safe. The type erased arguments allow the + * runner to be generic and not tied to a specific function signature. +*/ +struct lookup_args +{ + std::uint32_t num_args; + std::function<void(std::vector<std::any>&)> callable; +}; + +/** + * struct library_init_args - argument structure for library initialization + * + * The library initialization function is the only function exported + * from the run time loaded library. The library initialization + * function is called by the runner when a resource references a + * function in a library and the library is not already loaded. + * + * @lookup_fn - a callback function to be populated with the + * lookup function. + * + * The library initialization function is a C callable exported symbol, + * but returns a C++ function pointer to the lookup function. +*/ +struct library_init_args +{ + std::function<void(const std::string&, lookup_args*)> lookup_fn; +}; + +/** + * library_init_fn - type of the library initialization function + * The name of the library initialization function is fixed to + * "library_init". +*/ +using library_init_fn = void (*)(library_init_args*); +} // xrt::cpu + +``` + +A unit test for the cpu library and a corresponding sample run recipe +that references the cpu library are under `test/cpulib.cpp` and +`test/recipe.json` + + + + + diff --git a/src/runtime_src/core/common/runner/cpu.cpp b/src/runtime_src/core/common/runner/cpu.cpp new file mode 100644 index 00000000000..c933804e0cd --- /dev/null +++ b/src/runtime_src/core/common/runner/cpu.cpp @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
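+//
+// cpu.cpp - CPU function support for the xrt::runner. A referenced
+// library is loaded at run time (dlopen), its exported library_init()
+// entry point provides the lookup callback, and looked-up functions
+// are cached. The function and run classes below are thin facades
+// used by the runner.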
+#define XCL_DRIVER_DLL_EXPORT // in same dll as exported xrt apis +#define XRT_CORE_COMMON_SOURCE // in same dll as coreutil +#define XRT_API_SOURCE // in same dll as coreutil + +//#define XRT_VERBOSE +#include "cpu.h" + +#include "core/common/debug.h" +#include "core/common/dlfcn.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using lookup_args = xrt_core::cpu::lookup_args; +using library_init_args = xrt_core::cpu::library_init_args; +using library_init_fn = xrt_core::cpu::library_init_fn; + +// struct dllwrap - wrapper class to manange the lifetime of a loaded library +struct dllwrap +{ + using dll_guard = std::unique_ptr; + dll_guard dll; + + explicit dllwrap(const std::filesystem::path& path) + : dll{xrt_core::dlopen(path.string().c_str(), RTLD_NOW | RTLD_GLOBAL), xrt_core::dlclose} + { + if (!dll) + throw std::runtime_error("Failed to open " + path.string() + ": " + xrt_core::dlerror()); + + XRT_DEBUGF("dllwrap::dllwrap(%s) loaded\n", path.c_str()); + } +}; + +// Control the order of destruction of static objects. In particular +// the dlls cannot be unloaded before the library init args have been +// destroyed +static std::map s_library_handles; // NOLINT +static std::map s_function_map; // NOLINT +static std::map s_library_callbacks; // NOLINT +static std::mutex s_mutex; // NOLINT + +static std::filesystem::path +adjust_path(std::filesystem::path path) +{ +#ifdef _WIN32 + std::filesystem::path fn = path.filename(); + fn += ".dll"; +#else + std::filesystem::path fn = "lib"; + fn += path.filename(); + fn += ".so"; +#endif + return path.replace_filename(fn); +} + +static void* +open_library(std::filesystem::path dll) +{ + std::lock_guard lock(s_mutex); + if (auto it = s_library_handles.find(dll); it != s_library_handles.end()) + return it->second.dll.get(); + + auto [it, inserted] = s_library_handles.emplace(dll, dllwrap{dll}); + return it->second.dll.get(); +} + +static const lookup_args* +lookup(const std::string& lname, const std::string& fname) +{ + XRT_DEBUGF("lookup(%s, %s)\n", lname.c_str(), fname.c_str()); + + // Check if the function is already loaded + std::lock_guard lock(s_mutex); + if (auto it = s_function_map.find(fname); it != s_function_map.end()) + return &it->second; + + // Check if the library is not already loaded in which case load and + // initialize the library to get the callback functions + auto cb_itr = s_library_callbacks.find(lname); + if (cb_itr == s_library_callbacks.end()) { // load and initialize + auto lhdl = open_library(adjust_path(lname)); + auto sym = xrt_core::dlsym(lhdl, "library_init"); + auto init = reinterpret_cast(sym); + library_init_args args; + init(&args); + std::tie(cb_itr, std::ignore) = s_library_callbacks.emplace(lname, std::move(args)); + } + + // Use lookup callback function to get the function information, which + // is cached for future reference + auto& cb = cb_itr->second; + lookup_args args; + cb.lookup_fn(fname, &args); + auto [fitr, emplaced] = s_function_map.emplace(fname, std::move(args)); + return &fitr->second; +} + +} // namespace + +namespace xrt_core::cpu { + +class function_impl +{ + const lookup_args* m_fcn_info; +public: + function_impl(const std::string& name, const std::string& libname) + : m_fcn_info{lookup(libname, name)} + {} + + uint32_t + get_number_of_args() const + { + return m_fcn_info->num_args; + } + + void + call(std::vector& args) const + { + m_fcn_info->callable(args); + } +}; + +// class run - Facade for exexcuting functions within a library on 
the CPU +// +// Provides interface for run-time loading of a library with functions +// to be executed on the CPU by the xrt::runner class. +class run_impl +{ + std::shared_ptr m_fn; + std::vector m_args; + +public: + explicit run_impl(std::shared_ptr fn) + : m_fn{std::move(fn)} + , m_args(m_fn->get_number_of_args()) // cannot be initializer list + {} + + void + set_arg(int argidx, std::any value) + { + m_args.at(argidx) = std::move(value); + } + + void + execute() + { + // Call the function + m_fn->call(m_args); + } +}; + +//////////////////////////////////////////////////////////////// +function:: +function(const std::string& fname, const std::string& lname) + : m_impl(std::make_shared(fname, lname)) +{} + +function:: +~function() = default; + +run:: +run(const function& f) + : m_impl{std::make_shared(f.get_handle())} +{} + +run:: +~run() = default; + +void +run:: +set_arg(int argidx, const std::any& value) +{ + m_impl->set_arg(argidx, value); +} + +void +run:: +execute() +{ + m_impl->execute(); +} + +} // namespace xrt_core::cpu diff --git a/src/runtime_src/core/common/runner/cpu.h b/src/runtime_src/core/common/runner/cpu.h new file mode 100644 index 00000000000..d66389da250 --- /dev/null +++ b/src/runtime_src/core/common/runner/cpu.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +#ifndef XRT_COMMON_RUNNER_CPU_H_ +#define XRT_COMMON_RUNNER_CPU_H_ +#include "core/common/config.h" +#include "runner.h" + +#include +#include +#include +#include +#include +#include + +namespace xrt_core::cpu { + +// class function - Manage a CPU function within a library +// +// Functions are created by the xrt::runner class as part of +// initializing resources specified in a run-recipe json. +class function_impl; +class function +{ + std::shared_ptr m_impl; +public: + XRT_CORE_COMMON_EXPORT + function(const std::string& fcn, const std::string& libname); + + XRT_CORE_COMMON_EXPORT + ~function(); + + std::shared_ptr + get_handle() const + { + return m_impl; + } +}; + +// class run - Manage execution of a CPU function +// +// A run object is created by the xrt::runner class to bind arguments +// specified in run-recipe json to the function and execute it. +class run_impl; +class run +{ + std::shared_ptr m_impl; + public: + XRT_CORE_COMMON_EXPORT + explicit run(const function&); + + XRT_CORE_COMMON_EXPORT + ~run(); + + XRT_CORE_COMMON_EXPORT + void + set_arg(int argidx, const std::any& value); + + XRT_CORE_COMMON_EXPORT + void + execute(); +}; // run + +} // xrt_core::cpu +#endif diff --git a/src/runtime_src/core/common/runner/runner.cpp b/src/runtime_src/core/common/runner/runner.cpp new file mode 100644 index 00000000000..fc49820672b --- /dev/null +++ b/src/runtime_src/core/common/runner/runner.cpp @@ -0,0 +1,1086 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
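+//
+// runner.cpp - implementation of the xrt_core::runner recipe executor.
+// The recipe json is parsed into header, resources, and execution
+// sections; resources create the xrt::hw_context, xrt::kernel, xrt::bo,
+// and cpu::function objects, and execution partitions the runs into
+// CPU and NPU runlists that are executed through an xrt::queue.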
+#define XCL_DRIVER_DLL_EXPORT // in same dll as exported xrt apis +#define XRT_CORE_COMMON_SOURCE // in same dll as coreutil +#define XRT_API_SOURCE // in same dll as coreutil + +#define XRT_VERBOSE +#include "runner.h" +#include "cpu.h" + +#include "core/common/debug.h" +#include "core/common/dlfcn.h" +#include "core/common/error.h" +#include "core/common/module_loader.h" +#include "core/include/xrt/xrt_bo.h" +#include "core/include/xrt/xrt_device.h" +#include "core/include/xrt/xrt_hw_context.h" +#include "core/include/xrt/xrt_kernel.h" +#include "core/include/experimental/xrt_elf.h" +#include "core/include/experimental/xrt_ext.h" +#include "core/include/experimental/xrt_kernel.h" +#include "core/include/experimental/xrt_module.h" +#include "core/include/experimental/xrt_queue.h" +#include "core/include/experimental/xrt_xclbin.h" + +#ifdef _WIN32 +# pragma warning (push) +# pragma warning (disable: 4702) +#endif +#include "boost/property_tree/json_parser.hpp" +#include "boost/property_tree/ptree.hpp" +#ifdef _WIN32 +# pragma warning (pop) +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +# pragma warning (disable: 4100 4189 4505) +#endif + +namespace { + +const boost::property_tree::ptree default_ptree; + +// struct streambuf - wrap a std::streambuf around an external buffer +// +// This is used create elf files from memory through a std::istream +struct streambuf : public std::streambuf +{ + streambuf(char* begin, char* end) + { + setg(begin, begin, end); + } + + template + streambuf(T* begin, T* end) + : streambuf(reinterpret_cast(begin), reinterpret_cast(end)) + {} + + template + streambuf(const T* begin, const T* end) // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + : streambuf(const_cast(begin), const_cast(end)) + {} + + std::streampos + seekpos(std::streampos pos, std::ios_base::openmode which) override + { + setg(eback(), eback() + pos, egptr()); + return gptr() - eback(); + } + + std::streampos + seekoff(std::streamoff off, std::ios_base::seekdir way, std::ios_base::openmode which) override + { + if (way == std::ios_base::cur) + gbump(static_cast(off)); + else if (way == std::ios_base::end) + setg(eback(), egptr() + off, egptr()); + else if (way == std::ios_base::beg) + setg(eback() + off, gptr(), egptr()); + return gptr() - eback(); + } +}; + +// Artifacts are encoded / referenced in recipe by string. 
+// The artifacts can be stored in a file system or in memory +// depending on how the recipe is loaded +namespace artifacts { + +// class repo - artifact repository +class repo +{ +protected: + mutable std::map> m_data; + +public: + virtual ~repo() = default; + + virtual const std::vector& + get(const std::string& path) const = 0; +}; + +// class file_repo - file system artifact repository +// Artifacts are loaded from disk and stored in persistent storage +class file_repo : public repo +{ +public: + const std::vector& + get(const std::string& path) const override + { + if (auto it = m_data.find(path); it != m_data.end()) + return (*it).second; + + std::ifstream ifs(path, std::ios::binary); + if (!ifs) + throw std::runtime_error{"Failed to open file: " + path}; + + ifs.seekg(0, std::ios::end); + std::vector data(ifs.tellg()); + ifs.seekg(0, std::ios::beg); + ifs.read(data.data(), data.size()); + auto [itr, success] = m_data.emplace(path, std::move(data)); + + return (*itr).second; + } +}; + +// class ram_repo - in-memory artifact repository +// Used artifacts are copied to persistent storage +class ram_repo : public repo +{ + const std::map>& m_reference; +public: + explicit ram_repo(const std::map>& data) + : m_reference{data} + {} + + const std::vector& + get(const std::string& path) const override + { + if (auto it = m_data.find(path); it != m_data.end()) + return (*it).second; + + if (auto it = m_reference.find(path); it != m_reference.end()) { + auto [itr, success] = m_data.emplace(path, it->second); + return (*itr).second; + } + + throw std::runtime_error{"Failed to find artifact: " + path}; + } +}; + +} // namespace artifacts + +namespace module_cache { + +// Cache of elf files to modules to avoid recreating modules +// referring to the same elf file. +static std::map s_path2elf; // NOLINT +static std::map s_elf2mod; // NOLINT + +static xrt::module +get(const xrt::elf& elf) +{ + if (auto it = s_elf2mod.find(elf); it != s_elf2mod.end()) + return (*it).second; + + xrt::module mod{elf}; + s_elf2mod.emplace(elf, mod); + return mod; +} + +static xrt::module +get(const std::string& path, const artifacts::repo& repo) +{ + if (auto it = s_path2elf.find(path); it != s_path2elf.end()) + return get((*it).second); + + auto& data = repo.get(path); + streambuf buf{data.data(), data.data() + data.size()}; + std::istream is{&buf}; + xrt::elf elf{is}; + s_path2elf.emplace(path, elf); + + return get(elf); +} + +} // module_cache + +class recipe +{ + // class header - header section of the recipe + class header + { + xrt::xclbin m_xclbin; + + static xrt::xclbin + read_xclbin(const boost::property_tree::ptree& pt, const artifacts::repo& repo) + { + auto path = pt.get("xclbin_path"); + auto data = repo.get(path); + return xrt::xclbin{data}; + } + + public: + header(const boost::property_tree::ptree& pt, const artifacts::repo& repo) + : m_xclbin{read_xclbin(pt, repo)} + { + XRT_DEBUGF("Loaded xclbin: %s\n", m_xclbin.get_uuid().to_string().c_str()); + } + + header(const header&) = default; + + xrt::xclbin + get_xclbin() const + { + return m_xclbin; + } + }; // class recipe::header + + // class resources - resource section of the recipe + class resources + { + public: + class buffer + { + std::string m_name; + + enum class type { input, output, internal }; + type m_type; + + size_t m_size; + + // Buffer object is created for internal nodes, but not for + // input/output which are bound during execution. 
+ xrt::bo m_xrt_bo; + + // Only internal buffers have a size and are created during + // as part of loading the recipe. External buffers are bound + // during execution. + buffer(const xrt::device& device, std::string name, type t, size_t sz) + : m_name(std::move(name)) + , m_type(t) + , m_size(sz) + , m_xrt_bo{m_type == type::internal ? xrt::ext::bo{device, m_size} : xrt::bo{}} + { + XRT_DEBUGF("recipe::resources::buffer(%s)\n", m_name.c_str()); + } + + // Copy constructor creates a new buffer with same properties as other + // The xrt::bo is not copied, but a new one is created. + buffer(const xrt::device& device, const buffer& other) + : m_name(other.m_name) + , m_type(other.m_type) + , m_size(other.m_size) + , m_xrt_bo{m_type == type::internal ? xrt::ext::bo{device, m_size} : xrt::bo{}} + {} + + static type + to_type(const std::string& t) + { + if (t == "input") + return type::input; + if (t == "output") + return type::output; + if (t == "internal") + return type::internal; + + throw std::runtime_error("Unknown buffer type '" + t + "'"); + } + public: + buffer(const buffer& rhs) = default; + buffer(buffer&& rhs) = default; + + // create_buffer - create a buffer object from a property tree node + static buffer + create_buffer(const xrt::device& device, const boost::property_tree::ptree& pt) + { + auto tp = to_type(pt.get("type")); // required, input/output/internal + auto sz = (tp == type::internal) ? pt.get("size") : 0; // required for internal buffers + return {device, pt.get("name"), tp, sz}; + } + + // create_buffer - create a buffer object from another buffer object + // This will create a new buffer object with the same properties as the + // other buffer, but with a new xrt::bo object. + static buffer + create_buffer(const xrt::device& device, const buffer& other) + { + return {device, other}; + } + + xrt::bo + get_xrt_bo() const + { + return m_xrt_bo; + } + + std::string + get_name() const + { + return m_name; + } + + void + bind(const xrt::bo& bo) + { + m_xrt_bo = bo; + } + }; // class recipe::resources::buffer + + class kernel + { + std::string m_name; + std::string m_xclbin_name; + xrt::xclbin::kernel m_xclbin_kernel; + xrt::kernel m_xrt_kernel; + + // Kernel must be in xclbin. The xclbin was used when the hwctx was + // constructed. Here we lookup the xclbin kernel object for additional + // meta data (may not be needed). + kernel(const xrt::hw_context& ctx, const xrt::module& mod, std::string name, std::string xname) + : m_name{std::move(name)} + , m_xclbin_name{std::move(xname)} + , m_xclbin_kernel{ctx.get_xclbin().get_kernel(m_xclbin_name)} + , m_xrt_kernel{xrt::ext::kernel{ctx, mod, m_xclbin_name}} + { + XRT_DEBUGF("recipe::resources::kernel(%s, %s)\n", m_name.c_str(), m_xclbin_name.c_str()); + } + + // Legacy kernel (alveo) + kernel(const xrt::hw_context& ctx, std::string name, std::string xname) + : m_name(std::move(name)) + , m_xclbin_name(std::move(xname)) + , m_xclbin_kernel{ctx.get_xclbin().get_kernel(m_xclbin_name)} + , m_xrt_kernel{xrt::kernel{ctx, m_xclbin_name}} + { + XRT_DEBUGF("recipe::resources::kernel(%s, %s)\n", m_name.c_str(), m_xclbin_name.c_str()); + } + + public: + kernel(const kernel& rhs) = default; + kernel(kernel&& rhs) = default; + + // create_kernel - create a kernel object from a property tree node + // The kernel control module is created if necessary. 
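+    // Example json node (see README.md):
+    // { "name": "k1", "xclbin_kernel_name": "DPU", "ctrlcode": "no-ctrl-packet.elf" }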
+ static kernel + create_kernel(const xrt::hw_context& hwctx, const boost::property_tree::ptree& pt, + const artifacts::repo& repo) + { + auto name = pt.get("name"); // required, default xclbin kernel name + auto elf = pt.get("ctrlcode", ""); // optional elf file + if (elf.empty()) + return kernel{hwctx, name, pt.get("xclbin_kernel_name", name)}; + + auto mod = module_cache::get(elf, repo); + return kernel{hwctx, mod, name, pt.get("xclbin_kernel_name", name)}; + } + + xrt::kernel + get_xrt_kernel() const + { + return m_xrt_kernel; + } + }; // class recipe::resources::kernel + + class cpu + { + private: + std::string m_name; + std::string m_path; + xrt_core::cpu::function m_fcn; + + cpu(std::string name, std::string path) + : m_name{std::move(name)} + , m_path{std::move(path)} + , m_fcn{m_name, m_path} + { + XRT_DEBUGF("recipe::resources::cpu(%s, %s)\n", m_name.c_str(), m_path.c_str()); + } + + public: + cpu(const cpu& rhs) = default; + cpu(cpu&& rhs) = default; + + // create_cpu - create a cpu object from a property tree node + static cpu + create_cpu(const boost::property_tree::ptree& pt) + { + auto name = pt.get("name"); // required + auto library_path = xrt_core::environment::xilinx_xrt() + / pt.get("library_path"); // required + return cpu{name, library_path.string()}; + } + + xrt_core::cpu::function + get_function() const + { + return m_fcn; + } + }; // class recipe::resources::cpu + + xrt::device m_device; + xrt::hw_context m_hwctx; + std::map m_buffers; + std::map m_kernels; + std::map m_cpus; + + // create_buffers - create buffer objects from buffer property tree nodes + static std::map + create_buffers(const xrt::device& device, const boost::property_tree::ptree& pt) + { + std::map buffers; + for (const auto& [name, node] : pt) + buffers.emplace(node.get("name"), buffer::create_buffer(device, node)); + + return buffers; + } + + // create_buffers - create buffer objects from buffer objects + // This will create new buffer objects with the same properties as the + // other buffers, but with new xrt::bo objects. 
+ static std::map + create_buffers(const xrt::device& device, const std::map& others) + { + std::map buffers; + for (const auto& [name, other] : others) + buffers.emplace(name, buffer::create_buffer(device, other)); + + return buffers; + } + + // create_kernels - create kernel objects from kernel property tree nodes + static std::map + create_kernels(xrt::device device, const xrt::hw_context& hwctx, + const boost::property_tree::ptree& pt, const artifacts::repo& repo) + { + std::map kernels; + for (const auto& [name, node] : pt) + kernels.emplace(node.get("name"), kernel::create_kernel(hwctx, node, repo)); + + return kernels; + } + + // create_cpus - create cpu objects from cpu property tree nodes + static std::map + create_cpus(const boost::property_tree::ptree& pt) + { + std::map cpus; + for (const auto& [name, node] : pt) + cpus.emplace(node.get("name"), cpu::create_cpu(node)); + + return cpus; + } + + public: + resources(xrt::device device, const xrt::xclbin& xclbin, + const boost::property_tree::ptree& recipe, const artifacts::repo& repo) + : m_device{std::move(device)} + , m_hwctx{m_device, m_device.register_xclbin(xclbin)} + , m_buffers{create_buffers(m_device, recipe.get_child("buffers"))} + , m_kernels{create_kernels(m_device, m_hwctx, recipe.get_child("kernels"), repo)} + , m_cpus{create_cpus(recipe.get_child("cpus", default_ptree))} // optional + {} + + resources(const resources& other) + : m_device{other.m_device} // share device + , m_hwctx{other.m_hwctx} // share hwctx + , m_buffers{create_buffers(m_device, other.m_buffers)} // new buffers + , m_kernels{other.m_kernels} // share kernels + , m_cpus{other.m_cpus} // share cpus + {} + + xrt::hw_context + get_xrt_hwctx() const + { + return m_hwctx; + } + + xrt::kernel + get_xrt_kernel_or_error(const std::string& name) const + { + auto it = m_kernels.find(name); + if (it == m_kernels.end()) + throw std::runtime_error("Unknown kernel '" + name + "'"); + return it->second.get_xrt_kernel(); + } + + xrt_core::cpu::function + get_cpu_function_or_error(const std::string& name) const + { + auto it = m_cpus.find(name); + if (it == m_cpus.end()) + throw std::runtime_error("Unknown cpu '" + name + "'"); + return it->second.get_function(); + } + + resources::buffer + get_buffer_or_error(const std::string& name) const + { + auto it = m_buffers.find(name); + if (it == m_buffers.end()) + throw std::runtime_error("Unknown buffer '" + name + "'"); + + return it->second; + } + }; // class recipe::resources + + // class execution - execution section of the recipe + class execution + { + class run + { + struct argument + { + resources::buffer m_buffer; + + // Buffer object for the argument. This can be a sub-buffer + // if the argument is sliced or it can be null bo if the + // argument is unbound. 
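+        // Example json argument node (see README.md):
+        // { "name": "ifm_int", "size": 512, "offset": 0, "argidx": 3 }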
+ size_t m_offset; + size_t m_size; // 0 indicates the entire buffer + int m_argidx; + + xrt::bo m_xrt_bo; + + static xrt::bo + create_xrt_bo(const resources::buffer& buffer, size_t offset, size_t size) + { + auto bo = buffer.get_xrt_bo(); + if (bo && (bo.size() < size)) + throw std::runtime_error("buffer size mismatch for buffer: " + buffer.get_name()); + + if (bo && (size < bo.size())) + // sub-buffer + return xrt::bo{bo, size, offset}; + + return bo; // may be null bo for unbound buffer arguments + } + + argument(const resources& resources, const boost::property_tree::ptree& pt) + : m_buffer{resources.get_buffer_or_error(pt.get("name"))} + , m_offset{pt.get("offset", 0)} + , m_size{pt.get("size", 0)} + , m_argidx{pt.get("argidx")} + , m_xrt_bo{create_xrt_bo(m_buffer, m_offset, m_size)} + { + XRT_DEBUGF("recipe::execution::run::argument(%s, %d, %d, %d) bound(%s)\n", + m_buffer.get_name().c_str(), m_offset, m_size, m_argidx, m_xrt_bo ? "true" : "false"); + } + + void + bind(const xrt::bo& bo) + { + m_buffer.bind(bo); + m_xrt_bo = create_xrt_bo(m_buffer, m_offset, m_size); + } + + xrt::bo + get_xrt_bo() const + { + return m_xrt_bo; + } + }; // class recipe::execution::run::argument + + using run_type = std::variant; + std::string m_name; + run_type m_run; + std::map m_args; + + template + struct set_arg_visitor { + int m_idx; + ArgType m_value; + set_arg_visitor(int idx, ArgType&& arg) : m_idx(idx), m_value(std::move(arg)) {} + void operator() (xrt::run& run) const { run.set_arg(m_idx, m_value); } + void operator() (xrt_core::cpu::run& run) const { run.set_arg(m_idx, m_value); } + }; + + struct copy_visitor { + const std::string& m_name; + const resources& m_res; + copy_visitor(const std::string& nm, const resources& res) : m_name{nm}, m_res{res} {} + run_type operator() (const xrt::run&) + { return xrt::run{m_res.get_xrt_kernel_or_error(m_name)}; }; + run_type operator() (const xrt_core::cpu::run&) + { return xrt_core::cpu::run{m_res.get_cpu_function_or_error(m_name)}; }; + }; + + static std::map + create_and_set_args(const resources& resources, run_type run, const boost::property_tree::ptree& pt) + { + std::map args; + for (const auto& [name, node] : pt) { + argument arg {resources, node}; + if (auto bo = arg.get_xrt_bo()) + std::visit(set_arg_visitor{arg.m_argidx, std::move(bo)}, run); + + args.emplace(node.get("name"), std::move(arg)); + } + return args; + } + + static void + set_constant_args(run_type run, const boost::property_tree::ptree& pt) + { + for (const auto& [name, node] : pt) { + auto argidx = node.get("argidx"); + auto type = node.get("type"); + if (type == "int") + std::visit(set_arg_visitor{argidx, node.get("value")}, run); + else if (type == "string") + std::visit(set_arg_visitor{argidx, node.get("value")}, run); + else + throw std::runtime_error("Unknown constant argument type '" + type + "'"); + } + } + + static xrt_core::cpu::run + create_cpu_run(const resources& resources, const boost::property_tree::ptree& pt) + { + auto name = pt.get("name"); + return xrt_core::cpu::run{resources.get_cpu_function_or_error(name)}; + } + + static xrt::run + create_kernel_run(const resources& resources, const boost::property_tree::ptree& pt) + { + auto name = pt.get("name"); + return xrt::run{resources.get_xrt_kernel_or_error(name)}; + } + + static run_type + create_run(const resources& resources, const boost::property_tree::ptree& pt) + { + auto where = pt.get("where", "npu"); + if (where == "cpu") + return create_cpu_run(resources, pt); + + return create_kernel_run(resources, pt); + 
} + + static run_type + create_run(const resources& resources, const run& other) + { + return std::visit(copy_visitor{other.m_name, resources}, other.m_run); + } + + public: + run(const resources& resources, const boost::property_tree::ptree& pt) + : m_name{pt.get("name")} + , m_run{create_run(resources, pt)} + , m_args{create_and_set_args(resources, m_run, pt.get_child("arguments"))} + { + XRT_DEBUGF("recipe::execution::run(%s)\n", pt.get("name").c_str()); + + if (auto constants = pt.get_child_optional("constants")) +#if BOOST_VERSION >= 105600 + set_constant_args(m_run, constants.value()); +#else + set_constant_args(m_run, constants.get()); +#endif + } + + // Create a run from another run but using argument resources + // The ctor creates a new xrt::run or cpu::run from other, these + // runs refer to resources per argument resources + run(const resources& resources, const run& other) + : m_name{other.m_name} + , m_run{create_run(resources, other)} + {} + + bool + is_npu_run() const + { + return std::holds_alternative(m_run); + } + + bool + is_cpu_run() const + { + return std::holds_alternative(m_run); + } + + xrt::run + get_xrt_run() const + { + if (std::holds_alternative(m_run)) + return std::get(m_run); + + throw std::runtime_error("recipe::execution::run::get_xrt_run() called on a CPU run"); + } + + xrt_core::cpu::run + get_cpu_run() const + { + if (std::holds_alternative(m_run)) + return std::get(m_run); + + throw std::runtime_error("recipe::execution::run::get_cpu_run() called on a GPU run"); + } + + void + bind(const std::string& name, const xrt::bo& bo) + { + auto it = m_args.find(name); + if (it == m_args.end()) + return; // the argument is not used in this run + + auto& arg = (*it).second; + arg.bind(bo); + std::visit(set_arg_visitor{arg.m_argidx, arg.get_xrt_bo()}, m_run); + } + }; // class recipe::execution::run + + // struct runlist - a list of runs to execute + // Need to support CPU and NPU runlists. The CPU runlist will be + // a vector of xrt_core::cpu::run objects. The NPU runlist is + // simply an xrt::runlist object. + struct runlist + { + virtual ~runlist() = default; + virtual void execute() = 0; + virtual void wait() {} + }; + + struct cpu_runlist : runlist + { + std::vector m_runs; + + void + execute() override + { + for (auto& run : m_runs) + run.execute(); + } + }; + + struct npu_runlist : runlist + { + xrt::runlist m_runlist; + + explicit npu_runlist(const xrt::hw_context& hwctx) + : m_runlist{hwctx} + {} + + void + execute() override + { + m_runlist.execute(); + } + + void + wait() override + { + m_runlist.wait(); + } + }; + + + std::vector m_runs; + xrt::queue m_queue; // Queue that executes the runlists in sequence + xrt::queue::event m_event; // Event that signals the completion of the last runlist + std::exception_ptr m_eptr; + + std::vector> m_runlists; + + static std::vector> + create_runlists(const resources& resources, const std::vector& runs) + { + std::vector> runlists; + + // A CPU or NPU runlist is created for each contiguous sequence + // of CPU runs or NPU runs. The runlist is inserted into a + // vector of runlists where each individual runlist will be + // executed in sequence. 
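+    // For example, the run sequence [cpu, npu, npu, cpu, cpu] produces
+    // three runlists: cpu{1}, npu{2}, cpu{2}.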
+ npu_runlist* nrl = nullptr; + cpu_runlist* crl = nullptr; + for (const auto& run : runs) { + if (run.is_npu_run()) { + if (crl) + crl = nullptr; + + if (!nrl) { + auto rl = std::make_unique(resources.get_xrt_hwctx()); + nrl = rl.get(); + runlists.push_back(std::move(rl)); + } + + nrl->m_runlist.add(run.get_xrt_run()); + } + else if (run.is_cpu_run()) { + if (nrl) + nrl = nullptr; + + if (!crl) { + auto rl = std::make_unique(); + crl = rl.get(); + runlists.push_back(std::move(rl)); + } + + crl->m_runs.push_back(run.get_cpu_run()); + } + } + return runlists; + } + + // create_runs() - create a vector of runs from a property tree + static std::vector + create_runs(const resources& resources, const boost::property_tree::ptree& pt) + { + std::vector runs; + for (const auto& [name, node] : pt) + runs.emplace_back(resources, node); + + return runs; + } + + // create_runs() - create a vector of runs from existing runs + // A run object is a variant, the new run objects are created + // from the variant matching the type of the existing run. + static std::vector + create_runs(const resources& resources, const std::vector& others) + { + std::vector runs; + for (const auto& run : others) + runs.emplace_back(resources, run); + + return runs; + } + + public: + // execution() - create an execution object from a property tree + // The runs are created from the property tree and either xrt::run + // or cpu::run objects. + execution(const resources& resources, const boost::property_tree::ptree& recipe) + : m_runs{create_runs(resources, recipe.get_child("runs"))} + , m_runlists{create_runlists(resources, m_runs)} + {} + + // execution() - create an execution object from existing runs + // New run objects are created from the existing runs. + execution(const resources& resources, const execution& other) + : m_runs{create_runs(resources, other.m_runs)} + , m_runlists{create_runlists(resources, m_runs)} + {} + + void + bind(const std::string& name, const xrt::bo& bo) + { + // Iterate over all runs and bind the buffer. + // Note, that not all runs need to use the buffer. + // Maybe some optimization could be done here. + for (auto& run : m_runs) + run.bind(name, bo); + } + + void + execute() + { + XRT_DEBUGF("recipe::execution::execute()\n"); + + // execute_runlist() - execute a runlist synchronously + // The lambda function is executed asynchronously by an + // xrt::queue object. The wait is necessary for an NPU runlist, + // which must complete before next enqueue operation can be + // executed. Execution of an NPU runlist is itself asynchronous. + static auto execute_runlist = [](runlist* runlist, std::exception_ptr& eptr) { + try { + runlist->execute(); + runlist->wait(); // needed for NPU runlists, noop for CPU + } + catch (const xrt::runlist::command_error&) { + eptr = std::current_exception(); + } + catch (const std::exception&) { + eptr = std::current_exception(); + } + }; + + // A recipe can have multiple runlists. Each runlist can have + // multiple runs. Runlists are executed sequentially, execution + // is orchestrated by xrt::queue which uses one thread to + // asynchronously (from called pov) execute all runlists + for (auto& runlist : m_runlists) + m_event = m_queue.enqueue([this, &runlist] { execute_runlist(runlist.get(), m_eptr); }); + } + + void + wait() + { + XRT_DEBUGF("recipe::execution::wait()\n"); + // Sufficient to wait for last runlist to finish since last list + // must have waited for all previous lists to finish. 
+ auto runlist = m_runlists.back().get(); + if (runlist) + m_event.wait(); + + if (m_eptr) + std::rethrow_exception(m_eptr); + } + }; // class recipe::execution + + xrt::device m_device; + + boost::property_tree::ptree m_recipe; + header m_header; + resources m_resources; + execution m_execution; + + static boost::property_tree::ptree + load(const std::string& path) + { + boost::property_tree::ptree pt; + boost::property_tree::read_json(path, pt); + return pt; + } + +public: + recipe(xrt::device device, const std::string& path, const artifacts::repo& repo) + : m_device{std::move(device)} + , m_recipe{load(path)} + , m_header{m_recipe.get_child("header"), repo} + , m_resources{m_device, m_header.get_xclbin(), m_recipe.get_child("resources"), repo} + , m_execution{m_resources, m_recipe.get_child("execution")} + {} + + recipe(const recipe&) = default; + + void + bind_input(const std::string& name, const xrt::bo& bo) + { + XRT_DEBUGF("recipe::bind_input(%s)\n", name.c_str()); + m_execution.bind(name, bo); + } + + void + bind_output(const std::string& name, const xrt::bo& bo) + { + XRT_DEBUGF("recipe::bind_output(%s)\n", name.c_str()); + m_execution.bind(name, bo); + } + + void + bind(const std::string& name, const xrt::bo& bo) + { + XRT_DEBUGF("recipe::bind(%s)\n", name.c_str()); + m_execution.bind(name, bo); + } + + // The recipe can be executed with its currently bound + // input and output resources + void + execute() + { + XRT_DEBUGF("recipe::execute()\n"); + // Verify that all required resources are bound + // ... + + // Execute the runlist + m_execution.execute(); + } + + void + wait() + { + XRT_DEBUGF("recipe::wait()\n"); + m_execution.wait(); + } +}; // class recipe + +} // namespace + +namespace xrt_core { + +// class runner_impl - +// +// A runner implementation is default created with one instance of a +// recipe. But the runner can be used by multiple threads and new +// recipe instances are created for each thread as needed. +// +// The runner can be created from any thread, but member functions +// are thread specific. 
+class runner_impl +{ + //std::map m_recipes; + recipe m_recipe; + //thread_local recipe m_thread_recipe; + +public: + runner_impl(const xrt::device& device, const std::string& recipe) + : m_recipe{device, recipe, artifacts::file_repo{}} + {} + + runner_impl(const xrt::device& device, const std::string& recipe, const runner::artifacts_repository& artifacts) + : m_recipe{device, recipe, artifacts::ram_repo(artifacts)} + {} + + void + bind_input(const std::string& name, const xrt::bo& bo) + { + m_recipe.bind_input(name, bo); + } + + void + bind_output(const std::string& name, const xrt::bo& bo) + { + m_recipe.bind_output(name, bo); + } + + void + bind(const std::string& name, const xrt::bo& bo) + { + m_recipe.bind(name, bo); + } + + void + execute() + { + m_recipe.execute(); + } + + void + wait() + { + m_recipe.wait(); + } +}; + +//////////////////////////////////////////////////////////////// +// Public runner interface APIs +//////////////////////////////////////////////////////////////// +runner:: +runner(const xrt::device& device, const std::string& recipe) + : m_impl{std::make_unique(device, recipe)} +{} + +runner:: +runner(const xrt::device& device, const std::string& recipe, const artifacts_repository& repo) + : m_impl{std::make_unique(device, recipe, repo)} +{} + +void +runner:: +bind_input(const std::string& name, const xrt::bo& bo) +{ + m_impl->bind_input(name, bo); +} + +// bind_output() - Bind a buffer object to an output tensor +void +runner:: +bind_output(const std::string& name, const xrt::bo& bo) +{ + m_impl->bind_output(name, bo); +} + +void +runner:: +bind(const std::string& name, const xrt::bo& bo) +{ + m_impl->bind(name, bo); +} + +// execute() - Execute the runner +void +runner:: +execute() +{ + m_impl->execute(); +} + +void +runner:: +wait() +{ + m_impl->wait(); +} + +} // namespace xrt_core diff --git a/src/runtime_src/core/common/runner/runner.h b/src/runtime_src/core/common/runner/runner.h new file mode 100644 index 00000000000..787c6b98c51 --- /dev/null +++ b/src/runtime_src/core/common/runner/runner.h @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. +#ifndef XRT_COMMON_RUNNER_RUNNER_H_ +#define XRT_COMMON_RUNNER_RUNNER_H_ +#include "core/common/config.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace xrt { +class device; +class bo; +} + +namespace xrt_core { + +/** + * class runner - A class to execute a run recipe json + */ +class runner_impl; +class runner +{ + std::shared_ptr m_impl; // probably unique_ptr is enough + +public: + /** + * artifacts_repository - A map of artifacts + * + * The runner can be constructed with an artifacts repository, in + * which case the recipe references are looked up in the artifacts are + * looked up in the repository rather than from disk. 
+   */
+  using artifacts_repository = std::map<std::string, std::vector<char>>;
+
+  // ctor - Create runner from a recipe json
+  XRT_CORE_COMMON_EXPORT
+  runner(const xrt::device& device, const std::string& recipe);
+
+  // ctor - Create runner from a recipe json and artifacts repository
+  // The lifetime of the repo must extend the lifetime of the runner
+  XRT_CORE_COMMON_EXPORT
+  runner(const xrt::device& device, const std::string& recipe, const artifacts_repository&);
+
+  // bind_input() - Bind a buffer object to an input tensor
+  XRT_CORE_COMMON_EXPORT
+  void
+  bind_input(const std::string& name, const xrt::bo& bo);
+
+  // bind_output() - Bind a buffer object to an output tensor
+  XRT_CORE_COMMON_EXPORT
+  void
+  bind_output(const std::string& name, const xrt::bo& bo);
+
+  // bind() - Bind a buffer object to a tensor
+  XRT_CORE_COMMON_EXPORT
+  void
+  bind(const std::string& name, const xrt::bo& bo);
+
+  // execute() - Execute the runner
+  XRT_CORE_COMMON_EXPORT
+  void
+  execute();
+
+  // wait() - Wait for the execution to complete
+  XRT_CORE_COMMON_EXPORT
+  void
+  wait();
+};
+
+/**
+ * The xrt::runner supports execution of CPU functions as well
+ * as xrt::kernel objects.
+ *
+ * The CPU functions are implemented in runtime loaded dynamic
+ * libraries.  A library must define and export a function that
+ * initializes a callback structure with a lookup function.
+ *
+ * The signature of the lookup function must be
+ * @code
+ *  void lookup_fn(const std::string& name, xrt::cpu::lookup_args* args)
+ * @endcode
+ * where the name is the name of the function to look up and args is a
+ * structure that the lookup function must populate with the function
+ * information.
+ *
+ * The arguments to the CPU functions are type-erased via std::any and
+ * the signature of the CPU functions is fixed to
+ * @code
+ *   void cpu_function(std::vector<std::any>& args)
+ * @endcode
+ * Internally, the CPU library unwraps the arguments and calls the
+ * actual function.
+ */
+namespace cpu {
+
+/**
+ * struct lookup_args - argument structure for the lookup function
+ *
+ * The lookup function takes as arguments the name of the function
+ * to look up along with lookup_args to be populated with information
+ * about the function.
+ *
+ * @num_args - number of arguments to the function
+ * @callable - a C++ function object wrapping the function
+ *
+ * The callable library functions use type erasure on their arguments
+ * through a std::vector of std::any objects.  The callable must
+ * unwrap each std::any object to its expected type, which is
+ * cumbersome, but type safe.  The type-erased arguments allow the
+ * runner to be generic and not tied to a specific function signature.
+*/
+struct lookup_args
+{
+  std::uint32_t num_args {0};
+  std::function<void(std::vector<std::any>&)> callable;
+};
+
+/**
+ * struct library_init_args - argument structure for library initialization
+ *
+ * The library initialization function is the only function exported
+ * from the run time loaded library.  The library initialization
+ * function is called by the runner when a resource references a
+ * function in a library and the library is not already loaded.
+ *
+ * @lookup_fn - a callback function to be populated with the
+ *   lookup function.  The lookup function must throw an exception
+ *   if it fails.
+ *
+ * The library initialization function is a C-callable exported symbol,
+ * but it hands back the C++ lookup function through this structure.
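+ *
+ * Illustrative skeleton of such a library (a sketch; "scale" and its
+ * implementation are placeholders):
+ * @code
+ *  static void
+ *  scale(std::vector<std::any>& args)
+ *  {
+ *    // unwrap the type-erased arguments and do the work
+ *  }
+ *
+ *  static void
+ *  lookup(const std::string& name, lookup_args* args)
+ *  {
+ *    if (name == "scale") {
+ *      args->num_args = 1;
+ *      args->callable = &scale;
+ *      return;
+ *    }
+ *    throw std::runtime_error("unknown function: " + name);
+ *  }
+ *
+ *  extern "C" void
+ *  library_init(library_init_args* args)
+ *  {
+ *    args->lookup_fn = &lookup;
+ *  }
+ * @endcode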
+*/
+struct library_init_args
+{
+  std::function<void(const std::string&, lookup_args*)> lookup_fn;
+};
+
+/**
+ * library_init_fn - type of the library initialization function
+ * The name of the library initialization function is fixed to
+ * "library_init".
+*/
+using library_init_fn = void (*)(library_init_args*);
+
+} // cpu
+
+} // namespace xrt_core
+#endif
diff --git a/src/runtime_src/core/common/runner/test/.gitignore b/src/runtime_src/core/common/runner/test/.gitignore
new file mode 100644
index 00000000000..34074f3d751
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/.gitignore
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+build/*
diff --git a/src/runtime_src/core/common/runner/test/CMakeLists.txt b/src/runtime_src/core/common/runner/test/CMakeLists.txt
new file mode 100644
index 00000000000..1d519d5f40d
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/CMakeLists.txt
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.18.0)
+PROJECT(runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED OFF)
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+
+if (WIN32)
+  add_compile_options(/Zc:__cplusplus)
+endif()
+
+find_package(XRT REQUIRED HINTS ${XILINX_XRT}/share/cmake/XRT)
+message("-- XRT_INCLUDE_DIRS=${XRT_INCLUDE_DIRS}")
+
+add_executable(runner runner.cpp)
+target_include_directories(runner PRIVATE ${XRT_INCLUDE_DIRS} ${XRT_ROOT}/src/runtime_src)
+target_link_libraries(runner PRIVATE XRT::xrt_coreutil)
+
+add_executable(recipe recipe.cpp)
+target_include_directories(recipe PRIVATE ${XRT_INCLUDE_DIRS} ${XRT_ROOT}/src/runtime_src)
+target_link_libraries(recipe PRIVATE XRT::xrt_coreutil)
+
+if (NOT WIN32)
+  target_link_libraries(runner PRIVATE pthread uuid dl)
+  target_link_libraries(recipe PRIVATE pthread uuid dl)
+endif()
+
+install(TARGETS runner recipe)
+
diff --git a/src/runtime_src/core/common/runner/test/README.md b/src/runtime_src/core/common/runner/test/README.md
new file mode 100644
index 00000000000..88e2d43adb3
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/README.md
@@ -0,0 +1,74 @@
+
+
+# Runner tests
+
+This directory contains runner test code.
+
+## recipe.cpp
+
+A test wrapper for creating a `runner` from a `run-recipe.json`. Used
+for debugging purposes; it basically validates that the run-recipe can
+be parsed and that resources can be created.
+
+## runner.cpp
+
+A complete host program that creates a runner and executes the execution
+section of the recipe.
+
+The program executes the recipe given on the command line with external
+resources bound through command line arguments.
+
+```
+% runner.exe [--resource name:path]* [--buffer name:path]* [--golden name:path]* --recipe <recipe.json>
+```
+
+The recipe references resources through `name` matching. External resources
+must be made available to the runner in one of two ways:
+
+1. The resource must be bound to the runner after the runner has been created.
+2. The resource must be in memory in a repository passed to the runner constructor.
+
+The runner.cpp file supports creating external `xrt::bo` objects from
+a binary file specified through the `--buffer name:path` command line
+switch.  This triggers the host code to create an `xrt::bo` and populate
+it with the content of the file pointed to by `path`.  The host code
+binds this resource to the runner using method 1) above before the runner
+is executed.  The `--buffer` switch can be repeated any number of times.
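+
+An illustrative invocation (the buffer names and file names are
+placeholders that must match the recipe being run):
+
+```
+% runner.exe --recipe recipe.json \
+             --buffer wts:wts.bin --buffer ifm:ifm.bin --buffer ofm:ofm.bin
+```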
+
+The runner.cpp program also supports loading external resources, for
+example elf files, into memory before calling the constructor of the
+runner. This is done using the `--resource name:path` command line switch
+and is method 2) above.  The content of the file pointed to by `path` is
+read into memory and associated with `name` in an artifacts repository
+passed as an argument to the runner constructor.  The `--resource` switch
+can be specified any number of times.
+
+Finally, the runner supports loading golden data to be compared with
+the content of an external buffer populated by the runner.  This is
+done using the `--golden name:path` command line switch.  The `name` must
+match that of an external buffer created with `--buffer`.  The `path`
+identifies a file with golden data.  The golden data is compared to
+the content of the external buffer after the runner has completed
+execution.
+
+The host code has the following steps:
+
+1. Create an artifacts repository from the `--resource` switches
+2. Create an xrt_core::runner object from the artifacts repo and the `--recipe`
+3. Create external buffer resources from the `--buffer` switches
+4. Bind the external resources to the runner
+5. Execute the runner
+6. Wait for the runner to complete
+7. Compare golden data specified in the `--golden` switches
+
+
+## Build instructions
+
+```
+% mkdir build
+% cd build
+% cmake -DXILINX_XRT=c:/users/stsoe/git/stsoe/XRT-MCDM/build/WDebug/xilinx/xrt \
+        -DXRT_ROOT=c:/users/stsoe/git/stsoe/XRT-MCDM/src/xrt ..
+% cmake --build . --config Debug
+```
+
diff --git a/src/runtime_src/core/common/runner/test/cpulib.cpp b/src/runtime_src/core/common/runner/test/cpulib.cpp
new file mode 100644
index 00000000000..758aec76bb3
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/cpulib.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#include "experimental/xrt_runner.h"
+#include "xrt/xrt_bo.h"
+
+#include <any>
+#include <cstring>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#pragma warning(disable: 4100 4505)
+
+namespace cpux {
+
+static void
+convert_ifm(std::vector<std::any>& args)
+{
+  auto src = std::any_cast<xrt::bo>(args.at(0));
+  auto dst = std::any_cast<xrt::bo>(args.at(1));
+
+  if (src.size() != dst.size())
+    throw std::runtime_error("src and dst size mismatch");
+
+  auto src_data = src.map();
+  auto dst_data = dst.map();
+
+  // convert
+  std::memcpy(dst_data, src_data, src.size());
+}
+
+static void
+convert_ofm(std::vector<std::any>& args)
+{
+  auto src = std::any_cast<xrt::bo>(args.at(0));
+  auto dst = std::any_cast<xrt::bo>(args.at(1));
+
+  if (src.size() != dst.size())
+    throw std::runtime_error("src and dst size mismatch");
+
+  auto src_data = src.map();
+  auto dst_data = dst.map();
+
+  // convert
+  std::memcpy(dst_data, src_data, src.size());
+}
+
+static void
+hello(const std::vector<std::any>& args)
+{
+  auto value = std::any_cast<int>(args.at(0));
+  auto str = std::any_cast<std::string>(args.at(1));
+  auto out = std::any_cast<std::string*>(args.at(2));
+
+  if (!out)
+    throw std::runtime_error("output argument is null");
+
+  *out = "hello out " + std::to_string(value) + " " + str;
+}
+
+static void
+lookup(const std::string& fnm, xrt::cpu::lookup_args* args)
+{
+  using function_info = xrt::cpu::lookup_args;
+  static std::map<std::string, function_info> function_map =
+  {
+    { "convert_ifm", {2, convert_ifm} },
+    { "convert_ofm", {2, convert_ofm} },
+    { "hello", {3, hello} },
+  };
+
+  if (auto it = function_map.find(fnm); it != function_map.end()) {
+    const auto& [num_args, fn] = it->second;
+    args->num_args = num_args;
+    args->callable = fn;
+    return;
+  }
+
+  throw std::runtime_error("function '" + fnm + "' not found");
+}
+
+} // cpux
+
+extern "C" {
+
+__declspec(dllexport)
+void
+library_init(xrt::cpu::library_init_args* args)
+{
+  args->lookup_fn = &cpux::lookup;
+}
+
+} // extern "C"
diff --git a/src/runtime_src/core/common/runner/test/recipe.cpp b/src/runtime_src/core/common/runner/test/recipe.cpp
new file mode 100644
index 00000000000..073df26e6ca
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/recipe.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#include "core/common/runner/runner.h" +#include "xrt/xrt_device.h" + +#include +#include + +#ifdef _WIN32 +# pragma warning (disable: 4100) +#endif + +static void +run(int argc, char* argv[]) +{ + std::string recipe { argv[1] }; + xrt::device device{0}; + + xrt_core::runner runner{device, recipe}; +} + +int +main(int argc, char* argv[]) +{ + try { + if (argc < 2) { + std::cout << "Usage: " << argv[0] << " " << '\n'; + return 1; + } + + run(argc, argv); + return 0; + } + catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << '\n'; + return 1; + } +} diff --git a/src/runtime_src/core/common/runner/test/recipe.json b/src/runtime_src/core/common/runner/test/recipe.json new file mode 100644 index 00000000000..d2f3f80cd8d --- /dev/null +++ b/src/runtime_src/core/common/runner/test/recipe.json @@ -0,0 +1,83 @@ +{ + "header": { + "xclbin_path": "design.xclbin" + }, + "resources": { + "buffers": [ + { + "name": "wts", + "type": "input" + }, + { + "name": "ifm", + "type": "input" + }, + { + "name": "ifm_int", + "type": "internal", + "size": "1536" + }, + { + "name": "ofm_int", + "type": "internal", + "size": "320" + }, + { + "name": "ofm", + "type": "output" + } + ], + "cpus": [ + { + "name": "convert_ifm", + "library_path": "cpulib" + }, + { + "name": "convert_ofm", + "library_path": "cpulib" + } + ], + "kernels": [ + { + "name": "k1", + "xclbin_kernel_name": "DPU", + "ctrlcode": "no-ctrl-packet.elf" + } + ] + }, + "execution": { + "runs": [ + { + "name": "convert_ifm", + "where": "cpu", + "arguments" : [ + { "name": "ifm", "argidx": 0 }, + { "name": "ifm_int", "argidx": 1 } + ] + }, + { + "name": "k1", + "arguments" : [ + { "name": "wts", "argidx": 4 }, + { "name": "ifm_int", "argidx": 3 }, + { "name": "ofm_int", "argidx": 5 } + ], + "constants": [ + { "value": "3", "type": "int", "argidx": 0 }, + { "value": "0", "type": "int", "argidx": 1 }, + { "value": "0", "type": "int", "argidx": 2 }, + { "value": "0", "type": "int", "argidx": 6 }, + { "value": "0", "type": "int", "argidx": 7 } + ] + }, + { + "name": "convert_ofm", + "where": "cpu", + "arguments" : [ + { "name": "ofm_int", "argidx": 0 }, + { "name": "ofm", "argidx": 1 } + ] + } + ] + } +} diff --git a/src/runtime_src/core/common/runner/test/runner.cpp b/src/runtime_src/core/common/runner/test/runner.cpp new file mode 100644 index 00000000000..93009da5de7 --- /dev/null +++ b/src/runtime_src/core/common/runner/test/runner.cpp @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + +// This test configures and runs a recipe one time +// g++ -g -std=c++17 +// -I/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt/include +// -I/home/stsoe/git/stsoe/XRT/src/runtime_src +// -L/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt/lib +// -o runner.exe runner.cpp -lxrt_coreutil -pthread +// +// or +// +// mkdir build +// cd build +// cmake -DXILINX_XRT=/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt +// -DXRT_ROOT=/home/stsoe/git/stsoe/XRT .. +// cmake --build . --config Debug +// +// ./runner.exe -kp ... -kp ... -bd ... -bd ... -bg ... -recipe ... 
+ +#include "xrt/xrt_device.h" +#include "experimental/xrt_ext.h" +#include "core/common/runner/runner.h" + +#include +#include +#include +#include +#include +#include +#include + +static xrt_core::runner::artifacts_repository g_repo; +static std::map g_buffer2data; +static std::map g_buffer2bo; +static std::map g_buffer2golden; +static std::string g_recipe; + +static void +usage() +{ + std::cout << "usage: %s [options]\n"; + std::cout << " --resource artifact key data pair, the key is referenced by recipe\n"; + std::cout << " --buffer external buffer data, the key is referenced by recipe\n"; + std::cout << " --golden external buffer goldendata, the key matches a -bd pair\n"; + std::cout << " --recipe recipe file to run\n"; + std::cout << "\n\n"; + std::cout << "host.exe -r elf:foo.elf \n" + << " -b ifm:ifm.bin -b ofm:ofm.bin -b wts:wts.bin\n" + << " -g ofm:gold.bin\n" + << " --recipe recipe.json\n"; +} + +static std::vector +read_file(const std::string& fnm) +{ + std::ifstream ifs{fnm, std::ios::binary}; + if (!ifs) + throw std::runtime_error("Failed to open file '" + fnm + "' for reading"); + + ifs.seekg(0, std::ios::end); + std::vector data(ifs.tellg()); + ifs.seekg(0, std::ios::beg); + ifs.read(data.data(), data.size()); + return data; +} + +static void +add_repo_file(const std::string& key, const std::string& path) +{ + auto data = read_file(path); + g_repo.emplace(key, std::move(data)); +} + +static void +run(const xrt::device& device, const std::string& recipe) +{ + // 1. Add artifacts to the repository (done during cmdline parsing) + + // 2. Create the runner from the recipe + xrt_core::runner runner {device, recipe, g_repo}; + + // 3. Create buffers for external input and output + // 4. Bind to runner + for (auto& [buffer, path] : g_buffer2data) { + auto data = read_file(path); + std::cout << buffer << " size = " << data.size() << "\n"; + xrt::bo bo = xrt::ext::bo{device, data.size()}; + auto bo_data = bo.map(); + std::copy(data.data(), data.data() + data.size(), bo_data); + bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); + runner.bind(buffer, bo); + + // Save if referenced for golden comparison + g_buffer2bo.emplace(buffer, bo); + } + + // 5. Execute the runner and wait for completion + runner.execute(); + + // 6. Wait for the runner to finish + runner.wait(); + + // 7. 
Compare the output with golden if any + for (auto& [buffer, golden] : g_buffer2golden) { + auto bo = g_buffer2bo.at(buffer); + bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + auto bo_data = bo.map(); + auto golden_data = read_file(golden); + if (bo.size() != golden_data.size()) + throw std::runtime_error("Golden and output size mismatch"); + + std::cout << "Comparing golden and output data\n"; + if (!std::equal(golden_data.data(), golden_data.data() + golden_data.size(), bo_data)) { + for (uint64_t i = 0; i < golden_data.size(); ++i) { + if (golden_data[i] != bo_data[i]) + throw std::runtime_error("Golden and output mismatch at index " + std::to_string(i)); + } + } + } +} + +static void +run(const std::string& recipe) +{ + // Create device + xrt::device device{0}; + run(device, recipe); +} + +static void +run(int argc, char* argv[]) +{ + std::vector args(argv+1,argv+argc); + std::string cur; + std::string recipe; + for (auto& arg : args) { + if (arg == "-h") { + usage(); + return; + } + + if (arg[0] == '-') { + cur = arg; + continue; + } + + if (cur == "--resource" || cur == "-r") { + auto pos = arg.find(":"); + if (pos == std::string::npos) + throw std::runtime_error("resource option must take the form of '-resource key:path'"); + + auto key = arg.substr(0,pos); + auto path = arg.substr(pos+1); + + std::cout << "Adding repo (key, path): (" << key << ", " << path << ")\n"; + add_repo_file(key, path); + } + else if (cur == "--buffer" || cur =="-b") { + auto pos = arg.find(":"); + if (pos == std::string::npos) + throw std::runtime_error("buffer data option must take the form of '-buffer buffer:path'"); + + auto buffer = arg.substr(0,pos); + auto datapath = arg.substr(pos+1); + + std::cout << "Using (buffer, path): (" << buffer << ", " << datapath << ")\n"; + g_buffer2data.emplace(buffer, datapath); + } + else if (cur == "-golden" || cur == "-g") { + auto pos = arg.find(":"); + if (pos == std::string::npos) + throw std::runtime_error("golden data option must take the form of '-golden buffer:path'"); + + auto buffer = arg.substr(0,pos); + auto datapath = arg.substr(pos+1); + + std::cout << "Using golden (buffer, path): (" << buffer << ", " << datapath << ")\n"; + g_buffer2golden.emplace(buffer, datapath); + } + else if (cur == "--recipe") { + std::cout << "Using recipe: " << arg << '\n'; + recipe = arg; + } + else + throw std::runtime_error("Unknown option value " + cur + " " + arg); + } + + run(recipe); +} + +int +main(int argc, char **argv) +{ + try { + run(argc, argv); + return 0; + } + catch (const std::exception& ex) { + std::cerr << "Error: " << ex.what() << '\n'; + } + catch (...) { + std::cerr << "Unknown error\n"; + } + return 1; + +} diff --git a/src/runtime_src/core/common/runner/test/tcpu.cpp b/src/runtime_src/core/common/runner/test/tcpu.cpp new file mode 100644 index 00000000000..4b05f1411e2 --- /dev/null +++ b/src/runtime_src/core/common/runner/test/tcpu.cpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
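+//
+// tcpu.cpp - stand-alone test of the CPU function mechanism.  It loads a
+// run-time library (for example the one built from cpulib.cpp) and invokes
+// its "hello" function through xrt_core::cpu::function and xrt_core::cpu::run.
+//
+// Illustrative invocation (the library name is a placeholder):
+//   % tcpu.exe cpulib.dll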
+#include <iostream>
+#include <string>
+
+#include "../xrt_runner.h"
+#include "../cpu.h"
+
+static void
+run(int argc, char **argv)
+{
+  if (argc != 2) {
+    std::cerr << "Usage: " << argv[0] << " <cpu-library>\n";
+    return;
+  }
+
+  auto dll = argv[1];
+  xrt_core::cpu::function hello{"hello", dll};
+  xrt_core::cpu::run run{hello};
+  run.set_arg(0, 10);
+  run.set_arg(1, std::string("world"));
+  std::string out;
+  run.set_arg(2, &out);
+  run.execute();
+  std::cout << out << "\n";
+}
+
+int
+main(int argc, char **argv)
+{
+  try {
+    run(argc, argv);
+    return 0;
+  }
+  catch (const std::exception& ex) {
+    std::cerr << "Error: " << ex.what() << "\n";
+  }
+  catch (...) {
+    std::cerr << "Unknown error" << "\n";
+  }
+  return 1;
+}