diff --git a/CMakeLists.txt b/CMakeLists.txt index b325c2763..a91c7f871 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,14 @@ endif() if(USE_DML) if(WIN32) add_compile_definitions(USE_DML=1) + add_compile_definitions(NOMINMAX) + + file(GLOB dml_srcs CONFIGURE_DEPENDS + "${PROJECT_SOURCE_DIR}/src/dml/*.h" + "${PROJECT_SOURCE_DIR}/src/dml/*.cpp" + ) + + list(APPEND generator_srcs ${dml_srcs}) else() message(FATAL_ERROR "USE_DML is ON but this isn't windows.") endif() @@ -135,6 +143,39 @@ endif() file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") if(USE_DML) list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") + target_include_directories(onnxruntime-genai PRIVATE $) + target_include_directories(onnxruntime-genai PRIVATE $/directx) + target_include_directories(onnxruntime-genai PRIVATE $) + target_include_directories(onnxruntime-genai-static PUBLIC $) + target_include_directories(onnxruntime-genai-static PUBLIC $/directx) + target_include_directories(onnxruntime-genai-static PUBLIC $) + target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib) + target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib) + + get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE) + set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12) + set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/nuget.config) + set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/packages.config) + + add_custom_command( + OUTPUT + ${DXC_PACKAGE_DIR}/build/native/bin/x64/dxc.exe + DEPENDS + ${PACKAGES_CONFIG} + ${NUGET_CONFIG} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/nuget/src/nuget restore ${PACKAGES_CONFIG} -PackagesDirectory ${PACKAGES_DIR} -ConfigFile ${NUGET_CONFIG} + VERBATIM + ) + + add_custom_target( + RESTORE_PACKAGES ALL + DEPENDS + ${DXC_PACKAGE_DIR}/build/native/bin/x64/dxc.exe + ) + + add_dependencies(RESTORE_PACKAGES nuget) + add_dependencies(onnxruntime-genai RESTORE_PACKAGES) + add_dependencies(onnxruntime-genai-static RESTORE_PACKAGES) endif() if(NO_TOKENIZEROOT) diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py index 04b4d0d14..d18f6acc9 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -13,13 +13,17 @@ from tqdm import tqdm # Use input model to generate prompt -def generate_prompt(model, tokenizer, prompt_length) -> str: +def generate_prompt(model, tokenizer, prompt_length, use_graph_capture) -> str: temperature = 1.0 prompt = "a" tokens = tokenizer.encode(prompt) params=og.GeneratorParams(model) params.set_search_options(do_sample=True, top_k=5, temperature=temperature, max_length=prompt_length, min_length=prompt_length+1) params.input_ids = tokens + + if use_graph_capture: + params.try_use_cuda_graph_with_max_batch_size(1) + generator=og.Generator(model, params) while not generator.is_done(): generator.compute_logits() @@ -63,13 +67,16 @@ def main(args): tokenizer = og.Tokenizer(model) # Generate prompt - prompt = [generate_prompt(model, tokenizer, prompt_length)] * batch_size + prompt = [generate_prompt(model, tokenizer, prompt_length, args.use_graph_capture)] * batch_size tokens = tokenizer.encode_batch(prompt) params = og.GeneratorParams(model) params.input_ids = tokens params.set_search_options(do_sample=True, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length) + if args.use_graph_capture: + params.try_use_cuda_graph_with_max_batch_size(batch_size) + if args.verbose: print("Running warmup runs...") for _ in 
tqdm(range(args.warmup)): generator = og.Generator(model, params) @@ -100,6 +107,10 @@ def main(args): params = og.GeneratorParams(model) params.input_ids = tokens params.set_search_options(max_length=max_length, min_length=max_length) + + if args.use_graph_capture: + params.try_use_cuda_graph_with_max_batch_size(batch_size) + generator = og.Generator(model, params) # Measure prompt processing @@ -199,5 +210,6 @@ def main(args): parser.add_argument('-o', '--output', type=str, default='genai_e2e', help='Output CSV file name or path (with .csv extension)') parser.add_argument('-v', '--verbose', action='store_true', help='Print extra information') parser.add_argument('-mo', '--print_model_output', action='store_true', help='Print model output') + parser.add_argument('-gc', '--use_graph_capture', action='store_true', help='Use the graph capture feature for CUDA or DML') args = parser.parse_args() main(args) diff --git a/cmake/deps.txt b/cmake/deps.txt index 99f7dc86a..c9a84529f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -12,4 +12,6 @@ # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI. pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 +microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 +directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 77967b8c8..90b9b77af 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -44,3 +44,37 @@ FetchContent_Declare( onnxruntime_fetchcontent_makeavailable(googletest) +if(USE_DML) + set(WIL_BUILD_PACKAGING OFF CACHE BOOL "" FORCE) + set(WIL_BUILD_TESTS OFF CACHE BOOL "" FORCE) + + FetchContent_Declare( + microsoft_wil + URL ${DEP_URL_microsoft_wil} + URL_HASH SHA1=${DEP_SHA1_microsoft_wil} + FIND_PACKAGE_ARGS NAMES wil + ) + + onnxruntime_fetchcontent_makeavailable(microsoft_wil) + set(WIL_TARGET "WIL::WIL") + + FetchContent_Declare( + directx_headers + URL ${DEP_URL_directx_headers} + URL_HASH SHA1=${DEP_SHA1_directx_headers} + ) + + onnxruntime_fetchcontent_makeavailable(directx_headers) + set(DIRECTX_HEADERS_TARGET "DirectX-Headers") + + include(ExternalProject) + ExternalProject_Add(nuget + PREFIX nuget + URL "https://dist.nuget.org/win-x86-commandline/v5.3.0/nuget.exe" + DOWNLOAD_NO_EXTRACT 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) +endif() \ No newline at end of file diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index ed1e02058..80da5093e 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -27,6 +27,7 @@ def main(args): input_tokens = tokenizer.encode(args.system_prompt + text) params = og.GeneratorParams(model) + params.try_use_cuda_graph_with_max_batch_size(1) params.set_search_options(do_sample=args.do_random_sampling, max_length=args.max_length, min_length=args.min_length, top_p=args.top_p, top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty) params.input_ids = input_tokens generator = og.Generator(model, params) diff 
--git a/generate_dml_shaders.bat b/generate_dml_shaders.bat new file mode 100644 index 000000000..62e298d6f --- /dev/null +++ b/generate_dml_shaders.bat @@ -0,0 +1,4 @@ +.\build\_deps\Microsoft.Direct3D.DXC.1.7.2308.12\build\native\bin\x64\dxc.exe src\models\dml\dml_shaders\dml_update_attention_mask.hlsl -E CSMain -T cs_6_2 -DT=int32_t -O3 -Qstrip_reflect -Qstrip_debug -Qstrip_rootsignature -Fh src\models\dml\generated_dml_shaders\update_mask_int32.h +.\build\_deps\Microsoft.Direct3D.DXC.1.7.2308.12\build\native\bin\x64\dxc.exe src\models\dml\dml_shaders\dml_update_attention_mask.hlsl -E CSMain -T cs_6_2 -DT=int64_t -O3 -Qstrip_reflect -Qstrip_debug -Qstrip_rootsignature -Fh src\models\dml\generated_dml_shaders\update_mask_int64.h +.\build\_deps\Microsoft.Direct3D.DXC.1.7.2308.12\build\native\bin\x64\dxc.exe src\models\dml\dml_shaders\dml_increment_values.hlsl -E CSMain -T cs_6_2 -DT=int32_t -O3 -Qstrip_reflect -Qstrip_debug -Qstrip_rootsignature -Fh src\models\dml\generated_dml_shaders\increment_values_int32.h +.\build\_deps\Microsoft.Direct3D.DXC.1.7.2308.12\build\native\bin\x64\dxc.exe src\models\dml\dml_shaders\dml_increment_values.hlsl -E CSMain -T cs_6_2 -DT=int64_t -O3 -Qstrip_reflect -Qstrip_debug -Qstrip_rootsignature -Fh src\models\dml\generated_dml_shaders\increment_values_int64.h diff --git a/nuget.config b/nuget.config new file mode 100644 index 000000000..3e0389a52 --- /dev/null +++ b/nuget.config @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/packages.config b/packages.config new file mode 100644 index 000000000..d4902c51f --- /dev/null +++ b/packages.config @@ -0,0 +1,4 @@ + + + + diff --git a/src/dml/dml_command_allocator_ring.h b/src/dml/dml_command_allocator_ring.h new file mode 100644 index 000000000..74722dc68 --- /dev/null +++ b/src/dml/dml_command_allocator_ring.h @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include "dml_gpu_event.h" + +// A fixed-size ring of command allocators. Each time an allocator is retrieved, the allocator will +// be reset if its previously recorded commands have finished executing on the GPU. +template +class DmlCommandAllocatorRing { + public: + DmlCommandAllocatorRing( + ID3D12Device* device, + D3D12_COMMAND_LIST_TYPE commandListType, + DmlGpuEvent initialEvent) { + for (auto& info : command_allocators_) { + THROW_IF_FAILED(device->CreateCommandAllocator( + commandListType, + IID_PPV_ARGS(info.allocator.ReleaseAndGetAddressOf()))); + + info.completion_event = initialEvent; + } + } + + ID3D12CommandAllocator* GetNextAllocator(DmlGpuEvent next_completion_event) { + size_t earliest_other_allocator = (current_command_allocator_ + 1) % AllocatorCount; + + assert(!command_allocators_[current_command_allocator_].completion_event.IsSignaled() || + command_allocators_[earliest_other_allocator].completion_event.IsSignaled()); + + if (command_allocators_[earliest_other_allocator].completion_event.IsSignaled()) { + THROW_IF_FAILED(command_allocators_[earliest_other_allocator].Get()->Reset()); + current_command_allocator_ = earliest_other_allocator; + } + + // Set the completion event for the current allocator so it can be reset eventually. + command_allocators_[current_command_allocator_].completion_event = next_completion_event; + + return command_allocators_[current_command_allocator_].Get(); + } + + // Updates the completion event of the current allocator to a different value. 
This is used when the caller + // decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling + // GetNextAllocator and executing the work which it recorded using the allocator it received. + void UpdateCurrentAllocatorCompletionEvent(DmlGpuEvent next_completion_event) { + command_allocators_[current_command_allocator_].completion_event = next_completion_event; + } + + private: + struct CommandAllocatorInfo { + ComPtr allocator; + + // The event which will be signaled when the last command list submitted using this allocator + // completes execution on the GPU. + DmlGpuEvent completion_event = {}; + + ID3D12CommandAllocator* Get() const { return allocator.Get(); } + }; + + std::array command_allocators_; + size_t current_command_allocator_ = 0; +}; \ No newline at end of file diff --git a/src/dml/dml_command_queue.cpp b/src/dml/dml_command_queue.cpp new file mode 100644 index 000000000..bb430c1cd --- /dev/null +++ b/src/dml/dml_command_queue.cpp @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include "dml_command_queue.h" + +DmlCommandQueue::DmlCommandQueue(ID3D12CommandQueue* existing_queue) + : queue_(existing_queue), type_(existing_queue->GetDesc().Type) { + ComPtr device; + THROW_IF_FAILED(queue_->GetDevice(IID_PPV_ARGS(device.GetAddressOf()))); + THROW_IF_FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence_.ReleaseAndGetAddressOf()))); +} + +void DmlCommandQueue::ExecuteCommandList(ID3D12CommandList* command_list) { + ExecuteCommandLists(std::span(&command_list, 1)); +} + +void DmlCommandQueue::ExecuteCommandLists(std::span command_lists) { + queue_->ExecuteCommandLists(static_cast(command_lists.size()), command_lists.data()); + + ++last_fence_value_; + THROW_IF_FAILED(queue_->Signal(fence_.Get(), last_fence_value_)); +} + +void DmlCommandQueue::Wait(ID3D12Fence* fence, uint64_t value) { + THROW_IF_FAILED(queue_->Wait(fence, value)); + + ++last_fence_value_; + THROW_IF_FAILED(queue_->Signal(fence_.Get(), last_fence_value_)); +} + +DmlGpuEvent DmlCommandQueue::GetCurrentCompletionEvent() { + return DmlGpuEvent{last_fence_value_, fence_}; +} + +DmlGpuEvent DmlCommandQueue::GetNextCompletionEvent() { + return DmlGpuEvent{last_fence_value_ + 1, fence_}; +} + +void DmlCommandQueue::QueueReference(IUnknown* object, bool wait_for_unsubmitted_work) { + // If the DmlCommandQueue is closing, then queued_references_ is being cleared -- it is not OK + // to queue additional references at this time, since those references would be leaked. This + // affects any objects in queued_references_ whose destructors indirectly call QueueReference; + // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference + // to its underlying D3D resource when freed. Furthermore, these references are unnecessary + // since Close() already blocks for scheduled GPU work before clearing queued_references_. + if (!closing_) { + QueuedReference queued_reference = {GetLastFenceValue(), object}; + + // If something has been recorded into a command list but not submitted yet, it means that the *next* fence + // value is the one to signal completion. 
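For orientation, the fence bookkeeping used by the queue can be reduced to a short sketch. This is an illustrative reduction only, not code from the change; the helper names below are invented for the example, and the rule mirrors QueueReference/ReleaseCompletedReferences: a reference queued while work is still unsubmitted must wait on the *next* fence value, and it becomes releasable once the fence has advanced past that value.

// Illustrative sketch only (names invented for the example).
struct PendingReference {
  uint64_t fence_value;                      // value at which the object is safe to release
  Microsoft::WRL::ComPtr<IUnknown> object;
};

// Work recorded but not yet submitted is covered by the *next* signal.
uint64_t FenceValueForReference(uint64_t last_signaled_value, bool has_unsubmitted_work) {
  return has_unsubmitted_work ? last_signaled_value + 1 : last_signaled_value;
}

// A queued reference may be dropped once the fence has reached its value.
bool IsReleasable(const PendingReference& ref, ID3D12Fence* fence) {
  return fence->GetCompletedValue() >= ref.fence_value;
}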
+ if (wait_for_unsubmitted_work) { + ++queued_reference.fence_value; + } + + queued_references_.push_back(queued_reference); + } +} + +void DmlCommandQueue::Close() { + // Wait for flushed work: + assert(!closing_); + closing_ = true; + DmlGpuEvent event = GetCurrentCompletionEvent(); + event.WaitForSignal(); + queued_references_.clear(); + closing_ = false; +} + +void DmlCommandQueue::ReleaseCompletedReferences() { + uint64_t completed_value = GetFence()->GetCompletedValue(); + while (!queued_references_.empty() && queued_references_.front().fence_value <= completed_value) { + queued_references_.pop_front(); + } +} \ No newline at end of file diff --git a/src/dml/dml_command_queue.h b/src/dml/dml_command_queue.h new file mode 100644 index 000000000..cbafdef99 --- /dev/null +++ b/src/dml/dml_command_queue.h @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include "../span.h" +#include "dml_gpu_event.h" + +// Manages a D3D12 command queue and provides a waitable fence which is signaled with a monotonically increasing +// value once each execute completes on the GPU. +class DmlCommandQueue { + public: + // Creates a DmlCommandQueue object that wraps an existing D3D12 queue. + DmlCommandQueue(ID3D12CommandQueue* existing_queue); + + D3D12_COMMAND_LIST_TYPE GetType() const { return type_; } + ComPtr GetFence() const { return fence_; } + uint64_t GetLastFenceValue() const { return last_fence_value_; } + + void ExecuteCommandList(ID3D12CommandList* command_list); + void ExecuteCommandLists(std::span command_lists); + + // Queues a wait to block the GPU until the specified fence is signaled to a given value. + void Wait(ID3D12Fence* fence, uint64_t value); + + // Returns an event that will become signaled when everything submitted to the queue thus far has + // completed execution on the GPU. + DmlGpuEvent GetCurrentCompletionEvent(); + + // Returns an event that will become signaled after the next ExecuteCommandLists call. + DmlGpuEvent GetNextCompletionEvent(); + + void QueueReference(IUnknown* object, bool wait_for_unsubmitted_work); + + void Close(); + void ReleaseCompletedReferences(); + + private: + struct QueuedReference { + uint64_t fence_value; + ComPtr object; + }; + + std::deque queued_references_; + + ComPtr queue_; + D3D12_COMMAND_LIST_TYPE type_; + + ComPtr fence_; + uint64_t last_fence_value_ = 0; + bool closing_ = false; +}; \ No newline at end of file diff --git a/src/dml/dml_command_recorder.cpp b/src/dml/dml_command_recorder.cpp new file mode 100644 index 000000000..3f2fa97dd --- /dev/null +++ b/src/dml/dml_command_recorder.cpp @@ -0,0 +1,281 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include +#include +#include +#include "dml_command_recorder.h" +#include "dml_command_queue.h" +#include "../models/onnxruntime_api.h" + +DmlCommandRecorder::DmlCommandRecorder( + ID3D12Device* d3d_device, + IDMLDevice* dml_device, + std::shared_ptr command_queue, + Ort::Allocator& device_allocator, + const OrtDmlApi* ort_dml_api) + : queue_(std::move(command_queue)), + d3d_device_(d3d_device), + dml_device_(dml_device), + descriptor_pool_(d3d_device, 2048), + command_allocator_ring_(d3d_device, queue_->GetType(), queue_->GetCurrentCompletionEvent()), + device_allocator_(device_allocator), + ort_dml_api_(ort_dml_api) { + THROW_IF_FAILED(dml_device->CreateOperatorInitializer(0, nullptr, IID_PPV_ARGS(&initializer_))); + THROW_IF_FAILED(dml_device->CreateCommandRecorder(IID_PPV_ARGS(&recorder_))); +} + +void DmlCommandRecorder::CopyBufferRegion( + ID3D12Resource* dst_buffer, + uint64_t dst_offset, + ID3D12Resource* src_buffer, + uint64_t src_offset, + uint64_t byte_count) { + current_command_list_->CopyBufferRegion(dst_buffer, dst_offset, src_buffer, src_offset, byte_count); + operations_recorded_in_current_command_list = true; +} + +void DmlCommandRecorder::ExecuteCommandList( + ID3D12GraphicsCommandList* command_list, + _Outptr_ ID3D12Fence** fence, + _Out_ uint64_t* completion_value) { + if (!operations_recorded_in_current_command_list) { + // The caller can re-use relevant resources after the next set of work to be + // flushed has completed. Its command list hasn't been executed yet, just batched. + DmlGpuEvent gpu_event = queue_->GetNextCompletionEvent(); + gpu_event.fence.CopyTo(fence); + *completion_value = gpu_event.fence_value; + + queue_->ExecuteCommandLists(std::span(reinterpret_cast(&command_list), 1)); + + // The fence value at which the current command allocator may be re-used will now be higher + command_allocator_ring_.UpdateCurrentAllocatorCompletionEvent(queue_->GetNextCompletionEvent()); + + // Fail early if something horrifying happens + THROW_IF_FAILED(d3d_device_->GetDeviceRemovedReason()); + + return; + } + + // Remember the descriptor heap and apply it to the next command list. This avoids unnecessarily setting it onto + // the D3D object lazily at a point when the operation may not be parallelized with GPU work. + auto heap = current_descriptor_heap_; + + // Execute work in the current command list plus provided command list while closing the recorder. 
+ CloseAndExecute(command_list); + Open(); + + // Reset the descriptor heap opportunistically per above comment + SetDescriptorHeap(heap); + + DmlGpuEvent gpu_event = queue_->GetCurrentCompletionEvent(); + gpu_event.fence.CopyTo(fence); + *completion_value = gpu_event.fence_value; +} + +ComPtr DmlCommandRecorder::GetCommandList() { + // Assume operations are added by the caller after this returns + operations_recorded_in_current_command_list = true; + return current_command_list_; +} + +void DmlCommandRecorder::ResourceBarrier(std::span barriers) { + current_command_list_->ResourceBarrier(static_cast(barriers.size()), barriers.data()); + operations_recorded_in_current_command_list = true; +} + +void DmlCommandRecorder::AddUAVBarrier() { +#pragma warning(suppress : 6387) + auto barrier = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + current_command_list_->ResourceBarrier(1, &barrier); + operations_recorded_in_current_command_list = true; +} + +void DmlCommandRecorder::Open() { + assert(current_descriptor_heap_ == nullptr); + + ID3D12CommandAllocator* allocator = command_allocator_ring_.GetNextAllocator(queue_->GetNextCompletionEvent()); + + if (!cached_command_list_) { + THROW_IF_FAILED(d3d_device_->CreateCommandList( + 0, + queue_->GetType(), + allocator, + nullptr, + IID_PPV_ARGS(current_command_list_.ReleaseAndGetAddressOf()))); + } else { + current_command_list_ = cached_command_list_; + cached_command_list_ = nullptr; + THROW_IF_FAILED(current_command_list_->Reset(allocator, nullptr)); + } +} + +void DmlCommandRecorder::CloseAndExecute() { + CloseAndExecute(nullptr); +} + +void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* command_list) { + THROW_IF_FAILED(current_command_list_->Close()); + + ID3D12GraphicsCommandList* command_lists_to_execute[2] = {}; + uint32_t command_lists_to_execute_count = 0; + + if (operations_recorded_in_current_command_list) { + command_lists_to_execute[command_lists_to_execute_count++] = current_command_list_.Get(); + } + + if (command_list) { + command_lists_to_execute[command_lists_to_execute_count++] = command_list; + } + + if (command_lists_to_execute_count > 0) { + queue_->ExecuteCommandLists(std::span(reinterpret_cast(command_lists_to_execute), command_lists_to_execute_count)); + } + + cached_command_list_ = current_command_list_; + current_command_list_ = nullptr; + operations_recorded_in_current_command_list = false; + + // The descriptor heap must be set on the command list the next time it's opened. + current_descriptor_heap_ = nullptr; + + // Fail early if something horrifying happens + THROW_IF_FAILED(d3d_device_->GetDeviceRemovedReason()); +} + +void DmlCommandRecorder::SetDescriptorHeap(ID3D12DescriptorHeap* descriptor_heap) { + if (descriptor_heap != nullptr && descriptor_heap != current_descriptor_heap_) { + current_descriptor_heap_ = descriptor_heap; + + ID3D12DescriptorHeap* descriptor_heaps[] = {descriptor_heap}; + current_command_list_->SetDescriptorHeaps(ARRAYSIZE(descriptor_heaps), descriptor_heaps); + } +} + +void DmlCommandRecorder::InitializeOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + const DML_BINDING_DESC& input_array_binding) { + // Reset the initializer to reference the input operator. 
+ IDMLCompiledOperator* ops[] = {op}; + THROW_IF_FAILED(initializer_->Reset(ARRAYSIZE(ops), ops)); + + DML_BINDING_PROPERTIES init_binding_props = initializer_->GetBindingProperties(); + + const uint32_t num_descriptors = init_binding_props.RequiredDescriptorCount; + DmlDescriptorRange descriptor_range = descriptor_pool_.AllocDescriptors( + num_descriptors, + queue_->GetNextCompletionEvent()); + + // Create a binding table for initialization. + DML_BINDING_TABLE_DESC binding_table_desc = {}; + binding_table_desc.Dispatchable = initializer_.Get(); + binding_table_desc.CPUDescriptorHandle = descriptor_range.cpuHandle; + binding_table_desc.GPUDescriptorHandle = descriptor_range.gpuHandle; + binding_table_desc.SizeInDescriptors = num_descriptors; + + ComPtr binding_table; + THROW_IF_FAILED(dml_device_->CreateBindingTable(&binding_table_desc, IID_PPV_ARGS(&binding_table))); + + // Create a temporary resource for initializing the op, if it's required. + uint64_t temporary_resource_size = init_binding_props.TemporaryResourceSize; + if (temporary_resource_size > 0) { + // Allocate and immediately free a temporary buffer. The buffer resource will still be + // alive (managed by the pool); freeing allows the resource to be shared with other operators. + std::array temporary_resource_shape = {static_cast(temporary_resource_size)}; + + ComPtr buffer; + auto temp_resource = OrtValue::CreateTensor(device_allocator_, temporary_resource_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8); + Ort::ThrowOnError(ort_dml_api_->GetD3D12ResourceFromAllocation(&device_allocator_, temp_resource->GetTensorMutableRawData(), &buffer)); + + // Bind the temporary resource. + DML_BUFFER_BINDING buffer_binding = {buffer.Get(), 0, temporary_resource_size}; + DML_BINDING_DESC binding_desc = {DML_BINDING_TYPE_BUFFER, &buffer_binding}; + binding_table->BindTemporaryResource(&binding_desc); + } + + // Bind inputs, if provided. + if (input_array_binding.Type != DML_BINDING_TYPE_NONE) { + // An operator with inputs to bind MUST use a BUFFER_ARRAY. + assert(input_array_binding.Type == DML_BINDING_TYPE_BUFFER_ARRAY); + binding_table->BindInputs(1, &input_array_binding); + } + + // Bind the persistent resource, which is an output of initialization. + if (persistent_resource_binding.Type != DML_BINDING_TYPE_NONE) { + // Persistent resources MUST be bound as buffers. + assert(persistent_resource_binding.Type == DML_BINDING_TYPE_BUFFER); + binding_table->BindOutputs(1, &persistent_resource_binding); + } + + // Record the initialization work. + SetDescriptorHeap(descriptor_range.heap); + recorder_->RecordDispatch(current_command_list_.Get(), initializer_.Get(), binding_table.Get()); + operations_recorded_in_current_command_list = true; + + // Barrier if there's an output (i.e. persistent resource), or if any temps are used. + if ((persistent_resource_binding.Type != DML_BINDING_TYPE_NONE) || + (temporary_resource_size > 0)) { + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + current_command_list_->ResourceBarrier(1, &uav); + } +} + +void DmlCommandRecorder::ExecuteOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + std::span input_bindings, + std::span output_bindings) { + DML_BINDING_PROPERTIES exec_binding_props = op->GetBindingProperties(); + + const uint32_t num_descriptors = exec_binding_props.RequiredDescriptorCount; + DmlDescriptorRange descriptor_range = descriptor_pool_.AllocDescriptors( + num_descriptors, + queue_->GetNextCompletionEvent()); + + // Create a binding table for execution. 
+ DML_BINDING_TABLE_DESC binding_table_desc = {}; + binding_table_desc.Dispatchable = op; + binding_table_desc.CPUDescriptorHandle = descriptor_range.cpuHandle; + binding_table_desc.GPUDescriptorHandle = descriptor_range.gpuHandle; + binding_table_desc.SizeInDescriptors = num_descriptors; + + ComPtr binding_table; + THROW_IF_FAILED(dml_device_->CreateBindingTable(&binding_table_desc, IID_PPV_ARGS(&binding_table))); + + // Create a temporary resource for executing the op, if it's required. + uint64_t temporary_resource_size = exec_binding_props.TemporaryResourceSize; + if (temporary_resource_size > 0) { + // Allocate and immediately free a temporary buffer. The buffer resource will still be + // alive (managed by the pool); freeing allows the resource to be shared with other operators. + std::array temporary_resource_shape = {static_cast(temporary_resource_size)}; + + ComPtr buffer; + auto temp_resource = OrtValue::CreateTensor(device_allocator_, temporary_resource_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8); + Ort::ThrowOnError(ort_dml_api_->GetD3D12ResourceFromAllocation(&device_allocator_, temp_resource->GetTensorMutableRawData(), &buffer)); + + // Bind the temporary resource. + DML_BUFFER_BINDING buffer_binding = {buffer.Get(), 0, temporary_resource_size}; + DML_BINDING_DESC binding_desc = {DML_BINDING_TYPE_BUFFER, &buffer_binding}; + binding_table->BindTemporaryResource(&binding_desc); + } + + if (persistent_resource_binding.Type != DML_BINDING_TYPE_NONE) { + binding_table->BindPersistentResource(&persistent_resource_binding); + } + + binding_table->BindInputs(static_cast(input_bindings.size()), input_bindings.data()); + binding_table->BindOutputs(static_cast(output_bindings.size()), output_bindings.data()); + + // Record the execution work. + SetDescriptorHeap(descriptor_range.heap); + recorder_->RecordDispatch(current_command_list_.Get(), op, binding_table.Get()); + operations_recorded_in_current_command_list = true; + +// Barrier all outputs. +#pragma warning(push) +#pragma warning(disable : 6387) + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + current_command_list_->ResourceBarrier(1, &uav); +#pragma warning(pop) +} \ No newline at end of file diff --git a/src/dml/dml_command_recorder.h b/src/dml/dml_command_recorder.h new file mode 100644 index 000000000..656d04f38 --- /dev/null +++ b/src/dml/dml_command_recorder.h @@ -0,0 +1,98 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include "../span.h" +#include "dml_command_allocator_ring.h" +#include "dml_descriptor_pool.h" +#include "dml_command_queue.h" +#include "dml_descriptor_pool.h" +#include "dml_provider_factory.h" + +struct OrtDmlApi; + +namespace Ort { +struct Allocator; +} + +class DmlCommandRecorder { + public: + DmlCommandRecorder( + ID3D12Device* d3d_device, + IDMLDevice* dml_device, + std::shared_ptr command_queue, + Ort::Allocator& device_allocator, + const OrtDmlApi* ort_dml_api); + + void InitializeOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + const DML_BINDING_DESC& input_array_binding); + + void ExecuteOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + std::span input_bindings, + std::span output_bindings); + + void CopyBufferRegion( + ID3D12Resource* dst_buffer, + uint64_t dst_offset, + ID3D12Resource* src_buffer, + uint64_t src_offset, + uint64_t byte_count); + + void ExecuteCommandList( + ID3D12GraphicsCommandList* command_list, + _Outptr_ ID3D12Fence** fence, + _Out_ uint64_t* completion_value); + + ComPtr GetCommandList(); + + void ResourceBarrier(std::span barriers); + void AddUAVBarrier(); + + void Open(); + void CloseAndExecute(); + + bool HasUnsubmittedWork() { + return operations_recorded_in_current_command_list; + } + + // Forces the descriptor heap to be reset to D3D before executing future operations + void InvalidateDescriptorHeap() { + current_descriptor_heap_ = nullptr; + } + + private: + void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* command_list); + + std::shared_ptr queue_; + ComPtr d3d_device_; + Microsoft::WRL::ComPtr dml_device_; + Microsoft::WRL::ComPtr initializer_; + Microsoft::WRL::ComPtr recorder_; + + // Descriptors are allocated from a pool. The current heap pointer is only used to avoid redundantly + // setting the same heap; it does not have ownership of the heap object. + DescriptorPool descriptor_pool_; + ID3D12DescriptorHeap* current_descriptor_heap_ = nullptr; + + DmlCommandAllocatorRing<2> command_allocator_ring_; + + // The command list currently being recorded into, and whether any command have been recorded yet. + ComPtr current_command_list_; + bool operations_recorded_in_current_command_list = false; + + // A cached command list which may be re-used. + ComPtr cached_command_list_; + + Ort::Allocator& device_allocator_; + const OrtDmlApi* ort_dml_api_; + + void SetDescriptorHeap(ID3D12DescriptorHeap* descriptor_heap); +}; \ No newline at end of file diff --git a/src/dml/dml_descriptor_pool.cpp b/src/dml/dml_descriptor_pool.cpp new file mode 100644 index 000000000..ff88752e1 --- /dev/null +++ b/src/dml/dml_descriptor_pool.cpp @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
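As a quick orientation to the recorder interface above, the typical cycle is open, record, submit. This is a minimal usage sketch under the assumption that a DmlCommandRecorder named recorder and valid operator bindings already exist; in this change the DmlExecutionContext drives the recorder rather than callers doing so directly.

// Minimal usage sketch (assumed names; DmlExecutionContext normally drives this).
recorder.Open();                                            // acquire an allocator, reset the command list
recorder.ExecuteOperator(op, persistent_binding,
                         input_bindings, output_bindings);  // record a DML dispatch
recorder.CloseAndExecute();                                 // close the list and submit it to the wrapped queue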
+ +#include +#include +#include +#include "dml_descriptor_pool.h" + +DmlDescriptorHeap::DmlDescriptorHeap(ID3D12DescriptorHeap* heap) : heap_(heap), + capacity_(heap->GetDesc().NumDescriptors), + head_cpu_handle_(heap->GetCPUDescriptorHandleForHeapStart()), + head_gpu_handle_(heap->GetGPUDescriptorHandleForHeapStart()), + heap_flags_(heap->GetDesc().Flags) { + ComPtr device; + THROW_IF_FAILED(heap->GetDevice(IID_PPV_ARGS(device.GetAddressOf()))); + handle_increment_size_ = device->GetDescriptorHandleIncrementSize(heap->GetDesc().Type); +} + +std::optional DmlDescriptorHeap::TryAllocDescriptors( + uint32_t num_descriptors, + DmlGpuEvent completion_event, + D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags) { + // Bail if the desired heap creation flags are incompatible with the existing heap. + if (heap_flags_ != heap_flags) { + return std::nullopt; + } + + if ((completion_event_.fence != nullptr) && (completion_event_.IsSignaled())) { + // This class always allocates descriptors from the end of the heap. + // If the most recent completion event is signaled, then all previous + // allocations have completed; the entire capacity is available to use. + size_ = 0; + head_cpu_handle_ = heap_->GetCPUDescriptorHandleForHeapStart(); + head_gpu_handle_ = heap_->GetGPUDescriptorHandleForHeapStart(); + } + + // The caller will need to create a new heap if there is no space left in this one. + uint32_t space_remaining = capacity_ - size_; + if (space_remaining < num_descriptors) { + return std::nullopt; + } + + DmlDescriptorRange range = {heap_.Get(), head_cpu_handle_, head_gpu_handle_}; + + size_ += num_descriptors; + completion_event_ = completion_event; + head_cpu_handle_.Offset(num_descriptors, handle_increment_size_); + head_gpu_handle_.Offset(num_descriptors, handle_increment_size_); + + return range; +} + +DescriptorPool::DescriptorPool(ID3D12Device* device, uint32_t initial_capacity) : device_(device), + initial_heap_capacity_(initial_capacity) { + CreateHeap(initial_capacity, D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE); +} + +DmlDescriptorRange DescriptorPool::AllocDescriptors( + uint32_t num_descriptors, + DmlGpuEvent completion_event, + D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags) { + // Attempt to allocate from an existing heap. + for (DmlDescriptorHeap& heap : heaps_) { + auto descriptor_range = heap.TryAllocDescriptors(num_descriptors, completion_event, heap_flags); + if (descriptor_range.has_value()) { + return descriptor_range.value(); + } + } + + // A new descriptor heap must be created. + uint32_t new_heap_capacity = std::max(num_descriptors, initial_heap_capacity_); + CreateHeap(new_heap_capacity, heap_flags); + auto descriptor_range = heaps_.back().TryAllocDescriptors(num_descriptors, completion_event, heap_flags); + assert(descriptor_range.has_value()); + return descriptor_range.value(); +} + +void DescriptorPool::Trim() { + // Remove any heaps that are not pending execution. + auto it = std::remove_if(heaps_.begin(), heaps_.end(), [](const DmlDescriptorHeap& heap) { + auto completion_event = heap.GetLastCompletionEvent(); + return !completion_event.fence || completion_event.IsSignaled(); + }); + + heaps_.erase(it, heaps_.end()); +} + +void DescriptorPool::CreateHeap(uint32_t num_descriptors, D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags) { + // This pool only manages CBV/SRV/UAV descriptors. 
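To make the pool's recycling rule concrete: a caller reserves descriptors against the event that will be signaled when the recorded work finishes, so the heap space can be reused once that event is signaled. The snippet below is an illustrative call pattern with assumed variable names, condensed from how the command recorder in this change uses the pool.

// Illustrative allocation pattern (names assumed).
DmlDescriptorRange range = descriptor_pool.AllocDescriptors(
    binding_props.RequiredDescriptorCount,
    queue->GetNextCompletionEvent());          // space is recycled once this event signals
binding_table_desc.CPUDescriptorHandle = range.cpuHandle;
binding_table_desc.GPUDescriptorHandle = range.gpuHandle;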
+ D3D12_DESCRIPTOR_HEAP_DESC desc = {}; + desc.Flags = heap_flags; + desc.NumDescriptors = num_descriptors; + desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + + ComPtr heap; + THROW_IF_FAILED(device_->CreateDescriptorHeap(&desc, IID_PPV_ARGS(heap.GetAddressOf()))); + + heaps_.push_back(DmlDescriptorHeap{heap.Get()}); +} + +uint32_t DescriptorPool::GetTotalCapacity() const { + uint32_t capacity = 0; + + for (auto& heap : heaps_) { + capacity += heap.GetCapacity(); + } + + return capacity; +} \ No newline at end of file diff --git a/src/dml/dml_descriptor_pool.h b/src/dml/dml_descriptor_pool.h new file mode 100644 index 000000000..e297a034e --- /dev/null +++ b/src/dml/dml_descriptor_pool.h @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include "dml_gpu_event.h" + +// A contiguous range of descriptors. +struct DmlDescriptorRange { + ID3D12DescriptorHeap* heap; + D3D12_CPU_DESCRIPTOR_HANDLE cpuHandle; + D3D12_GPU_DESCRIPTOR_HANDLE gpuHandle; +}; + +// Wraps an ID3D12DescriptorHeap to allocate descriptor ranges. +class DmlDescriptorHeap { + public: + // Wraps an existing heap. + explicit DmlDescriptorHeap(ID3D12DescriptorHeap* heap); + + // Reserves descriptors from the end of the heap. Returns nullopt if there is + // no space left in the heap. + std::optional TryAllocDescriptors( + uint32_t num_descriptors, + DmlGpuEvent completion_event, + D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE); + + DmlGpuEvent GetLastCompletionEvent() const { + return completion_event_; + } + + uint32_t GetCapacity() const { + return capacity_; + } + + private: + ComPtr heap_; + uint32_t capacity_ = 0; + uint32_t size_ = 0; + uint32_t handle_increment_size_ = 0; + CD3DX12_CPU_DESCRIPTOR_HANDLE head_cpu_handle_; + CD3DX12_GPU_DESCRIPTOR_HANDLE head_gpu_handle_; + D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags_ = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; + + // Most recent GPU completion event. Allocations are always done at the end, + // so there is no fragmentation of the heap. + DmlGpuEvent completion_event_; +}; + +// Manages a pool of CBV/SRV/UAV descriptors. +class DescriptorPool { + public: + DescriptorPool(ID3D12Device* device, uint32_t initial_capacity); + + // Reserves a contiguous range of descriptors from a single descriptor heap. The + // lifetime of the referenced descriptor heap is managed by the DescriptorPool class. + // The caller must supply a DmlGpuEvent that informs the pool when the reserved descriptors + // are no longer required. + DmlDescriptorRange AllocDescriptors( + uint32_t num_descriptors, + DmlGpuEvent completion_event, + D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE); + + // Releases all descriptor heaps that contain only descriptors which have completed + // their work on the GPU. + void Trim(); + + // Returns the total capacity of all heaps. + uint32_t GetTotalCapacity() const; + + private: + ComPtr device_; + std::vector heaps_; + const uint32_t initial_heap_capacity_; + + void CreateHeap(uint32_t num_descriptors, D3D12_DESCRIPTOR_HEAP_FLAGS heap_flags); +}; \ No newline at end of file diff --git a/src/dml/dml_execution_context.cpp b/src/dml/dml_execution_context.cpp new file mode 100644 index 000000000..05d8b75fc --- /dev/null +++ b/src/dml/dml_execution_context.cpp @@ -0,0 +1,173 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include +#include +#include "dml_execution_context.h" +#include "dml_command_queue.h" + +DmlExecutionContext::DmlExecutionContext( + ID3D12Device* d3d12_device, + IDMLDevice* dml_device, + ID3D12CommandQueue* queue, + Ort::Allocator& device_allocator, + const OrtDmlApi* ort_dml_api) + : queue_(std::make_shared(queue)), dml_recorder_(d3d12_device, dml_device, queue_, device_allocator, ort_dml_api) { +} + +void DmlExecutionContext::CopyBufferRegion( + ID3D12Resource* dst_buffer, + uint64_t dst_offset, + D3D12_RESOURCE_STATES dst_state, + ID3D12Resource* src_buffer, + uint64_t src_offset, + D3D12_RESOURCE_STATES src_state, + uint64_t byte_count) { + assert(!closed_); + + SetCommandRecorder(&dml_recorder_); + + std::vector barriers; + + if (!(dst_state & D3D12_RESOURCE_STATE_COPY_DEST)) { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dst_buffer, dst_state, D3D12_RESOURCE_STATE_COPY_DEST)); + } + if (!(src_state & D3D12_RESOURCE_STATE_COPY_SOURCE)) { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(src_buffer, src_state, D3D12_RESOURCE_STATE_COPY_SOURCE)); + } + + if (!barriers.empty()) { + dml_recorder_.ResourceBarrier(barriers); + } + + dml_recorder_.CopyBufferRegion(dst_buffer, dst_offset, src_buffer, src_offset, byte_count); + + // Reset barrier state + if (!barriers.empty()) { + for (auto& barrier : barriers) { + std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); + } + + dml_recorder_.ResourceBarrier(barriers); + } +} + +void DmlExecutionContext::InitializeOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + const DML_BINDING_DESC& input_array_binding) { + assert(!closed_); + SetCommandRecorder(&dml_recorder_); + + dml_recorder_.InitializeOperator(op, persistent_resource_binding, input_array_binding); +} + +void DmlExecutionContext::ExecuteCommandList( + ID3D12GraphicsCommandList* command_list, + _Outptr_ ID3D12Fence** fence, + _Out_ uint64_t* completion_value) { + assert(!closed_); + + SetCommandRecorder(&dml_recorder_); + dml_recorder_.ExecuteCommandList(command_list, fence, completion_value); +} + +void DmlExecutionContext::AddUAVBarrier() { + assert(!closed_); + SetCommandRecorder(&dml_recorder_); + + dml_recorder_.AddUAVBarrier(); +} + +void DmlExecutionContext::ResourceBarrier(std::span barriers) { + assert(!closed_); + SetCommandRecorder(&dml_recorder_); + + dml_recorder_.ResourceBarrier(barriers); +} + +void DmlExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** command_list) { + assert(!closed_); + SetCommandRecorder(&dml_recorder_); + + // Ensure the descriptor heap is reset to D3D as something external may change it before recording + dml_recorder_.InvalidateDescriptorHeap(); + + dml_recorder_.GetCommandList().CopyTo(command_list); +} + +void DmlExecutionContext::SetCommandRecorder(DmlCommandRecorder* new_recorder) { + assert(!closed_); + + // If changing which recorder is the current one, we need to flush the old one first. This is to ensure correct + // ordering of operations on the command queue. 
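The execution context batches work until it is flushed, so a caller that needs results on the CPU must flush and then wait on the completion event. The sketch below is a minimal usage example built only from the methods declared in this change (CopyBufferRegion, Flush, GetCurrentCompletionEvent, ReleaseCompletedReferences); the function name and resource states are assumptions for illustration.

// Minimal usage sketch (assumed function name and resource states).
void CopyAndWait(DmlExecutionContext& context, ID3D12Resource* dst, ID3D12Resource* src, uint64_t byte_count) {
  context.CopyBufferRegion(dst, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
                           src, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, byte_count);
  context.Flush();                                       // submit the batched work to the queue
  context.GetCurrentCompletionEvent().WaitForSignal();   // block until the GPU reaches the fence value
  context.ReleaseCompletedReferences();                  // drop references whose fence values are done
}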
+ if (current_recorder_ != new_recorder) { + Flush(); + current_recorder_ = new_recorder; + + if (current_recorder_ != nullptr) { + current_recorder_->Open(); + } + } +} + +void DmlExecutionContext::Flush() { + assert(!closed_); + + if (!current_recorder_ || !current_recorder_->HasUnsubmittedWork()) { + // Nothing to flush + return; + } + + current_recorder_->CloseAndExecute(); + ReleaseCompletedReferences(); + + // Pre-emptively set the DML command recorder. It's the only command recorder right now, + // and doing this here causes work and allocations resetting the command list to occur at + // a point where it's going to be parallelized with GPU work. + current_recorder_ = nullptr; + SetCommandRecorder(&dml_recorder_); +} + +void DmlExecutionContext::QueueReference(IUnknown* object) { + assert(!closed_); + // If something has been recorded into a command list but not submitted yet, it means that the *next* fence + // value is the one to signal completion. + bool wait_for_unsubmitted_work = (current_recorder_ != nullptr); + queue_->QueueReference(object, wait_for_unsubmitted_work); +} + +void DmlExecutionContext::Close() { + assert(!closed_); + + // Discard unflushed work and clear queued references. This prevents the circular reference: + // Kernel --> ProviderImpl --> Context --> QueuedRefs --> Kernel + queue_->Close(); + current_recorder_ = nullptr; + closed_ = true; +} + +DmlGpuEvent DmlExecutionContext::GetCurrentCompletionEvent() { + assert(!closed_); + + DmlGpuEvent event = queue_->GetCurrentCompletionEvent(); + + // If something has been recorded into a command list but not submitted yet, it means that the *next* fence + // value is the one to signal completion. + const bool unflushed_work_exists = (current_recorder_ != nullptr) && current_recorder_->HasUnsubmittedWork(); + if (unflushed_work_exists) { + ++event.fence_value; + } + + return event; +} + +void DmlExecutionContext::ReleaseCompletedReferences() { + assert(!closed_); + queue_->ReleaseCompletedReferences(); +} + +D3D12_COMMAND_LIST_TYPE DmlExecutionContext::GetCommandListTypeForQueue() const { + assert(!closed_); + return queue_->GetType(); +} \ No newline at end of file diff --git a/src/dml/dml_execution_context.h b/src/dml/dml_execution_context.h new file mode 100644 index 000000000..363e9f1bc --- /dev/null +++ b/src/dml/dml_execution_context.h @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include "dml_command_recorder.h" +#include "dml_gpu_event.h" +#include "../models/onnxruntime_api.h" + +// Asynchronously performs GPU work, and automatically manages command list recording and submission to queues. +// Work submitted to the DmlExecutionContext is typically recorded onto a command list and may not immediately begin +// execution on the GPU. Call Flush() to force all recorded work to be submitted to the command queue for execution +// on the GPU. +class DmlExecutionContext { + public: + // Constructs an DmlExecutionContext that executes on the supplied queue. + DmlExecutionContext( + ID3D12Device* d3d12_device, + IDMLDevice* dml_device, + ID3D12CommandQueue* queue, + Ort::Allocator& device_allocator, + const OrtDmlApi* ort_dml_api); + + // Waits for flushed work, discards unflushed work, and discards associated references to + // prevent circular references. Must be the last call on the object before destruction. 
+ void Close(); + + // Queues a CopyBufferRegion (see ID3D12GraphicsCommandList::CopyBufferRegion) for execution. Transition + // barriers are automatically inserted to transition the source and destination resources to COPY_SOURCE and + // COPY_DEST if necessary. + void CopyBufferRegion( + ID3D12Resource* dst_buffer, + uint64_t dst_offset, + D3D12_RESOURCE_STATES dst_state, + ID3D12Resource* src_buffer, + uint64_t src_offset, + D3D12_RESOURCE_STATES src_state, + uint64_t byte_count); + + void InitializeOperator( + IDMLCompiledOperator* op, + const DML_BINDING_DESC& persistent_resource_binding, + const DML_BINDING_DESC& input_array_binding); + + void ExecuteCommandList( + ID3D12GraphicsCommandList* command_list, + _Outptr_ ID3D12Fence** fence, + _Out_ uint64_t* completion_value); + + void AddUAVBarrier(); + void ResourceBarrier(std::span barriers); + + void GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** command_list); + + // Forces all queued work to begin executing on the GPU. This method returns immediately and does not wait + // for the submitted work to complete execution on the GPU. + void Flush(); + + // Returns an event which will become signaled when everything submitted to the execution context thus far has + // completed execution on the GPU, including work that has yet to be flushed to the queue. + DmlGpuEvent GetCurrentCompletionEvent(); + + // Adds a reference which will be released when queued GPU work is completed + void QueueReference(IUnknown* object); + + // Release any accumulated references who corresponding GPU fence values have + // been reached. + void ReleaseCompletedReferences(); + + D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; + + private: + void SetCommandRecorder(DmlCommandRecorder* new_recorder); + + std::shared_ptr queue_; + + DmlCommandRecorder* current_recorder_ = nullptr; + + // Up to one of these is active at a time + DmlCommandRecorder dml_recorder_; + + bool closed_ = false; +}; \ No newline at end of file diff --git a/src/dml/dml_gpu_event.h b/src/dml/dml_gpu_event.h new file mode 100644 index 000000000..7ca873c75 --- /dev/null +++ b/src/dml/dml_gpu_event.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +using Microsoft::WRL::ComPtr; + +// Represents a fence which will be signaled at some point (usually by the GPU). +struct DmlGpuEvent { + uint64_t fence_value; + ComPtr fence; + + bool IsSignaled() const { + return fence->GetCompletedValue() >= fence_value; + } + + // Blocks until IsSignaled returns true. 
+ void WaitForSignal() const { + if (IsSignaled()) { + return; // early-out + } + + while (!IsSignaled()) { +#if defined(_M_AMD64) || defined(__x86_64__) + _mm_pause(); +#endif + } + } +}; \ No newline at end of file diff --git a/src/dml/dml_helpers.cpp b/src/dml/dml_helpers.cpp new file mode 100644 index 000000000..9741438a2 --- /dev/null +++ b/src/dml/dml_helpers.cpp @@ -0,0 +1,333 @@ +#pragma once + +#include +#include +#include "dml_helpers.h" + +namespace DmlHelpers { + +DmlObjects CreateDmlObjects() { + D3D12_COMMAND_QUEUE_DESC command_queue_description = { + D3D12_COMMAND_LIST_TYPE_COMPUTE, + 0, + D3D12_COMMAND_QUEUE_FLAG_NONE, + 0, + }; + + DmlObjects dml_objects; + + THROW_IF_FAILED(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); + THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandQueue(&command_queue_description, IID_PPV_ARGS(&dml_objects.command_queue))); + THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&dml_objects.command_allocator))); + THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, dml_objects.command_allocator.Get(), nullptr, IID_PPV_ARGS(&dml_objects.command_list))); + return dml_objects; +} + +DmlReusedCommandListState BuildReusableCommandList( + IDMLDevice* dml_device, + IDMLCompiledOperator* compiled_operator, + ID3D12Resource* persistent_resource, + std::optional persistent_resource_binding) { + DmlReusedCommandListState command_list_state{}; + + DML_BINDING_PROPERTIES exec_binding_props = compiled_operator->GetBindingProperties(); + + D3D12_DESCRIPTOR_HEAP_DESC desc = {}; + desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + desc.NumDescriptors = exec_binding_props.RequiredDescriptorCount; + desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + + ComPtr d3d_device; + THROW_IF_FAILED(dml_device->GetParentDevice(IID_PPV_ARGS(&d3d_device))); + + THROW_IF_FAILED(d3d_device->CreateDescriptorHeap(&desc, IID_PPV_ARGS(command_list_state.heap.ReleaseAndGetAddressOf()))); + + // Create a binding table for execution. + DML_BINDING_TABLE_DESC binding_table_desc = {}; + binding_table_desc.Dispatchable = compiled_operator; + binding_table_desc.CPUDescriptorHandle = command_list_state.heap->GetCPUDescriptorHandleForHeapStart(); + binding_table_desc.GPUDescriptorHandle = command_list_state.heap->GetGPUDescriptorHandleForHeapStart(); + binding_table_desc.SizeInDescriptors = exec_binding_props.RequiredDescriptorCount; + + THROW_IF_FAILED(dml_device->CreateBindingTable(&binding_table_desc, IID_PPV_ARGS(&command_list_state.binding_table))); + + THROW_IF_FAILED(d3d_device->CreateCommandAllocator( + D3D12_COMMAND_LIST_TYPE_COMPUTE, + IID_PPV_ARGS(command_list_state.command_allocator.ReleaseAndGetAddressOf()))); + + THROW_IF_FAILED(d3d_device->CreateCommandList( + 0, + D3D12_COMMAND_LIST_TYPE_COMPUTE, + command_list_state.command_allocator.Get(), + nullptr, + IID_PPV_ARGS(command_list_state.graphics_command_list.ReleaseAndGetAddressOf()))); + + if (persistent_resource) { + DML_BINDING_DESC persistent_resource_binding_desc = {DML_BINDING_TYPE_BUFFER, persistent_resource_binding ? 
&*persistent_resource_binding : nullptr}; + command_list_state.binding_table->BindPersistentResource(&persistent_resource_binding_desc); + command_list_state.persistent_resource = persistent_resource; + } + + ID3D12DescriptorHeap* descriptor_heaps[] = {command_list_state.heap.Get()}; + command_list_state.graphics_command_list->SetDescriptorHeaps(ARRAYSIZE(descriptor_heaps), descriptor_heaps); + + ComPtr recorder; + THROW_IF_FAILED(dml_device->CreateCommandRecorder(IID_PPV_ARGS(recorder.GetAddressOf()))); + + recorder->RecordDispatch(command_list_state.graphics_command_list.Get(), compiled_operator, command_list_state.binding_table.Get()); + command_list_state.compiled_operator = compiled_operator; + + THROW_IF_FAILED(command_list_state.graphics_command_list->Close()); + + return command_list_state; +} + +void ExecuteReusableCommandList( + DmlExecutionContext* execution_context, + DmlReusedCommandListState& command_list_state, + OrtAllocator& allocator, + const OrtDmlApi* ort_dml_api, + std::span input_resources, + std::span input_sizes, + std::span output_resources, + std::span output_sizes, + bool bindings_changed) { + assert(input_resources.size() == input_sizes.size()); + assert(output_resources.size() == output_sizes.size()); + + DML_BINDING_PROPERTIES exec_binding_props = command_list_state.compiled_operator->GetBindingProperties(); + + std::vector input_bindings(input_resources.size()); + std::vector input_binding_descs(output_resources.size()); + + std::vector output_bindings(output_resources.size()); + std::vector output_binding_descs(output_resources.size()); + + if (bindings_changed) { + // Bind the inputs + for (uint32_t i = 0; i < input_bindings.size(); ++i) { + input_bindings[i].Buffer = input_resources[i]; + input_bindings[i].SizeInBytes = input_sizes[i]; + input_binding_descs[i] = {DML_BINDING_TYPE_BUFFER, &input_bindings[i]}; + } + + command_list_state.binding_table->BindInputs(static_cast(input_binding_descs.size()), input_binding_descs.data()); + + // Bind the outputs + for (uint32_t i = 0; i < output_bindings.size(); ++i) { + output_bindings[i].Buffer = output_resources[i]; + output_bindings[i].SizeInBytes = output_sizes[i]; + output_binding_descs[i] = {DML_BINDING_TYPE_BUFFER, &output_bindings[i]}; + } + + command_list_state.binding_table->BindOutputs(static_cast(output_binding_descs.size()), output_binding_descs.data()); + + // Create the temporary resource + if (exec_binding_props.TemporaryResourceSize > 0) { + ComPtr temporary_resource; + std::array persistent_resource_shape = {static_cast(exec_binding_props.TemporaryResourceSize)}; + auto persistent_tensor = OrtValue::CreateTensor(allocator, persistent_resource_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8); + Ort::ThrowOnError(ort_dml_api->GetD3D12ResourceFromAllocation(&allocator, persistent_tensor->GetTensorMutableRawData(), &temporary_resource)); + } + } + + // Execute the command list and if it succeeds, update the fence value at which this command may be + // re-used. 
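The reusable command list is intended to be built once per compiled operator and then replayed every step, with bindings refreshed only when the bound resources actually change. A call-pattern sketch, with assumed variable names and loop structure, follows; only the two helper functions themselves come from this change.

// Illustrative call pattern only (variable names and loop are assumed).
DmlReusedCommandListState state = DmlHelpers::BuildReusableCommandList(
    dml_device, compiled_op.Get(), persistent_resource.Get(), persistent_binding);

for (int step = 0; step < token_steps; ++step) {
  const bool bindings_changed = (step == 0);  // resources assumed stable after the first step
  DmlHelpers::ExecuteReusableCommandList(
      execution_context, state, allocator, ort_dml_api,
      input_resources, input_sizes, output_resources, output_sizes, bindings_changed);
}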
+ ComPtr fence; + uint64_t completion_value; + execution_context->ExecuteCommandList(command_list_state.graphics_command_list.Get(), fence.GetAddressOf(), &completion_value); +} + +static uint64_t DataTypeSizeInBytes(DML_TENSOR_DATA_TYPE dml_data_type) { + switch (dml_data_type) { + case DML_TENSOR_DATA_TYPE_FLOAT16: + return sizeof(Ort::Float16_t); + case DML_TENSOR_DATA_TYPE_FLOAT32: + return sizeof(float); + case DML_TENSOR_DATA_TYPE_FLOAT64: + return sizeof(double); + case DML_TENSOR_DATA_TYPE_UINT8: + return sizeof(uint8_t); + case DML_TENSOR_DATA_TYPE_UINT16: + return sizeof(uint16_t); + case DML_TENSOR_DATA_TYPE_UINT32: + return sizeof(uint32_t); + case DML_TENSOR_DATA_TYPE_UINT64: + return sizeof(uint64_t); + case DML_TENSOR_DATA_TYPE_INT8: + return sizeof(int8_t); + case DML_TENSOR_DATA_TYPE_INT16: + return sizeof(int16_t); + case DML_TENSOR_DATA_TYPE_INT32: + return sizeof(int32_t); + case DML_TENSOR_DATA_TYPE_INT64: + return sizeof(int64_t); + default: + THROW_HR(E_NOTIMPL); + } +} + +ComPtr CreateCastOperator( + IDMLDevice* dml_device, + uint32_t num_elements, + DML_TENSOR_DATA_TYPE source_data_type, + DML_TENSOR_DATA_TYPE target_data_type) { + // Create the input tensor desc + DML_BUFFER_TENSOR_DESC input_buffer_desc{}; + input_buffer_desc.Sizes = &num_elements; + input_buffer_desc.DimensionCount = 1; + input_buffer_desc.DataType = source_data_type; + input_buffer_desc.TotalTensorSizeInBytes = num_elements * DataTypeSizeInBytes(source_data_type); + DML_TENSOR_DESC input_tensor_desc = {DML_TENSOR_TYPE_BUFFER, &input_buffer_desc}; + + // Create the output tensor desc + DML_BUFFER_TENSOR_DESC output_buffer_desc{}; + output_buffer_desc.Sizes = &num_elements; + output_buffer_desc.DimensionCount = 1; + output_buffer_desc.DataType = target_data_type; + output_buffer_desc.TotalTensorSizeInBytes = num_elements * DataTypeSizeInBytes(target_data_type); + DML_TENSOR_DESC output_tensor_desc = {DML_TENSOR_TYPE_BUFFER, &output_buffer_desc}; + + DML_CAST_OPERATOR_DESC cast_op_desc{}; + cast_op_desc.InputTensor = &input_tensor_desc; + cast_op_desc.OutputTensor = &output_tensor_desc; + DML_OPERATOR_DESC cast_op_dml_desc = {DML_OPERATOR_CAST, &cast_op_desc}; + + ComPtr cast_op; + THROW_IF_FAILED(dml_device->CreateOperator(&cast_op_dml_desc, IID_PPV_ARGS(&cast_op))); + + ComPtr compiled_cast_op; + THROW_IF_FAILED(dml_device->CompileOperator(cast_op.Get(), DML_EXECUTION_FLAG_DESCRIPTORS_VOLATILE, IID_PPV_ARGS(&compiled_cast_op))); + + return compiled_cast_op; +} + +void GetNextDispatchSize( + uint32_t element_count, + uint32_t num_threads, + uint32_t& dispatch, + uint32_t& pending_element_count) { + // Max threads per workgroup is 2^10 (1024). Max dispatch per dimension is 2^16. Taken together, we can dispatch a maximum of + // 2^26 (268,435,456) threads along a single dimension. This should suffice for a majority of the workload. Therefore, even + // though it is possible to dispatch up to (2^16)^3 workgroups simultaneously, we stick to the simpler 1D dispatch alternative. 
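To make the dispatch arithmetic concrete: D3D12 allows at most 1,024 threads per group and 65,535 groups per dispatch dimension, so a single 1-D dispatch covers at most 1,024 x 65,535 (about 6.7e7) threads. A worked pass through GetNextDispatchSize with assumed inputs:

// Worked example (assumed inputs): num_threads = 256, element_count = 10'000'000.
// max_threads_per_dispatch = 256 * 65'535           = 16'776'960
// available_thread_count   = min(10'000'000, above) = 10'000'000
// workgroup_count_1d       = ceil(10'000'000 / 256) = 39'063   (<= 65'535, so one dispatch suffices)
// dispatched_element_count = 39'063 * 256           = 10'000'128
// pending_element_count    = 0
//
// With element_count = 20'000'000 instead, the dispatch is capped at 65'535 groups
// (16'776'960 elements) and pending_element_count = 3'223'040 remains for the next pass.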
+ assert(num_threads <= D3D12_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP); + + const uint32_t max_threads_per_dispatch = num_threads * D3D12_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION; + + // Compute max dispatchable elements + const uint32_t available_thread_count = std::min(element_count, max_threads_per_dispatch); + + // Compute required thread group count + uint32_t workgroup_count_1d = (available_thread_count + num_threads - 1) / num_threads; + + // Compute min dispatch size + dispatch = workgroup_count_1d; + + // With the dispatch size computed, compute the dispatched element count + const uint32_t dispatched_element_count = workgroup_count_1d * num_threads; + + // Update the pending element count + pending_element_count = (dispatched_element_count < element_count) ? element_count - dispatched_element_count : 0; +} + +DML_TENSOR_DATA_TYPE OrtToDmlDataType(ONNXTensorElementDataType ort_dtype) { + switch (ort_dtype) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return DML_TENSOR_DATA_TYPE_FLOAT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return DML_TENSOR_DATA_TYPE_FLOAT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + return DML_TENSOR_DATA_TYPE_FLOAT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return DML_TENSOR_DATA_TYPE_UINT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: + return DML_TENSOR_DATA_TYPE_UINT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: + return DML_TENSOR_DATA_TYPE_UINT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: + return DML_TENSOR_DATA_TYPE_UINT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return DML_TENSOR_DATA_TYPE_INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: + return DML_TENSOR_DATA_TYPE_INT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return DML_TENSOR_DATA_TYPE_INT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return DML_TENSOR_DATA_TYPE_INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + return DML_TENSOR_DATA_TYPE_UINT8; + default: + THROW_HR(E_NOTIMPL); + } +} + +void DmlCastInputToOutput( + DmlExecutionContext* execution_context, + OrtAllocator& allocator, + OrtValue& in, + std::unique_ptr& p_out, + IDMLDevice* dml_device, + const OrtDmlApi* ort_dml_api, + DmlReusedCommandListState& command_list_state) { + auto shape_info = in.GetTensorTypeAndShapeInfo(); + auto shape = shape_info->GetShape(); + + bool allocate_p_out = p_out == nullptr; + if (p_out) { + auto out_shape_info = p_out->GetTensorTypeAndShapeInfo(); + auto out_shape = out_shape_info->GetShape(); + allocate_p_out = shape != out_shape; + } + + if (allocate_p_out) { + p_out = OrtValue::CreateTensor(allocator, shape); + } + + int element_count = static_cast(shape_info->GetElementCount()); + auto dml_from_type = DmlHelpers::OrtToDmlDataType(in.GetTensorTypeAndShapeInfo()->GetElementType()); + auto dml_to_type = DmlHelpers::OrtToDmlDataType(p_out->GetTensorTypeAndShapeInfo()->GetElementType()); + + bool rebind = command_list_state.previousOutput != p_out.get(); + + // If the sizes change, we need to recompile the operator and rebuild the command lists. It should only happen + // once after the very first iteration. 
+ if (rebind) { + auto compiled_cast_operator = DmlHelpers::CreateCastOperator(dml_device, element_count, dml_from_type, dml_to_type); + + ComPtr persistent_resource; + uint64_t persistent_resource_size = compiled_cast_operator->GetBindingProperties().PersistentResourceSize; + + std::optional persistent_resource_binding; + + if (persistent_resource_size > 0) { + std::array persistent_resource_shape = {static_cast(persistent_resource_size)}; + auto persistent_tensor = OrtValue::CreateTensor(allocator, persistent_resource_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8); + Ort::ThrowOnError(ort_dml_api->GetD3D12ResourceFromAllocation(&allocator, persistent_tensor->GetTensorMutableRawData(), &persistent_resource)); + persistent_resource_binding = DML_BUFFER_BINDING{persistent_resource.Get(), 0, persistent_resource_size}; + } + + DML_BINDING_DESC persistent_resource_bindingDesc = persistent_resource_binding + ? DML_BINDING_DESC{DML_BINDING_TYPE_BUFFER, &*persistent_resource_binding} + : DML_BINDING_DESC{DML_BINDING_TYPE_NONE, nullptr}; + + DML_BINDING_DESC input_array_binding_desc = DML_BINDING_DESC{DML_BINDING_TYPE_NONE, nullptr}; + execution_context->InitializeOperator(compiled_cast_operator.Get(), persistent_resource_bindingDesc, input_array_binding_desc); + command_list_state = DmlHelpers::BuildReusableCommandList(dml_device, compiled_cast_operator.Get(), persistent_resource.Get(), persistent_resource_binding); + command_list_state.previousOutput = p_out.get(); + } + + ComPtr source_resource; + Ort::ThrowOnError(ort_dml_api->GetD3D12ResourceFromAllocation(&allocator, in.GetTensorMutableData(), &source_resource)); + + ComPtr target_resource; + Ort::ThrowOnError(ort_dml_api->GetD3D12ResourceFromAllocation(&allocator, p_out->GetTensorMutableData(), &target_resource)); + + std::array input_resources = {source_resource.Get()}; + std::array input_sizes = {element_count * DataTypeSizeInBytes(dml_from_type)}; + + std::array output_resources = {target_resource.Get()}; + std::array output_sizes = {element_count * DataTypeSizeInBytes(dml_to_type)}; + + DmlHelpers::ExecuteReusableCommandList(execution_context, command_list_state, allocator, ort_dml_api, input_resources, input_sizes, output_resources, output_sizes, rebind); +} +} // namespace DmlHelpers diff --git a/src/dml/dml_helpers.h b/src/dml/dml_helpers.h new file mode 100644 index 000000000..22da63254 --- /dev/null +++ b/src/dml/dml_helpers.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include +#include "dml_execution_context.h" + +using Microsoft::WRL::ComPtr; + +struct DmlReusedCommandListState { + // Re-usable command list, supporting descriptor heap, and DML binding table to update that heap. 
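+ // previousInput / previousOutput cache the OrtValue pointers that were bound when this command list was last
+ // recorded; callers such as DmlHelpers::DmlCastInputToOutput compare against them to decide whether the
+ // recorded list can simply be replayed or has to be rebuilt with new bindings.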
+ Microsoft::WRL::ComPtr compiled_operator; + Microsoft::WRL::ComPtr graphics_command_list; + Microsoft::WRL::ComPtr command_allocator; + Microsoft::WRL::ComPtr heap; + Microsoft::WRL::ComPtr binding_table; + Microsoft::WRL::ComPtr persistent_resource; + OrtValue* previousInput = nullptr; + OrtValue* previousOutput = nullptr; +}; + +struct DmlObjects { + ComPtr d3d12_device; + ComPtr command_queue; + ComPtr command_allocator; + ComPtr command_list; + ComPtr upload_buffer; +}; + +namespace DmlHelpers { +DmlObjects CreateDmlObjects(); + +DmlReusedCommandListState BuildReusableCommandList( + IDMLDevice* dml_device, + IDMLCompiledOperator* compiled_operator, + ID3D12Resource* persistent_resource, + std::optional persistent_resource_binding); + +void ExecuteReusableCommandList( + DmlExecutionContext* execution_context, + DmlReusedCommandListState& command_list_state, + OrtAllocator& allocator, + const OrtDmlApi* ort_dml_api, + std::span input_resources, + std::span input_sizes, + std::span output_resources, + std::span output_sizes, + bool bindings_changed); + +ComPtr CreateCastOperator( + IDMLDevice* dml_device, + uint32_t num_elements, + DML_TENSOR_DATA_TYPE source_data_type, + DML_TENSOR_DATA_TYPE target_data_type); + +void GetNextDispatchSize( + uint32_t element_count, + uint32_t num_threads, + uint32_t& dispatch, + uint32_t& pending_element_count); + +DML_TENSOR_DATA_TYPE OrtToDmlDataType(ONNXTensorElementDataType ort_dtype); + +void DmlCastInputToOutput( + DmlExecutionContext* execution_context, + OrtAllocator& allocator, + OrtValue& in, + std::unique_ptr& p_out, + IDMLDevice* dml_device, + const OrtDmlApi* ort_dml_api, + DmlReusedCommandListState& command_list_state); +} // namespace DmlHelpers diff --git a/src/dml/dml_increment_values_kernel.cpp b/src/dml/dml_increment_values_kernel.cpp new file mode 100644 index 000000000..989da6288 --- /dev/null +++ b/src/dml/dml_increment_values_kernel.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include "dml_increment_values_kernel.h" +#include "dml_helpers.h" + +namespace DmlIncrementValues_Int32 { +#include "generated_dml_shaders/increment_values_int32.h" +} + +namespace DmlIncrementValues_Int64 { +#include "generated_dml_shaders/increment_values_int64.h" +} + +DmlIncrementValuesKernel::DmlIncrementValuesKernel( + ID3D12Device* d3d12_device, + DmlExecutionContext* execution_context, + uint32_t element_count, + ONNXTensorElementDataType dtype, + ID3D12Resource* values_resource) + : device_(d3d12_device), + execution_context_(execution_context), + dtype_(dtype), + values_resource_(values_resource) { + constants_.element_count = element_count; + total_element_count_ = element_count; + + // Compute root signature. 
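+ // The layout is one root UAV per bound buffer (uav_count_ entries, here just the values buffer at u0),
+ // followed by a single root-constants parameter at b0 carrying the Constants struct (element_count, start_index).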
+ std::vector root_parameters; + root_parameters.resize(uav_count_ + 1); + + for (UINT i = 0; i < uav_count_; i++) { + root_parameters[i].InitAsUnorderedAccessView(i); + } + + root_parameters[uav_count_].InitAsConstants(constant_count_, 0); + + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC desc; + desc.Init_1_1(static_cast(root_parameters.size()), root_parameters.data()); + + ComPtr root_signature_blob; + ComPtr root_signature_error_blob; + THROW_IF_FAILED(D3D12SerializeVersionedRootSignature( + &desc, + root_signature_blob.GetAddressOf(), + root_signature_error_blob.GetAddressOf())); + + THROW_IF_FAILED(device_->CreateRootSignature( + 0, + root_signature_blob->GetBufferPointer(), + root_signature_blob->GetBufferSize(), + IID_PPV_ARGS(&root_signature_))); + + D3D12_COMPUTE_PIPELINE_STATE_DESC compute_pso_desc = {}; + compute_pso_desc.pRootSignature = root_signature_.Get(); + + if (dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + compute_pso_desc.CS = CD3DX12_SHADER_BYTECODE(DmlIncrementValues_Int32::g_CSMain, sizeof(DmlIncrementValues_Int32::g_CSMain)); + } else if (dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + compute_pso_desc.CS = CD3DX12_SHADER_BYTECODE(DmlIncrementValues_Int64::g_CSMain, sizeof(DmlIncrementValues_Int64::g_CSMain)); + } else { + THROW_HR(E_NOTIMPL); + } + + THROW_IF_FAILED(device_->CreateComputePipelineState(&compute_pso_desc, IID_PPV_ARGS(&pipeline_state_))); + + THROW_IF_FAILED(d3d12_device->CreateCommandAllocator( + D3D12_COMMAND_LIST_TYPE_COMPUTE, + IID_PPV_ARGS(command_allocator_.ReleaseAndGetAddressOf()))); + + THROW_IF_FAILED(d3d12_device->CreateCommandList( + 0, + D3D12_COMMAND_LIST_TYPE_COMPUTE, + command_allocator_.Get(), + nullptr, + IID_PPV_ARGS(graphics_command_list_.ReleaseAndGetAddressOf()))); + + D3D12_DESCRIPTOR_HEAP_DESC heap_desc = {}; + heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + heap_desc.NumDescriptors = uav_count_; + heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + + THROW_IF_FAILED(d3d12_device->CreateDescriptorHeap(&heap_desc, IID_PPV_ARGS(heap_.ReleaseAndGetAddressOf()))); + + ID3D12DescriptorHeap* descriptor_heaps[] = {heap_.Get()}; + graphics_command_list_->SetDescriptorHeaps(ARRAYSIZE(descriptor_heaps), descriptor_heaps); + + // Set the root signature and pipeline state + graphics_command_list_->SetComputeRootSignature(root_signature_.Get()); + graphics_command_list_->SetPipelineState(pipeline_state_.Get()); + graphics_command_list_->SetComputeRootUnorderedAccessView(0, values_resource_->GetGPUVirtualAddress()); + + auto pending_element_count = total_element_count_; + auto constants = constants_; + + // Dispatch up to the maximum number of threads per iteration until + // all elements are completed + while (pending_element_count > 0) { + constants.start_index = total_element_count_ - pending_element_count; + + uint32_t dispatch_size_x; + + DmlHelpers::GetNextDispatchSize( + pending_element_count, + 256, + dispatch_size_x, + pending_element_count); + + // Set root constants + graphics_command_list_->SetComputeRoot32BitConstants( + uav_count_, // root parameter index + constant_count_, // Constant count + &constants, + 0 // offset + ); + + graphics_command_list_->Dispatch(dispatch_size_x, 1, 1); + } + + graphics_command_list_->Close(); +} \ No newline at end of file diff --git a/src/dml/dml_increment_values_kernel.h b/src/dml/dml_increment_values_kernel.h new file mode 100644 index 000000000..b7b096bf8 --- /dev/null +++ b/src/dml/dml_increment_values_kernel.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include 
+#include +#include +#include "dml_execution_context.h" + +using Microsoft::WRL::ComPtr; + +class DmlIncrementValuesKernel { + public: + DmlIncrementValuesKernel( + ID3D12Device* d3d12_device, + DmlExecutionContext* execution_context, + uint32_t element_count, + ONNXTensorElementDataType dtype, + ID3D12Resource* values_resource); + + ID3D12GraphicsCommandList* GetCommandList() { return graphics_command_list_.Get(); } + + private: + struct Constants { + uint32_t element_count; + uint32_t start_index; + }; + + ComPtr device_; + ComPtr root_signature_; + ComPtr pipeline_state_; + Constants constants_; + DmlExecutionContext* execution_context_; + + ComPtr graphics_command_list_; + ComPtr command_allocator_; + ComPtr heap_; + + ONNXTensorElementDataType dtype_; + ComPtr values_resource_; + uint32_t total_element_count_; + + constexpr static uint32_t constant_count_ = sizeof(Constants) / sizeof(uint32_t); + constexpr static uint32_t uav_count_ = 1; +}; \ No newline at end of file diff --git a/src/dml/dml_pooled_upload_heap.cpp b/src/dml/dml_pooled_upload_heap.cpp new file mode 100644 index 000000000..db9380f23 --- /dev/null +++ b/src/dml/dml_pooled_upload_heap.cpp @@ -0,0 +1,253 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include "dml_pooled_upload_heap.h" +#include "dml_execution_context.h" + +DmlPooledUploadHeap::DmlPooledUploadHeap(ID3D12Device* device, DmlExecutionContext* execution_context) + : device_(device), execution_context_(execution_context) { +} + +static size_t Align(size_t offset, size_t alignment) { + assert(alignment != 0); + return (offset + alignment - 1) & ~(alignment - 1); +} + +std::optional DmlPooledUploadHeap::FindOffsetForAllocation(const Chunk& chunk, size_t size_in_bytes) { + assert(size_in_bytes != 0); + + if (chunk.capacity_in_bytes < size_in_bytes) { + // This chunk isn't even big enough to accommodate this allocation + return std::nullopt; + } + + if (chunk.allocations.empty()) { + // The entire chunk is empty - allocate from the beginning + return 0; + } + + // Chunks are used as ring buffers, which means this allocation should go after the most recent previous + // allocation + + const auto& last_allocation = chunk.allocations.back(); + size_t new_allocation_begin = last_allocation.offset_in_chunk + last_allocation.size_in_bytes; + new_allocation_begin = Align(new_allocation_begin, c_allocation_alignment); + + if (new_allocation_begin + size_in_bytes < new_allocation_begin) { + // Overflow + return std::nullopt; + } + + const auto& first_allocation = chunk.allocations.front(); + if (first_allocation.offset_in_chunk <= last_allocation.offset_in_chunk) { + // This is the case where there's potentially free space at the beginning and end of the chunk, but not + // the middle: + // e.g. 
+ // |------XXXXYYYZZ------| + // ^^^^ ^^ + // first last + + if (new_allocation_begin + size_in_bytes <= chunk.capacity_in_bytes) { + // There's enough space between the end of the last allocation and the end of the chunk + return new_allocation_begin; + } else { + // Otherwise there's not enough space at the end of the chunk - try the beginning of the chunk instead + new_allocation_begin = 0; + if (new_allocation_begin + size_in_bytes <= first_allocation.offset_in_chunk) { + // There was enough space between the start of the buffer, and the start of the first allocation + return new_allocation_begin; + } + } + } else { + // This is the case where there's potentially free space in the middle of the chunk, but not at the edges + // e.g. + // |YYYZZ---------XXXX-| + // ^^ ^^^^ + // last first + + if (new_allocation_begin + size_in_bytes <= first_allocation.offset_in_chunk) { + // There's enough space between the end of the last allocation, and the start of the first one + return new_allocation_begin; + } + } + + // Not enough space in this chunk to accommodate the requested allocation + return std::nullopt; +} + +/* static */ DmlPooledUploadHeap::Chunk DmlPooledUploadHeap::CreateChunk(ID3D12Device* device, size_t size_in_bytes) { + ComPtr upload_buffer; + auto heap = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD); + auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes); + + THROW_IF_FAILED(device->CreateCommittedResource( + &heap, + D3D12_HEAP_FLAG_NONE, + &buffer, + D3D12_RESOURCE_STATE_GENERIC_READ, + nullptr, + IID_PPV_ARGS(upload_buffer.ReleaseAndGetAddressOf()))); + + return Chunk{size_in_bytes, std::move(upload_buffer)}; +} + +std::pair DmlPooledUploadHeap::Reserve(size_t size_in_bytes) { + // Try to find a chunk with enough free space to accommodate the requested allocation size + for (Chunk& chunk : chunks_) { + std::optional offset_for_allocation = FindOffsetForAllocation(chunk, size_in_bytes); + if (offset_for_allocation) { + // There's enough space in this chunk - return + return std::make_pair(&chunk, *offset_for_allocation); + } + } + + // No chunks were able to accommodate the allocation - create a new chunk and return that instead + + // At least double the capacity of the pool + const size_t new_chunk_size = std::max({total_capacity_, c_min_chunk_size, size_in_bytes}); + chunks_.push_back(CreateChunk(device_.Get(), new_chunk_size)); + total_capacity_ += new_chunk_size; + + // Allocate from the beginning of the new chunk + return std::make_pair(&chunks_.back(), 0); +} + +void DmlPooledUploadHeap::ReclaimAllocations() { + for (Chunk& chunk : chunks_) { + auto* allocs = &chunk.allocations; + + // Remove all allocations which have had their fences signaled - this indicates that they are no longer + // being used by the GPU. We can stop as soon as we find an allocation which is still in use, because we + // only use a single command queue and executions always complete in the order they were submitted. 
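+ // Allocations are stored oldest-first, so popping from the front until we hit an unsignaled event frees
+ // exactly the prefix of allocations whose GPU work has already completed.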
+ while (!allocs->empty() && allocs->front().done_event.IsSignaled()) { + allocs->pop_front(); + } + } +} + +DmlGpuEvent DmlPooledUploadHeap::BeginUploadToGpu( + ID3D12Resource* dst, + uint64_t dst_offset, + D3D12_RESOURCE_STATES dst_state, + std::span src) { + assert(!src.empty()); + assert(dst->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); + + InvariantChecker checker(this); + + ReclaimAllocations(); + + // Allocate space from the upload heap + Chunk* chunk = nullptr; + size_t offset_in_chunk = 0; + std::tie(chunk, offset_in_chunk) = Reserve(src.size()); + + assert(chunk != nullptr); + assert(offset_in_chunk + src.size() <= chunk->capacity_in_bytes); + + // Map the upload heap and copy the source data into it at the specified offset + void* upload_heap_data = nullptr; + THROW_IF_FAILED(chunk->resource->Map(0, nullptr, &upload_heap_data)); + memcpy(static_cast(upload_heap_data) + offset_in_chunk, src.data(), src.size()); + chunk->resource->Unmap(0, nullptr); + + // Copy from the upload heap into the destination resource + execution_context_->CopyBufferRegion( + dst, + dst_offset, + dst_state, + chunk->resource.Get(), + offset_in_chunk, + D3D12_RESOURCE_STATE_GENERIC_READ, + src.size()); + + DmlGpuEvent done_event = execution_context_->GetCurrentCompletionEvent(); + + execution_context_->Flush(); + done_event.WaitForSignal(); + + // Add an allocation entry to the chunk + chunk->allocations.push_back(Allocation{static_cast(src.size()), offset_in_chunk, done_event}); + + return done_event; +} + +void DmlPooledUploadHeap::Trim() { + InvariantChecker checker(this); + + ReclaimAllocations(); + + // Release any chunks which have no allocations + auto it = std::remove_if(chunks_.begin(), chunks_.end(), [](const Chunk& c) { + return c.allocations.empty(); + }); + chunks_.erase(it, chunks_.end()); + + // Re-calculate total capacity + total_capacity_ = 0; + for (const auto& chunk : chunks_) { + total_capacity_ += chunk.capacity_in_bytes; + } +} + +void DmlPooledUploadHeap::AssertInvariants() { +#ifdef _DEBUG + + auto chunk_capacity_comparer = [](const Chunk& lhs, const Chunk& rhs) { + return lhs.capacity_in_bytes < rhs.capacity_in_bytes; + }; + + // Chunks should be sorted by ascending capacity + assert(std::is_sorted(chunks_.begin(), chunks_.end(), chunk_capacity_comparer)); + + // Allocations in a chunk should be sorted by ascending fence value + for (const auto& chunk : chunks_) { + auto alloc_fence_value_comparer = [](const Allocation& lhs, const Allocation& rhs) { + return lhs.done_event.fence_value < rhs.done_event.fence_value; + }; + assert(std::is_sorted(chunk.allocations.begin(), chunk.allocations.end(), alloc_fence_value_comparer)); + } + + // Validate chunk properties + for (const auto& chunk : chunks_) { + assert(chunk.resource != nullptr); + assert(chunk.capacity_in_bytes == chunk.resource->GetDesc().Width); + } + + // Validate allocation properties + for (const auto& chunk : chunks_) { + for (const auto& alloc : chunk.allocations) { + assert(alloc.offset_in_chunk + alloc.size_in_bytes <= chunk.capacity_in_bytes); + assert(alloc.offset_in_chunk % c_allocation_alignment == 0); // Validate alignment + } + } + + // Validate no overlapping allocations + for (const auto& chunk : chunks_) { + auto alloc_offset_comparer = [](const Allocation& lhs, const Allocation& rhs) { + return lhs.offset_in_chunk < rhs.offset_in_chunk; + }; + + std::vector allocations_sorted_by_offset(chunk.allocations.begin(), chunk.allocations.end()); + std::sort(allocations_sorted_by_offset.begin(), 
allocations_sorted_by_offset.end(), alloc_offset_comparer); + + for (size_t i = 1; i < allocations_sorted_by_offset.size(); ++i) { + const auto& alloc = allocations_sorted_by_offset[i - 1]; + const auto& next_alloc = allocations_sorted_by_offset[i]; + assert(alloc.offset_in_chunk + alloc.size_in_bytes <= next_alloc.offset_in_chunk); + } + } + + // Validate total capacity of pool + size_t calculated_capacity = 0; + for (const auto& chunk : chunks_) { + calculated_capacity += chunk.capacity_in_bytes; + } + assert(calculated_capacity == total_capacity_); + +#endif // #ifdef _DEBUG +} \ No newline at end of file diff --git a/src/dml/dml_pooled_upload_heap.h b/src/dml/dml_pooled_upload_heap.h new file mode 100644 index 000000000..817f817a4 --- /dev/null +++ b/src/dml/dml_pooled_upload_heap.h @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include "../span.h" +#include "dml_gpu_event.h" +#include "dml_execution_context.h" + +// Implements a non-blocking, ring-buffer style upload heap for copying CPU data to GPU resources. +class DmlPooledUploadHeap { + public: + DmlPooledUploadHeap(ID3D12Device* device, DmlExecutionContext* execution_context); + + // Makes a copy of the source data and begins copying it into the destination resource, and returns a GpuEvent + // which will become signaled when the copy is complete. The destination resource must be a default or readback + // buffer. + DmlGpuEvent BeginUploadToGpu( + ID3D12Resource* dst, + uint64_t dst_offset, + D3D12_RESOURCE_STATES dst_state, + std::span src); + + // Releases unused capacity. + void Trim(); + + size_t Capacity() const { return total_capacity_; } + + private: + static constexpr size_t c_min_chunk_size = 1024 * 1024; // 1MB + static constexpr size_t c_allocation_alignment = 512; // In bytes; as per D3D12 requirement for buffers + + // A suballoction from a chunk + struct Allocation { + size_t size_in_bytes; + + // The offset, in bytes, from the beginning of the chunk to the beginning of this allocation + size_t offset_in_chunk; + + // The event that will be signaled to when the GPU is done executing work that uses this allocation + DmlGpuEvent done_event; + }; + + // Represents a single contiguous upload heap from which we carve out suballocations. Ranges are suballocated + // from the upload heap in a ring-buffer fashion. + struct Chunk { + size_t capacity_in_bytes; // The total size of the upload heap, in bytes + ComPtr resource; + + // Allocations are sorted by ascending fence value - that is, least to most recently allocated + std::list allocations; + }; + + // Calls AssertInvariants on construction and again on destruction + class InvariantChecker { + public: + InvariantChecker(DmlPooledUploadHeap* parent) + : parent_(parent) { + parent_->AssertInvariants(); + } + + ~InvariantChecker() { + parent_->AssertInvariants(); + } + + private: + DmlPooledUploadHeap* parent_; + }; + + // Attempts to find enough unused space in the supplied chunk to accommodate the given allocation size. + // Returns the offset of that memory if successful, null if there wasn't enough space. + static std::optional FindOffsetForAllocation(const Chunk& chunk, size_t size_in_bytes); + + static Chunk CreateChunk(ID3D12Device* device, size_t size_in_bytes); + + // Finds or creates a chunk with enough space to accommodate an allocation of the given size, and returns a + // pointer to the chunk and allocation offset. 
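+ // If no existing chunk has room, a new chunk of size max(total_capacity_, c_min_chunk_size, size_in_bytes)
+ // is appended, so the pool at least doubles whenever it has to grow.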
+ std::pair Reserve(size_t size_in_bytes); + + void ReclaimAllocations(); // Frees all allocations which are no longer being used by the GPU. + void AssertInvariants(); + + ComPtr device_; + DmlExecutionContext* execution_context_; + + std::vector chunks_; // sorted ascending by capacity (upload heap size) + size_t total_capacity_ = 0; // Total size of all chunks, in bytes +}; \ No newline at end of file diff --git a/src/dml/dml_readback_heap.cpp b/src/dml/dml_readback_heap.cpp new file mode 100644 index 000000000..2726fe565 --- /dev/null +++ b/src/dml/dml_readback_heap.cpp @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include "dml_readback_heap.h" +#include "dml_execution_context.h" + +static ComPtr CreateReadbackHeap(ID3D12Device* device, size_t size) { + ComPtr readback_heap; + auto heap = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK); + auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size); + + THROW_IF_FAILED(device->CreateCommittedResource( + &heap, + D3D12_HEAP_FLAG_NONE, + &buffer, + D3D12_RESOURCE_STATE_COPY_DEST, + nullptr, + IID_PPV_ARGS(readback_heap.ReleaseAndGetAddressOf()))); + + return readback_heap; +} + +DmlReadbackHeap::DmlReadbackHeap(ID3D12Device* device, DmlExecutionContext* execution_context) + : device_(device), + execution_context_(execution_context) { +} + +static size_t ComputeNewCapacity(size_t existing_capacity, size_t desired_capacity) { + size_t new_capacity = existing_capacity; + + while (new_capacity < desired_capacity) { + if (new_capacity >= std::numeric_limits::max() / 2) { + // Overflow; there's no way we can satisfy this allocation request + THROW_HR(E_OUTOFMEMORY); + } + + new_capacity *= 2; // geometric growth + } + + return new_capacity; +} + +void DmlReadbackHeap::EnsureReadbackHeap(size_t size) { + if (!readback_heap_) { + // Initialize the readback heap for the first time + assert(capacity_ == 0); + capacity_ = ComputeNewCapacity(c_initial_capacity, size); + readback_heap_ = CreateReadbackHeap(device_.Get(), capacity_); + } else if (capacity_ < size) { + // Ensure there's sufficient capacity + capacity_ = ComputeNewCapacity(capacity_, size); + + readback_heap_ = nullptr; + readback_heap_ = CreateReadbackHeap(device_.Get(), capacity_); + } + + assert(readback_heap_->GetDesc().Width >= size); +} + +void DmlReadbackHeap::ReadbackFromGpu( + std::span dst, + ID3D12Resource* src, + uint64_t src_offset, + D3D12_RESOURCE_STATES src_state) { + assert(!dst.empty()); + + EnsureReadbackHeap(dst.size()); + + // Copy from the source resource into the readback heap + execution_context_->CopyBufferRegion( + readback_heap_.Get(), + 0, + D3D12_RESOURCE_STATE_COPY_DEST, + src, + src_offset, + src_state, + dst.size()); + + // Wait for completion and map the result + execution_context_->Flush(); + execution_context_->GetCurrentCompletionEvent().WaitForSignal(); + execution_context_->ReleaseCompletedReferences(); + + // Map the readback heap and copy it into the destination + void* readback_heap_data = nullptr; + THROW_IF_FAILED(readback_heap_->Map(0, nullptr, &readback_heap_data)); + memcpy(dst.data(), readback_heap_data, dst.size()); + readback_heap_->Unmap(0, nullptr); +} + +void DmlReadbackHeap::ReadbackFromGpu( + std::span dst, + std::span dst_sizes, + std::span src, + D3D12_RESOURCE_STATES src_state) { + assert(dst.size() == src.size()); + assert(dst_sizes.size() == src.size()); + + if (dst.empty()) { + return; + } + + uint32_t total_size = 0; + for (auto size 
: dst_sizes) { + total_size += size; + } + + EnsureReadbackHeap(total_size); + + // Copy from the source resource into the readback heap + uint32_t offset = 0; + for (uint32_t i = 0; i < dst.size(); ++i) { + execution_context_->CopyBufferRegion( + readback_heap_.Get(), + offset, + D3D12_RESOURCE_STATE_COPY_DEST, + src[i], + 0, + src_state, + dst_sizes[i]); + + offset += dst_sizes[i]; + } + + // Wait for completion and map the result + execution_context_->Flush(); + execution_context_->GetCurrentCompletionEvent().WaitForSignal(); + execution_context_->ReleaseCompletedReferences(); + + // Map the readback heap and copy it into the destination + void* readback_heap_data = nullptr; + THROW_IF_FAILED(readback_heap_->Map(0, nullptr, &readback_heap_data)); + + // Copy from the source resource into the readback heap + offset = 0; + for (uint32_t i = 0; i < dst.size(); ++i) { + memcpy(dst[i], static_cast(readback_heap_data) + offset, dst_sizes[i]); + offset += dst_sizes[i]; + } + + readback_heap_->Unmap(0, nullptr); +} \ No newline at end of file diff --git a/src/dml/dml_readback_heap.h b/src/dml/dml_readback_heap.h new file mode 100644 index 000000000..1d6777185 --- /dev/null +++ b/src/dml/dml_readback_heap.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include "dml_execution_context.h" + +// Because we never perform more than one readback at a time, we don't need anything fancy for managing the +// readback heap - just maintain a single resource and reallocate it if it's not big enough. +class DmlReadbackHeap { + public: + DmlReadbackHeap(ID3D12Device* device, DmlExecutionContext* execution_context); + + // Copies data from the specified GPU resource into CPU memory pointed-to by the span. This method will block + // until the copy is complete. + void ReadbackFromGpu( + std::span dst, + ID3D12Resource* src, + uint64_t src_offset, + D3D12_RESOURCE_STATES src_state); + + // Overload supporting batching + void ReadbackFromGpu( + std::span dst, + std::span dst_sizes, + std::span src, + D3D12_RESOURCE_STATES src_state); + + private: + void EnsureReadbackHeap(size_t size); + + static constexpr size_t c_initial_capacity = 1024 * 1024; // 1MB + + ComPtr device_; + DmlExecutionContext* execution_context_; + + ComPtr readback_heap_; + size_t capacity_ = 0; +}; \ No newline at end of file diff --git a/src/dml/dml_shaders/dml_increment_values.hlsl b/src/dml/dml_shaders/dml_increment_values.hlsl new file mode 100644 index 000000000..ad31240fe --- /dev/null +++ b/src/dml/dml_shaders/dml_increment_values.hlsl @@ -0,0 +1,28 @@ + +//------------------------------------------------------------------------------ +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +//------------------------------------------------------------------------------ + +#define ROOT_SIG_DEF "DescriptorTable(UAV(u0, numDescriptors=1, flags=DATA_VOLATILE | DESCRIPTORS_VOLATILE)), RootConstants(num32BitConstants=1, b0)" +#define NUM_THREADS 256 + +RWStructuredBuffer values : register(u0); + +cbuffer Constants +{ + uint element_count; + uint start_index; +}; + +[RootSignature(ROOT_SIG_DEF)] +[numthreads(NUM_THREADS, 1, 1)] +void CSMain(uint3 dispatch_thread_id : SV_DispatchThreadID) +{ + uint global_index = dispatch_thread_id.x + start_index; + if (global_index < element_count) + { + ++values[global_index]; + } +} diff --git a/src/dml/dml_shaders/dml_update_attention_mask.hlsl b/src/dml/dml_shaders/dml_update_attention_mask.hlsl new file mode 100644 index 000000000..22e7c77c9 --- /dev/null +++ b/src/dml/dml_shaders/dml_update_attention_mask.hlsl @@ -0,0 +1,41 @@ + +//------------------------------------------------------------------------------ +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +//------------------------------------------------------------------------------ + +#define ROOT_SIG_DEF "DescriptorTable(UAV(u0, numDescriptors=2, flags=DATA_VOLATILE | DESCRIPTORS_VOLATILE)), RootConstants(num32BitConstants=1, b0)" +#define NUM_THREADS 256 + +RWStructuredBuffer input_mask : register(u0); +RWStructuredBuffer output_mask : register(u1); + +cbuffer Constants +{ + uint max_seq_len; + uint seq_len; + uint element_count; + uint start_index; +}; + +[RootSignature(ROOT_SIG_DEF)] +[numthreads(NUM_THREADS, 1, 1)] +void CSMain(uint3 dispatch_thread_id : SV_DispatchThreadID) +{ + uint global_index = dispatch_thread_id.x + start_index; + if (global_index < element_count) + { + uint sequence_index = global_index % max_seq_len; + + if (seq_len > 1) + { + const T value = sequence_index < seq_len ? 1 : 0; + output_mask[global_index] = value; + } + else + { + output_mask[global_index] = (sequence_index == 0 || input_mask[sequence_index] == 1 || input_mask[sequence_index - 1] == 1) ? 1 : 0; + } + } +} diff --git a/src/dml/dml_smart_container.h b/src/dml/dml_smart_container.h new file mode 100644 index 000000000..c025f7018 --- /dev/null +++ b/src/dml/dml_smart_container.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include "../models/onnxruntime_api.h" + +// Allows objects to be added to a D3D12 object via SetPrivateDataInterface and extend its lifetime beyond the life of the model. For +// example, we can put the DML allocator on the D3D12 device (which is a unique singleton for each adapter) and be sure that the allocator won't be +// destroyed until nothing holds on to the device anymore. 
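+//
+// Illustrative usage sketch (the GUID name below is hypothetical; any caller-chosen GUID works):
+//   auto container = Microsoft::WRL::Make<DmlSmartContainer>(std::move(memory_info), std::move(allocator));
+//   THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_allocator_guid, container.Get()));
+// The device then holds a reference to the container (and the allocator it owns) until the device itself is released.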
+class DmlSmartContainer : public Microsoft::WRL::RuntimeClass, IUnknown> { + public: + DmlSmartContainer(std::unique_ptr&& memory_info, std::unique_ptr&& allocator) + : memory_info_(std::move(memory_info)), allocator_(std::move(allocator)) {} + + const OrtMemoryInfo* GetMemoryInfo() const { return memory_info_.get(); } + Ort::Allocator* GetAllocator() const { return allocator_.get(); } + + private: + std::unique_ptr memory_info_; + std::unique_ptr allocator_; +}; \ No newline at end of file diff --git a/src/dml/dml_update_mask_kernel.cpp b/src/dml/dml_update_mask_kernel.cpp new file mode 100644 index 000000000..a927b78d9 --- /dev/null +++ b/src/dml/dml_update_mask_kernel.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include +#include "dml_update_mask_kernel.h" +#include "dml_helpers.h" + +namespace DmlUpdateMask_Int32 { +#include "generated_dml_shaders/update_mask_int32.h" +} + +namespace DmlUpdateMask_Int64 { +#include "generated_dml_shaders/update_mask_int64.h" +} + +DmlUpdateMaskKernel::DmlUpdateMaskKernel( + ID3D12Device* d3d12_device, + DmlExecutionContext* execution_context, + uint32_t batch_size, + uint32_t max_seq_len, + ONNXTensorElementDataType dtype, + uint32_t seq_len, + ID3D12Resource* attention_mask_resource, + ID3D12Resource* attention_mask_next_resource) + : device_(d3d12_device), + execution_context_(execution_context), + dtype_(dtype), + attention_mask_resource_(attention_mask_resource), + attention_mask_next_resource_(attention_mask_next_resource) { + constants_.element_count = batch_size * max_seq_len; + constants_.max_seq_len = max_seq_len; + constants_.seq_len = seq_len; + total_element_count_ = batch_size * max_seq_len; + + // Compute root signature. + std::vector root_parameters; + root_parameters.resize(uav_count_ + 1); + + for (UINT i = 0; i < uav_count_; i++) { + root_parameters[i].InitAsUnorderedAccessView(i); + } + + root_parameters[uav_count_].InitAsConstants(constant_count_, 0); + + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC desc; + desc.Init_1_1(static_cast(root_parameters.size()), root_parameters.data()); + + ComPtr root_signature_blob; + ComPtr root_signature_error_blob; + THROW_IF_FAILED(D3D12SerializeVersionedRootSignature( + &desc, + root_signature_blob.GetAddressOf(), + root_signature_error_blob.GetAddressOf())); + + THROW_IF_FAILED(device_->CreateRootSignature( + 0, + root_signature_blob->GetBufferPointer(), + root_signature_blob->GetBufferSize(), + IID_PPV_ARGS(&root_signature_))); + + D3D12_COMPUTE_PIPELINE_STATE_DESC compute_pso_desc = {}; + compute_pso_desc.pRootSignature = root_signature_.Get(); + + if (dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + compute_pso_desc.CS = CD3DX12_SHADER_BYTECODE(DmlUpdateMask_Int32::g_CSMain, sizeof(DmlUpdateMask_Int32::g_CSMain)); + } else if (dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + compute_pso_desc.CS = CD3DX12_SHADER_BYTECODE(DmlUpdateMask_Int64::g_CSMain, sizeof(DmlUpdateMask_Int64::g_CSMain)); + } else { + THROW_HR(E_NOTIMPL); + } + + THROW_IF_FAILED(device_->CreateComputePipelineState(&compute_pso_desc, IID_PPV_ARGS(&pipeline_state_))); + + THROW_IF_FAILED(d3d12_device->CreateCommandAllocator( + D3D12_COMMAND_LIST_TYPE_COMPUTE, + IID_PPV_ARGS(command_allocator_.ReleaseAndGetAddressOf()))); + + THROW_IF_FAILED(d3d12_device->CreateCommandList( + 0, + D3D12_COMMAND_LIST_TYPE_COMPUTE, + command_allocator_.Get(), + nullptr, + IID_PPV_ARGS(graphics_command_list_.ReleaseAndGetAddressOf()))); + + D3D12_DESCRIPTOR_HEAP_DESC heap_desc = {}; + heap_desc.Flags = 
D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + heap_desc.NumDescriptors = uav_count_; + heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + + THROW_IF_FAILED(d3d12_device->CreateDescriptorHeap(&heap_desc, IID_PPV_ARGS(heap_.ReleaseAndGetAddressOf()))); + + ID3D12DescriptorHeap* descriptor_heaps[] = {heap_.Get()}; + graphics_command_list_->SetDescriptorHeaps(ARRAYSIZE(descriptor_heaps), descriptor_heaps); + + // Set the root signature and pipeline state + graphics_command_list_->SetComputeRootSignature(root_signature_.Get()); + graphics_command_list_->SetPipelineState(pipeline_state_.Get()); + graphics_command_list_->SetComputeRootUnorderedAccessView(0, attention_mask_resource_->GetGPUVirtualAddress()); + graphics_command_list_->SetComputeRootUnorderedAccessView(1, attention_mask_next_resource_->GetGPUVirtualAddress()); + + auto pending_element_count = total_element_count_; + auto constants = constants_; + + // Dispatch up to the maximum number of threads per iteration until + // all elements are completed + while (pending_element_count > 0) { + constants.start_index = total_element_count_ - pending_element_count; + + uint32_t dispatch_size_x; + + DmlHelpers::GetNextDispatchSize( + pending_element_count, + 256, + dispatch_size_x, + pending_element_count); + + // Set root constants + graphics_command_list_->SetComputeRoot32BitConstants( + uav_count_, // root parameter index + constant_count_, // Constant count + &constants, + 0 // offset + ); + + graphics_command_list_->Dispatch(dispatch_size_x, 1, 1); + } + + // Barrier before doing the copy + std::array before_copy_barriers = { + CD3DX12_RESOURCE_BARRIER::UAV(attention_mask_next_resource_.Get()), + CD3DX12_RESOURCE_BARRIER::Transition(attention_mask_resource_.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST), + CD3DX12_RESOURCE_BARRIER::Transition(attention_mask_next_resource_.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE), + }; + graphics_command_list_->ResourceBarrier(static_cast(before_copy_barriers.size()), before_copy_barriers.data()); + + // Copy the next mask to the current mask for next iteration + if (dtype_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + graphics_command_list_->CopyBufferRegion(attention_mask_resource_.Get(), 0, attention_mask_next_resource_.Get(), 0, constants_.element_count * sizeof(int32_t)); + } else if (dtype_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + graphics_command_list_->CopyBufferRegion(attention_mask_resource_.Get(), 0, attention_mask_next_resource_.Get(), 0, constants_.element_count * sizeof(int64_t)); + } else { + THROW_HR(E_NOTIMPL); + } + + // Barrier after doing the copy + std::array after_copy_barriers = { + CD3DX12_RESOURCE_BARRIER::Transition(attention_mask_resource_.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(attention_mask_next_resource_.Get(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + }; + graphics_command_list_->ResourceBarrier(static_cast(after_copy_barriers.size()), after_copy_barriers.data()); + + graphics_command_list_->Close(); +} \ No newline at end of file diff --git a/src/dml/dml_update_mask_kernel.h b/src/dml/dml_update_mask_kernel.h new file mode 100644 index 000000000..a0e2e779a --- /dev/null +++ b/src/dml/dml_update_mask_kernel.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include "dml_execution_context.h" + +using Microsoft::WRL::ComPtr; + +class DmlUpdateMaskKernel { + public: 
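+ // Records a reusable compute command list: the update_mask shader fills attention_mask_next for all
+ // batch_size * max_seq_len elements (initializing it from seq_len during prompt processing, or extending the
+ // existing mask by one position when seq_len == 1), then copies attention_mask_next back over attention_mask
+ // so the list can simply be replayed on each token iteration.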
+ DmlUpdateMaskKernel( + ID3D12Device* d3d12_device, + DmlExecutionContext* execution_context, + uint32_t batch_size, + uint32_t max_seq_len, + ONNXTensorElementDataType dtype, + uint32_t seq_len, + ID3D12Resource* attention_mask_resource, + ID3D12Resource* attention_mask_next_resource); + + ID3D12GraphicsCommandList* GetCommandList() { return graphics_command_list_.Get(); } + + private: + struct Constants { + uint32_t max_seq_len; + uint32_t seq_len; + uint32_t element_count; + uint32_t start_index; + }; + + ComPtr device_; + ComPtr root_signature_; + ComPtr pipeline_state_; + Constants constants_; + DmlExecutionContext* execution_context_; + + ComPtr graphics_command_list_; + ComPtr command_allocator_; + ComPtr heap_; + + ONNXTensorElementDataType dtype_; + ComPtr attention_mask_resource_; + ComPtr attention_mask_next_resource_; + uint32_t total_element_count_; + + constexpr static uint32_t constant_count_ = sizeof(Constants) / sizeof(uint32_t); + constexpr static uint32_t uav_count_ = 2; +}; \ No newline at end of file diff --git a/src/dml/generated_dml_shaders/.clang-format b/src/dml/generated_dml_shaders/.clang-format new file mode 100644 index 000000000..57fe428dc --- /dev/null +++ b/src/dml/generated_dml_shaders/.clang-format @@ -0,0 +1,3 @@ +--- +DisableFormat: true +... diff --git a/src/dml/generated_dml_shaders/increment_values_int32.h b/src/dml/generated_dml_shaders/increment_values_int32.h new file mode 100644 index 000000000..6072b1966 --- /dev/null +++ b/src/dml/generated_dml_shaders/increment_values_int32.h @@ -0,0 +1,269 @@ +#if 0 +; +; Input signature: +; +; Name Index Mask Register SysValue Format Used +; -------------------- ----- ------ -------- -------- ------- ------ +; no parameters +; +; Output signature: +; +; Name Index Mask Register SysValue Format Used +; -------------------- ----- ------ -------- -------- ------- ------ +; no parameters +; shader hash: 9f7e9ca05d93206ff785399ea6b26de6 +; +; Pipeline Runtime Information: +; +; Compute Shader +; NumThreads=(256,1,1) +; +; +; Buffer Definitions: +; +; cbuffer +; { +; +; [8 x i8] (type annotation not present) +; +; } +; +; Resource bind info for +; { +; +; [4 x i8] (type annotation not present) +; +; } +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; cbuffer NA NA CB0 cb0 1 +; UAV struct r/w U0 u0 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.CBufRet.i32 = type { i32, i32, i32, i32 } +%dx.types.ResRet.i32 = type { i32, i32, i32, i32, i32 } +%"class.RWStructuredBuffer" = type { i32 } +%Constants = type { i32, i32 } + +define void @CSMain() { + %1 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false) ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex) + %2 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 0, i1 false) ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex) + %3 = call i32 @dx.op.threadId.i32(i32 93, i32 0) ; ThreadId(component) + %4 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %2, i32 0) ; CBufferLoadLegacy(handle,regIndex) + %5 = extractvalue %dx.types.CBufRet.i32 %4, 1 + %6 = add i32 %5, %3 + %7 = extractvalue %dx.types.CBufRet.i32 %4, 0 + %8 = icmp ult i32 %6, %7 + br i1 %8, label %9, label %13 + +;