Commit

Address PR comments
PatriceVignola committed Apr 22, 2024
1 parent 261b0b7 commit d33f5ef
Showing 36 changed files with 29 additions and 31 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -93,8 +93,8 @@ if(USE_DML)
   add_compile_definitions(NOMINMAX)
 
   file(GLOB dml_srcs CONFIGURE_DEPENDS
-    "${MODELS_ROOT}/dml/*.h"
-    "${MODELS_ROOT}/dml/*.cpp"
+    "${PROJECT_SOURCE_DIR}/src/dml/*.h"
+    "${PROJECT_SOURCE_DIR}/src/dml/*.cpp"
   )
 
   list(APPEND generator_srcs ${dml_srcs})
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -6,7 +6,7 @@
 #include <wil/result.h>
 #include "dml_command_recorder.h"
 #include "dml_command_queue.h"
-#include "../onnxruntime_api.h"
+#include "../models/onnxruntime_api.h"
 
 DmlCommandRecorder::DmlCommandRecorder(
     ID3D12Device* d3d_device,
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -7,7 +7,7 @@
 #include <d3dx12.h>
 #include "dml_command_recorder.h"
 #include "dml_gpu_event.h"
-#include "../onnxruntime_api.h"
+#include "../models/onnxruntime_api.h"
 
 // Asynchronously performs GPU work, and automatically manages command list recording and submission to queues.
 // Work submitted to the DmlExecutionContext is typically recorded onto a command list and may not immediately begin
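(Aside, not part of the diff: a minimal, D3D12-free sketch of the record-now/submit-later idea the DmlExecutionContext comment above describes. DeferredContext and its members are made-up names used only for illustration.)

#include <functional>
#include <utility>
#include <vector>

// Hypothetical stand-in: work items are recorded into a pending list and only
// run when Flush() is called, mirroring how work handed to an execution
// context is recorded onto a command list and submitted later in a batch.
class DeferredContext {
 public:
  void Record(std::function<void()> work) { pending_.push_back(std::move(work)); }

  // Submits everything recorded so far as one batch.
  void Flush() {
    for (auto& work : pending_) work();
    pending_.clear();
  }

 private:
  std::vector<std::function<void()>> pending_;
};

Callers record several operations and flush once, which is what makes batched submission cheaper than executing each piece of work eagerly.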
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -6,7 +6,7 @@
 #include <memory>
 #include <wrl/client.h>
 #include <wrl/implements.h>
-#include "onnxruntime_api.h"
+#include "../models/onnxruntime_api.h"
 
 // Allows objects to be added to a D3D12 object via SetPrivateDataInterface and extend its lifetime beyond the life of the model. For
 // example, we can put the DML allocator on the D3D12 device (which is a unique singleton for each adapter) and be sure that the allocator won't be
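(Aside, not part of the diff: a hedged sketch of the lifetime-extension trick the comment above describes, using the documented ID3D12Object::SetPrivateDataInterface call. MyPayload, kMyPayloadGuid, and AttachPayload are made-up names, not the project's API.)

#include <d3d12.h>
#include <wrl/client.h>
#include <wrl/implements.h>

using Microsoft::WRL::ComPtr;

// A refcounted COM object holding state that must outlive its creator,
// e.g. an allocator or cached resources.
struct MyPayload : Microsoft::WRL::RuntimeClass<
                       Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>,
                       IUnknown> {
};

// Arbitrary GUID used as the private-data key (made-up value).
static constexpr GUID kMyPayloadGuid = {
    0x4f2a1c3e, 0x9b7d, 0x4e21, {0x8a, 0x10, 0x5c, 0x33, 0x7f, 0x01, 0xab, 0xcd}};

void AttachPayload(ID3D12Device* device) {
  ComPtr<MyPayload> payload = Microsoft::WRL::Make<MyPayload>();
  // The device AddRef()s the payload, so it stays alive until the device is
  // destroyed or the key is overwritten, even after its creator goes away.
  device->SetPrivateDataInterface(kMyPayloadGuid, payload.Get());
}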
File renamed without changes.
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions src/models/input_ids.h
@@ -23,11 +23,11 @@ struct InputIDs {
   std::unique_ptr<OrtValue> value_;
 
   // Used for decoding runs with cuda graphs.
-  StaticBuffer* sb_input_ids_ = nullptr;
+  StaticBuffer* sb_input_ids_{};
 
 #if USE_DML
   std::unique_ptr<OrtValue> value_int32_;
-  StaticBuffer* sb_input_ids_int32_ = nullptr;
+  StaticBuffer* sb_input_ids_int32_{};
   DmlReusedCommandListState input_ids_cast_command_list_state_{};
 #endif
 };
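(Aside, not part of the diff: the brace form above is value-initialization and leaves the pointer null, exactly like the old "= nullptr" spelling; it simply matches the brace style already used for the neighboring members. A tiny standalone check, with made-up names:)

#include <cassert>

struct StaticBuffer;  // stand-in declaration; only pointers are used here

struct Example {
  StaticBuffer* a_{};          // value-initialized: starts out null
  StaticBuffer* b_ = nullptr;  // explicit form; same end state
};

int main() {
  Example e;
  assert(e.a_ == nullptr && e.b_ == nullptr);  // both members are null
  return 0;
}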
18 changes: 8 additions & 10 deletions src/models/logits.cpp
@@ -99,22 +99,20 @@ RoamingArray<float> Logits::Get() {
     } break;
 #endif
 
-#if USE_CUDA
+    case DeviceType::CPU:
     case DeviceType::CUDA: {
       auto logits = std::span<float>{value32_->GetTensorMutableData<float>(), element_count};
       auto logits_next = gpu_span<float>{value_next->GetTensorMutableData<float>(), element_count};
       auto target = logits_next.subspan(vocab_index, vocab_size);
       std::span<const float> source = logits.subspan(vocab_index * seq_length + token_index * vocab_size, vocab_size);
-      CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_->cuda_stream);
-
-    } break;
+      if (model_.device_type_ == DeviceType::CUDA)
+#if USE_CUDA
+        CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_->cuda_stream);
+#else
+        throw std::runtime_error("Unexpected CUDA device usage");
 #endif
-    case DeviceType::CPU: {
-      auto logits = std::span<float>{value32_->GetTensorMutableData<float>(), element_count};
-      auto logits_next = cpu_span<float>{value_next->GetTensorMutableData<float>(), element_count};
-      auto target = logits_next.subspan(vocab_index, vocab_size);
-      std::span<const float> source = logits.subspan(vocab_index * seq_length + token_index * vocab_size, vocab_size);
-      copy(source, target);
+      else
+        copy(source, target);
     } break;
   }
 
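(Aside, not part of the diff: a compressed, standalone sketch of the dispatch pattern the new code uses: one shared case, a runtime device check, and a preprocessor guard so a non-CUDA build fails loudly if a CUDA device is requested. CopyLogits and its parameters are illustrative, not the project's API.)

#include <algorithm>
#include <span>
#include <stdexcept>

#if USE_CUDA
#include <cuda_runtime.h>
#endif

enum class DeviceType { CPU, CUDA };

// Copies source into target on the requested device. The CUDA path is only
// compiled when USE_CUDA is set; otherwise asking for CUDA throws instead of
// silently falling back to the CPU copy.
void CopyLogits(DeviceType device, std::span<const float> source, std::span<float> target) {
  if (device == DeviceType::CUDA)
#if USE_CUDA
    cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(),
                    cudaMemcpyDeviceToDevice, /*stream*/ nullptr);
#else
    throw std::runtime_error("Unexpected CUDA device usage");
#endif
  else
    std::copy(source.begin(), source.end(), target.begin());
}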
4 changes: 2 additions & 2 deletions src/models/logits.h
@@ -21,8 +21,8 @@ struct Logits {
   std::unique_ptr<OrtValue> value16_;  // When model output is fp16
 
   // Used for decoding runs with cuda graphs.
-  StaticBuffer* sb_logits32_ = nullptr;
-  StaticBuffer* sb_logits16_ = nullptr;
+  StaticBuffer* sb_logits32_{};
+  StaticBuffer* sb_logits16_{};
 
 #if USE_DML
   DmlReusedCommandListState logits_cast_command_list_state_{};
2 changes: 1 addition & 1 deletion src/models/model.cpp
@@ -11,7 +11,7 @@
 #ifdef USE_DML
 #include <wil/wrl.h>
 #include "dml_provider_factory.h"
-#include "dml/dml_smart_container.h"
+#include "../dml/dml_smart_container.h"
 
 EXTERN_C IMAGE_DOS_HEADER __ImageBase;
 
10 changes: 5 additions & 5 deletions src/models/model.h
@@ -7,10 +7,10 @@
 
 #ifdef USE_DML
 #include "dml_provider_factory.h"
-#include "dml/dml_helpers.h"
-#include "dml/dml_execution_context.h"
-#include "dml/dml_pooled_upload_heap.h"
-#include "dml/dml_readback_heap.h"
+#include "../dml/dml_helpers.h"
+#include "../dml/dml_execution_context.h"
+#include "../dml/dml_pooled_upload_heap.h"
+#include "../dml/dml_readback_heap.h"
 #endif
 
 namespace Generators {
@@ -155,7 +155,7 @@ struct Model : std::enable_shared_from_this<Model> {
  private:
 #if USE_DML
   mutable DmlObjects dml_objects_;
-  const OrtDmlApi* p_dml_api_ = nullptr;
+  const OrtDmlApi* p_dml_api_{};
   std::unique_ptr<DmlPooledUploadHeap> dml_pooled_upload_heap_;
   std::unique_ptr<DmlExecutionContext> dml_execution_context_;
   std::unique_ptr<DmlReadbackHeap> dml_readback_heap_;
2 changes: 1 addition & 1 deletion src/models/position_inputs.cpp
@@ -4,7 +4,7 @@
 #include "kernels.h"
 
 #if USE_DML
-#include "dml/dml_update_mask_kernel.h"
+#include "../dml/dml_update_mask_kernel.h"
 #endif
 
 namespace Generators {
10 changes: 5 additions & 5 deletions src/models/position_inputs.h
@@ -3,8 +3,8 @@
 #include "static_buffer.h"
 
 #if USE_DML
-#include "dml/dml_update_mask_kernel.h"
-#include "dml/dml_increment_values_kernel.h"
+#include "../dml/dml_update_mask_kernel.h"
+#include "../dml/dml_increment_values_kernel.h"
 #endif
 
 namespace Generators {
@@ -51,15 +51,15 @@ struct PositionInputs {
   std::vector<int32_t> initial_sequence_lengths_;
 
   // Used for decoding runs with cuda graphs.
-  StaticBuffer* sb_position_ids_ = nullptr;
-  StaticBuffer* sb_attention_mask_ = nullptr;
+  StaticBuffer* sb_position_ids_{};
+  StaticBuffer* sb_attention_mask_{};
 
   bool is_first_posid_update_{true};
   bool is_first_mask_update_{true};
 
 #ifdef USE_DML
   std::optional<DmlUpdateMaskKernel> dml_update_mask_kernel_;
-  StaticBuffer* sb_attention_mask_next_ = nullptr;
+  StaticBuffer* sb_attention_mask_next_{};
   std::optional<DmlIncrementValuesKernel> dml_update_position_ids_kernel_;
   bool is_second_mask_update_{false};
 #endif
