Add basic DML support #268

Merged 5 commits on Apr 18, 2024
1 change: 1 addition & 0 deletions src/generators.h
@@ -42,6 +42,7 @@ using TokenSequences = std::vector<std::vector<int32_t>>;
enum struct DeviceType {
CPU,
CUDA,
DML,
};

struct GeneratorParams : std::enable_shared_from_this<GeneratorParams> {
4 changes: 3 additions & 1 deletion src/models/input_ids.cpp
@@ -46,7 +46,9 @@ void InputIDs::Update(RoamingArray<int32_t> next_tokens_unk) {
if (shape_[1] != 1) {
shape_[1] = 1;
if (!sb_input_ids_) {
value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
// DML doesn't support on-device updates of input ids yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
value_ = OrtValue::CreateTensor(allocator, shape_, type_);
} else {
value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, type_);
}
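This hunk is the first instance of an allocator-selection idiom that repeats throughout the PR: wherever DML cannot yet perform the update on-device, the tensor is allocated with the CPU allocator instead of the device allocator. A minimal sketch of the idea, with stand-in types rather than the real Ort allocator members:

```cpp
#include <cassert>

// Stand-in types for illustration only; the real code uses the Ort allocator types
// and the Model's allocator_cpu_ / allocator_device_ members.
enum struct DeviceType { CPU, CUDA, DML };
struct Allocator {};

// DML cannot yet run these updates on-device, so such tensors go into CPU memory;
// CPU and CUDA keep using the device allocator as before.
Allocator& PickAllocator(DeviceType device_type, Allocator& cpu_allocator, Allocator& device_allocator) {
  return device_type == DeviceType::DML ? cpu_allocator : device_allocator;
}

int main() {
  Allocator cpu, device;
  assert(&PickAllocator(DeviceType::DML, cpu, device) == &cpu);      // DML falls back to CPU memory
  assert(&PickAllocator(DeviceType::CUDA, cpu, device) == &device);  // CUDA stays on-device
  return 0;
}
```

The same ternary shows up again below in logits.cpp, model.cpp and position_inputs.cpp.
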
2 changes: 1 addition & 1 deletion src/models/kv_cache.cpp
@@ -117,7 +117,7 @@ KV_Cache::KV_Cache(const Model& model, State& state)
: model_{model},
state_{state},
layer_count_{model_.config_->model.decoder.num_hidden_layers},
past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA},
past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && (model_.device_type_ == DeviceType::CUDA || model_.device_type_ == DeviceType::DML)},
shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} {
pasts_.resize(layer_count_ * 2);
presents_.reserve(layer_count_ * 2);
13 changes: 9 additions & 4 deletions src/models/logits.cpp
@@ -9,7 +9,9 @@ Logits::Logits(const Model& model, State& state)
state_{state},
shape_{static_cast<int64_t>(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size},
type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} {
auto logits_tensor = OrtValue::CreateTensor(*model.allocator_device_, shape_, type_);
// DML doesn't support on-device scoring yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
auto logits_tensor = OrtValue::CreateTensor(allocator, shape_, type_);
if (type_ == Ort::TypeToTensorType<float>::type)
value32_ = std::move(logits_tensor);
else
@@ -28,10 +30,12 @@ Logits::Logits(const Model& model, State& state)

RoamingArray<float> Logits::Get() {
size_t element_count = shape_[0] * shape_[1] * shape_[2];
// DML doesn't support on-device scoring yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;

// Convert from float16 to float32 if necessary
if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
ConvertFp16ToFp32(*model_.allocator_device_, *value16_, value32_, model_.device_type_, model_.cuda_stream_);
ConvertFp16ToFp32(allocator, *value16_, value32_, model_.device_type_, model_.cuda_stream_);

// First iteration? Then copy the logits over to a {batch_beams, 1, vocab_size} tensor
// We'll reuse this tensor for all future iterations
Expand All @@ -42,8 +46,9 @@ RoamingArray<float> Logits::Get() {
const size_t num_beams = state_.params_->search.num_beams;

shape_[1] = 1;

// bugbug: not done yet
auto value_next = !sb_logits32_ ? OrtValue::CreateTensor<float>(*model_.allocator_device_, shape_)
auto value_next = !sb_logits32_ ? OrtValue::CreateTensor<float>(allocator, shape_)
: sb_logits32_->CreateTensorOnStaticBuffer(shape_, type_);
auto logits_next = cpu_span<float>{value_next->GetTensorMutableData<float>(), element_count};

@@ -77,7 +82,7 @@ RoamingArray<float> Logits::Get() {

value32_ = std::move(value_next);
if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
value16_ = !sb_logits16_ ? OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)
value16_ = !sb_logits16_ ? OrtValue::CreateTensor(allocator, shape_, type_)
: sb_logits16_->CreateTensorOnStaticBuffer(shape_, type_);

state_.outputs_[output_index_] = type_ == Ort::TypeToTensorType<float>::type ? value32_.get() : value16_.get();
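For context, the copy into the {batch_beams, 1, vocab_size} tensor above essentially keeps only each sequence's last-token scores, which is the usual reading of this step in a generation loop. A rough CPU-side sketch of that reshape, assuming row-major float logits (the real code works on OrtValue buffers and also handles fp16 via value16_/value32_; LastTokenLogits is a hypothetical name):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Keep only the last position's scores for each sequence:
// {batch_beams, sequence_length, vocab_size} -> {batch_beams, 1, vocab_size}.
std::vector<float> LastTokenLogits(const std::vector<float>& logits, std::size_t batch_beams,
                                   std::size_t sequence_length, std::size_t vocab_size) {
  std::vector<float> next(batch_beams * vocab_size);
  for (std::size_t b = 0; b < batch_beams; b++) {
    const float* last = logits.data() + (b * sequence_length + sequence_length - 1) * vocab_size;
    std::copy(last, last + vocab_size, next.begin() + b * vocab_size);
  }
  return next;
}
```
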
35 changes: 31 additions & 4 deletions src/models/model.cpp
@@ -194,6 +194,22 @@ Ort::Allocator* GetCudaAllocator(OrtSession& session) {
}
#endif

#if USE_DML
// Since Python/Others can and will hold onto a generator object past the model object's lifetime we need to ensure
// the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory
// has been destroyed.
Ort::Allocator* GetDmlAllocator(OrtSession& session) {
static std::unique_ptr<OrtMemoryInfo> memory_info_dml_;
static std::unique_ptr<Ort::Allocator> allocator_dml_;

if (!allocator_dml_) {
memory_info_dml_ = OrtMemoryInfo::Create("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
allocator_dml_ = Ort::Allocator::Create(session, *memory_info_dml_);
}
return allocator_dml_.get();
}
#endif
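
The comment above is the whole point of the function-local statics: a generator created from Python may outlive the Model, so the DML allocator has to stay alive until process exit. A generic sketch of that lifetime idiom, with an illustrative Resource type standing in for the Ort objects:

```cpp
#include <memory>

struct Resource {};  // stand-in for the OrtMemoryInfo / Ort::Allocator pair above

// A function-local static is destroyed only at process exit, i.e. after every
// object that merely borrowed the pointer, so a generator that outlives its
// Model never sees a dangling allocator.
Resource* GetSharedResource() {
  static std::unique_ptr<Resource> resource;
  if (!resource)
    resource = std::make_unique<Resource>();
  return resource.get();
}
```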

SessionInfo::SessionInfo(OrtSession& session) {
auto input_names = session.GetInputNames();
std::vector<ONNXTensorElementDataType> input_types(input_names.size());
@@ -247,7 +263,12 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
if (device_type_ == DeviceType::CUDA) {
allocator_device_ = GetCudaAllocator(session);
}
#elif USE_DML
if (device_type_ == DeviceType::DML) {
allocator_device_ = GetDmlAllocator(session);
}
#endif

session_info_ = std::make_unique<SessionInfo>(session);
}

@@ -326,6 +347,7 @@ void Model::CreateSessionOptions() {
ort_options.AppendExecutionProvider_ROCM(ort_provider_options);
#ifdef USE_DML
} else if (provider_options.name == "dml") {
device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors
const OrtDmlApi* p_dml_api{};
Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&p_dml_api)));
if (!p_dml_api)
@@ -386,6 +408,8 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<Or
auto* fp32 = p_out->GetTensorMutableData<float>();

switch (device_type) {
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we fall back to the CPU
case DeviceType::CPU:
for (int i = 0; i < count; i++)
fp32[i] = Float16ToFloat32(fp16[i]);
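
On the CPU (and now DML) path above, the conversion is a plain per-element loop over Float16ToFloat32. For reference, a self-contained scalar half-to-single conversion is sketched below; it illustrates standard IEEE fp16 decoding and is not copied from the library's implementation (the name HalfToFloat is made up here):

```cpp
#include <cstdint>
#include <cstring>

// Scalar IEEE half -> single conversion (normals, subnormals, inf/NaN) of the
// kind the per-element loop above relies on.
float HalfToFloat(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exponent = (h >> 10) & 0x1Fu;
  uint32_t mantissa = h & 0x3FFu;
  uint32_t bits;
  if (exponent == 0) {
    if (mantissa == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: renormalize into a normal float
      exponent = 127 - 15 + 1;
      while ((mantissa & 0x400u) == 0) {
        mantissa <<= 1;
        exponent--;
      }
      mantissa &= 0x3FFu;
      bits = sign | (exponent << 23) | (mantissa << 13);
    }
  } else if (exponent == 0x1Fu) {
    bits = sign | 0x7F800000u | (mantissa << 13);  // infinity or NaN
  } else {
    bits = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);  // normal value
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```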
@@ -439,8 +463,9 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,
// Input shape (batch_size, sequence_length). The input is required with data type T.
// Output shape (batch_size * num_beams, sequence_length)

// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later
if (num_beams == 1 && device_type_ == DeviceType::CPU) {
// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later.
// DML doesn't currently support on-device scoring, so we go the same route as the CPU
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML)) {
return std::move(input);
}

@@ -453,13 +478,15 @@

input_shape[0] *= num_beams;

auto expanded = OrtValue::CreateTensor(*allocator_device_, input_shape, element_type);

auto& allocator = device_type_ == DeviceType::DML ? allocator_cpu_ : *allocator_device_;
auto expanded = OrtValue::CreateTensor(allocator, input_shape, element_type);
const auto* input_data = reinterpret_cast<const uint8_t*>(input->GetTensorRawData());
auto* expanded_data = reinterpret_cast<uint8_t*>(expanded->GetTensorMutableRawData());
auto* target = expanded_data;

switch (device_type_) {
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
case DeviceType::CPU:
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < num_beams; j++) {
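For num_beams > 1, the CPU/DML branch of ExpandInputs shown above repeats each batch row num_beams times, turning (batch_size, sequence_length) into (batch_size * num_beams, sequence_length). A simplified, self-contained version of that copy using std::vector instead of OrtValue (ExpandForBeams is a hypothetical name):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Repeat each batch row num_beams times:
// (batch_size, sequence_length) -> (batch_size * num_beams, sequence_length).
std::vector<int32_t> ExpandForBeams(const std::vector<int32_t>& input, std::size_t batch_size,
                                    std::size_t sequence_length, std::size_t num_beams) {
  std::vector<int32_t> expanded(batch_size * num_beams * sequence_length);
  const std::size_t row_bytes = sequence_length * sizeof(int32_t);
  auto* target = reinterpret_cast<uint8_t*>(expanded.data());
  const auto* source = reinterpret_cast<const uint8_t*>(input.data());
  for (std::size_t i = 0; i < batch_size; i++) {
    for (std::size_t j = 0; j < num_beams; j++) {
      std::memcpy(target, source + i * row_bytes, row_bytes);  // copy batch row i once per beam
      target += row_bytes;
    }
  }
  return expanded;
}
```
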
8 changes: 7 additions & 1 deletion src/models/position_inputs.cpp
@@ -118,6 +118,8 @@ void PositionInputs::UpdatePositionIDs(int current_length) {
state_.inputs_[posid_input_index_] = position_ids_.get();
} else { // Just incrementing existing position IDs
switch (model_.device_type_) {
case DeviceType::DML:
// DML doesn't support on-device position ids update yet, so we fall back to the CPU
case DeviceType::CPU: {
if (type_ == Ort::TypeToTensorType<int32_t>::type)
UpdatePositionIDsImpl<int32_t>();
@@ -160,12 +162,16 @@ void PositionInputs::UpdateAttentionMask(int current_length) {
}
#endif
} else {
// DML doesn't support on-device mask updating yet, so use a CPU allocator
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
assert(attention_mask_shape_[1] == current_length - 1); // We should always be growing by 1
attention_mask_shape_[1] = current_length;
attention_mask_next_ = OrtValue::CreateTensor(*model_.allocator_device_, attention_mask_shape_, type_);
attention_mask_next_ = OrtValue::CreateTensor(allocator, attention_mask_shape_, type_);
}

switch (model_.device_type_) {
case DeviceType::DML:
// DML doesn't support on-device mask updating yet, so we fall back to the CPU
case DeviceType::CPU: {
if (type_ == Ort::TypeToTensorType<int32_t>::type)
UpdateAttentionMaskImpl(attention_mask_next_->GetTensorMutableData<int32_t>(),
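Because DML falls back to the CPU here, every decoding step allocates a mask one column wider and refills it on the host. A rough sketch of what that growth can look like, assuming a row-major {batch_beams, length} mask of 1s for valid positions; the real UpdateAttentionMaskImpl operates on the Ort tensors above, and this GrowAttentionMask helper is hypothetical:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Grow a row-major {batch_beams, current_length - 1} mask to
// {batch_beams, current_length}, marking the newly generated position valid.
std::vector<int32_t> GrowAttentionMask(const std::vector<int32_t>& mask, std::size_t batch_beams,
                                       std::size_t current_length) {
  const std::size_t old_length = current_length - 1;  // we always grow by exactly 1
  std::vector<int32_t> next(batch_beams * current_length);
  for (std::size_t b = 0; b < batch_beams; b++) {
    for (std::size_t j = 0; j < old_length; j++)
      next[b * current_length + j] = mask[b * old_length + j];  // copy the existing row
    next[b * current_length + old_length] = 1;  // the new token can be attended to
  }
  return next;
}
```
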
2 changes: 2 additions & 0 deletions src/python/CMakeLists.txt
@@ -32,6 +32,8 @@ if(BUILD_WHEEL)
message("Setting up wheel files in : ${WHEEL_FILES_DIR}")
if(USE_CUDA)
set(TARGET_NAME "onnxruntime-genai-cuda")
elseif(USE_DML)
set(TARGET_NAME "onnxruntime-genai-dml")
else()
set(TARGET_NAME "onnxruntime-genai")
endif()
9 changes: 5 additions & 4 deletions src/python/py/models/builder.py
@@ -162,12 +162,13 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
"use_rotemb_in_attn": False, # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op)
"use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
}
if self.ep == "cuda" and self.io_dtype == TensorProto.FLOAT16:
if self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16:
# Change model settings for GroupQueryAttention
self.attention_attrs["op_type"] = "GroupQueryAttention"
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 CUDA and FP16 CUDA.")
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")

self.attention_attrs["use_packed_matmul"] = self.num_attn_heads == self.num_kv_heads
# DML doesn't support stacked Q/K/V for GQA yet
self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads

# GQA + Rot.Emb. does not require `position ids` as input
self.attention_attrs["use_rotemb_in_attn"] = True
@@ -1767,7 +1768,7 @@ def get_args():
"-e",
"--execution_provider",
required=True,
choices=["cpu", "cuda"],
choices=["cpu", "cuda", "dml"],
help="Execution provider to target with precision of model (e.g. FP16 CUDA, INT4 CPU)",
)
