Add basic DML support #268

Merged 5 commits on Apr 18, 2024
1 change: 1 addition & 0 deletions src/generators.h
@@ -42,6 +42,7 @@ using TokenSequences = std::vector<std::vector<int32_t>>;
enum struct DeviceType {
CPU,
CUDA,
DML,
};

struct GeneratorParams : std::enable_shared_from_this<GeneratorParams> {
4 changes: 3 additions & 1 deletion src/models/input_ids.cpp
@@ -46,7 +46,9 @@ void InputIDs::Update(RoamingArray<int32_t> next_tokens_unk) {
if (shape_[1] != 1) {
shape_[1] = 1;
if (!sb_input_ids_) {
value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
// DML doesn't support on-device updates of input ids yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
value_ = OrtValue::CreateTensor(allocator, shape_, type_);
} else {
value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, type_);
}
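This hunk is the first instance of an allocator-selection idiom that repeats throughout the PR: wherever DML cannot yet perform the update on-device, the tensor is allocated with the CPU allocator instead of the device allocator. A minimal sketch of the idea, with stand-in types rather than the real Ort allocator members:

```cpp
#include <cassert>

// Stand-in types for illustration only; the real code uses the Ort allocator types
// and the Model's allocator_cpu_ / allocator_device_ members.
enum struct DeviceType { CPU, CUDA, DML };
struct Allocator {};

// DML cannot yet run these updates on-device, so such tensors go into CPU memory;
// CPU and CUDA keep using the device allocator as before.
Allocator& PickAllocator(DeviceType device_type, Allocator& cpu_allocator, Allocator& device_allocator) {
  return device_type == DeviceType::DML ? cpu_allocator : device_allocator;
}

int main() {
  Allocator cpu, device;
  assert(&PickAllocator(DeviceType::DML, cpu, device) == &cpu);      // DML falls back to CPU memory
  assert(&PickAllocator(DeviceType::CUDA, cpu, device) == &device);  // CUDA stays on-device
  return 0;
}
```

The same ternary shows up again below in logits.cpp, model.cpp and position_inputs.cpp.
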
2 changes: 1 addition & 1 deletion src/models/kv_cache.cpp
@@ -117,7 +117,7 @@ KV_Cache::KV_Cache(const Model& model, State& state)
: model_{model},
state_{state},
layer_count_{model_.config_->model.decoder.num_hidden_layers},
past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA},
past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && (model_.device_type_ == DeviceType::CUDA || model_.device_type_ == DeviceType::DML)},
shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} {
pasts_.resize(layer_count_ * 2);
presents_.reserve(layer_count_ * 2);
13 changes: 9 additions & 4 deletions src/models/logits.cpp
@@ -9,7 +9,9 @@ Logits::Logits(const Model& model, State& state)
state_{state},
shape_{static_cast<int64_t>(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size},
type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} {
auto logits_tensor = OrtValue::CreateTensor(*model.allocator_device_, shape_, type_);
// DML doesn't support on-device scoring yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
auto logits_tensor = OrtValue::CreateTensor(allocator, shape_, type_);
if (type_ == Ort::TypeToTensorType<float>::type)
value32_ = std::move(logits_tensor);
else
@@ -28,10 +30,12 @@ Logits::Logits(const Model& model, State& state)

RoamingArray<float> Logits::Get() {
size_t element_count = shape_[0] * shape_[1] * shape_[2];
// DML doesn't support on-device scoring yet, so fall back to the CPU
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;

// Convert from float16 to float32 if necessary
if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
ConvertFp16ToFp32(*model_.allocator_device_, *value16_, value32_, model_.device_type_, model_.cuda_stream_);
ConvertFp16ToFp32(allocator, *value16_, value32_, model_.device_type_, model_.cuda_stream_);

// First iteration? Then copy the logits over to a {batch_beams, 1, vocab_size} tensor
// We'll reuse this tensor for all future iterations
Expand All @@ -42,8 +46,9 @@ RoamingArray<float> Logits::Get() {
const size_t num_beams = state_.params_->search.num_beams;

shape_[1] = 1;

// bugbug: not done yet
auto value_next = !sb_logits32_ ? OrtValue::CreateTensor<float>(*model_.allocator_device_, shape_)
auto value_next = !sb_logits32_ ? OrtValue::CreateTensor<float>(allocator, shape_)
: sb_logits32_->CreateTensorOnStaticBuffer(shape_, type_);
auto logits_next = cpu_span<float>{value_next->GetTensorMutableData<float>(), element_count};

@@ -77,7 +82,7 @@ RoamingArray<float> Logits::Get() {

value32_ = std::move(value_next);
if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
value16_ = !sb_logits16_ ? OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)
value16_ = !sb_logits16_ ? OrtValue::CreateTensor(allocator, shape_, type_)
: sb_logits16_->CreateTensorOnStaticBuffer(shape_, type_);

state_.outputs_[output_index_] = type_ == Ort::TypeToTensorType<float>::type ? value32_.get() : value16_.get();
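For context, the copy into the {batch_beams, 1, vocab_size} tensor above essentially keeps only each sequence's last-token scores, which is the usual reading of this step in a generation loop. A rough CPU-side sketch of that reshape, assuming row-major float logits (the real code works on OrtValue buffers and also handles fp16 via value16_/value32_; LastTokenLogits is a hypothetical name):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Keep only the last position's scores for each sequence:
// {batch_beams, sequence_length, vocab_size} -> {batch_beams, 1, vocab_size}.
std::vector<float> LastTokenLogits(const std::vector<float>& logits, std::size_t batch_beams,
                                   std::size_t sequence_length, std::size_t vocab_size) {
  std::vector<float> next(batch_beams * vocab_size);
  for (std::size_t b = 0; b < batch_beams; b++) {
    const float* last = logits.data() + (b * sequence_length + sequence_length - 1) * vocab_size;
    std::copy(last, last + vocab_size, next.begin() + b * vocab_size);
  }
  return next;
}
```
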
35 changes: 31 additions & 4 deletions src/models/model.cpp
@@ -194,6 +194,22 @@ Ort::Allocator* GetCudaAllocator(OrtSession& session) {
}
#endif

#if USE_DML
// Since Python/Others can and will hold onto a generator object past the model object's lifetime we need to ensure
// the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory
// has been destroyed.
Ort::Allocator* GetDmlAllocator(OrtSession& session) {
static std::unique_ptr<OrtMemoryInfo> memory_info_dml_;
static std::unique_ptr<Ort::Allocator> allocator_dml_;

if (!allocator_dml_) {
memory_info_dml_ = OrtMemoryInfo::Create("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
allocator_dml_ = Ort::Allocator::Create(session, *memory_info_dml_);
}
return allocator_dml_.get();
}
#endif
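
The comment above is the whole point of the function-local statics: a generator created from Python may outlive the Model, so the DML allocator has to stay alive until process exit. A generic sketch of that lifetime idiom, with an illustrative Resource type standing in for the Ort objects:

```cpp
#include <memory>

struct Resource {};  // stand-in for the OrtMemoryInfo / Ort::Allocator pair above

// A function-local static is destroyed only at process exit, i.e. after every
// object that merely borrowed the pointer, so a generator that outlives its
// Model never sees a dangling allocator.
Resource* GetSharedResource() {
  static std::unique_ptr<Resource> resource;
  if (!resource)
    resource = std::make_unique<Resource>();
  return resource.get();
}
```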

SessionInfo::SessionInfo(OrtSession& session) {
auto input_names = session.GetInputNames();
std::vector<ONNXTensorElementDataType> input_types(input_names.size());
@@ -247,7 +263,12 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
if (device_type_ == DeviceType::CUDA) {
allocator_device_ = GetCudaAllocator(session);
}
#elif USE_DML
if (device_type_ == DeviceType::DML) {
allocator_device_ = GetDmlAllocator(session);
}
#endif

session_info_ = std::make_unique<SessionInfo>(session);
}

@@ -326,6 +347,7 @@ void Model::CreateSessionOptions() {
ort_options.AppendExecutionProvider_ROCM(ort_provider_options);
#ifdef USE_DML
} else if (provider_options.name == "dml") {
device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors
const OrtDmlApi* p_dml_api{};
Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&p_dml_api)));
if (!p_dml_api)
@@ -386,6 +408,8 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<Or
auto* fp32 = p_out->GetTensorMutableData<float>();

switch (device_type) {
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we fall back to the CPU
case DeviceType::CPU:
for (int i = 0; i < count; i++)
fp32[i] = Float16ToFloat32(fp16[i]);
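
On the CPU (and now DML) path above, the conversion is a plain per-element loop over Float16ToFloat32. For reference, a self-contained scalar half-to-single conversion is sketched below; it illustrates standard IEEE fp16 decoding and is not copied from the library's implementation (the name HalfToFloat is made up here):

```cpp
#include <cstdint>
#include <cstring>

// Scalar IEEE half -> single conversion (normals, subnormals, inf/NaN) of the
// kind the per-element loop above relies on.
float HalfToFloat(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exponent = (h >> 10) & 0x1Fu;
  uint32_t mantissa = h & 0x3FFu;
  uint32_t bits;
  if (exponent == 0) {
    if (mantissa == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: renormalize into a normal float
      exponent = 127 - 15 + 1;
      while ((mantissa & 0x400u) == 0) {
        mantissa <<= 1;
        exponent--;
      }
      mantissa &= 0x3FFu;
      bits = sign | (exponent << 23) | (mantissa << 13);
    }
  } else if (exponent == 0x1Fu) {
    bits = sign | 0x7F800000u | (mantissa << 13);  // infinity or NaN
  } else {
    bits = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);  // normal value
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```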
@@ -439,8 +463,9 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,
// Input shape (batch_size, sequence_length). The input is required with data type T.
// Output shape (batch_size * num_beams, sequence_length)

// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later
if (num_beams == 1 && device_type_ == DeviceType::CPU) {
// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later.
// DML doesn't currently support on-device scoring, so we go the same route as the CPU
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML)) {
return std::move(input);
}

@@ -453,13 +478,15 @@

input_shape[0] *= num_beams;

auto expanded = OrtValue::CreateTensor(*allocator_device_, input_shape, element_type);

auto& allocator = device_type_ == DeviceType::DML ? allocator_cpu_ : *allocator_device_;
auto expanded = OrtValue::CreateTensor(allocator, input_shape, element_type);
const auto* input_data = reinterpret_cast<const uint8_t*>(input->GetTensorRawData());
auto* expanded_data = reinterpret_cast<uint8_t*>(expanded->GetTensorMutableRawData());
auto* target = expanded_data;

switch (device_type_) {
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
case DeviceType::CPU:
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < num_beams; j++) {
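For num_beams > 1, the CPU/DML branch of ExpandInputs shown above repeats each batch row num_beams times, turning (batch_size, sequence_length) into (batch_size * num_beams, sequence_length). A simplified, self-contained version of that copy using std::vector instead of OrtValue (ExpandForBeams is a hypothetical name):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Repeat each batch row num_beams times:
// (batch_size, sequence_length) -> (batch_size * num_beams, sequence_length).
std::vector<int32_t> ExpandForBeams(const std::vector<int32_t>& input, std::size_t batch_size,
                                    std::size_t sequence_length, std::size_t num_beams) {
  std::vector<int32_t> expanded(batch_size * num_beams * sequence_length);
  const std::size_t row_bytes = sequence_length * sizeof(int32_t);
  auto* target = reinterpret_cast<uint8_t*>(expanded.data());
  const auto* source = reinterpret_cast<const uint8_t*>(input.data());
  for (std::size_t i = 0; i < batch_size; i++) {
    for (std::size_t j = 0; j < num_beams; j++) {
      std::memcpy(target, source + i * row_bytes, row_bytes);  // copy batch row i once per beam
      target += row_bytes;
    }
  }
  return expanded;
}
```
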
8 changes: 7 additions & 1 deletion src/models/position_inputs.cpp
@@ -118,6 +118,8 @@ void PositionInputs::UpdatePositionIDs(int current_length) {
state_.inputs_[posid_input_index_] = position_ids_.get();
} else { // Just incrementing existing position IDs
switch (model_.device_type_) {
case DeviceType::DML:
// DML doesn't support on-device position ids update yet, so we fall back to the CPU
case DeviceType::CPU: {
if (type_ == Ort::TypeToTensorType<int32_t>::type)
UpdatePositionIDsImpl<int32_t>();
@@ -160,12 +162,16 @@ void PositionInputs::UpdateAttentionMask(int current_length) {
}
#endif
} else {
// DML doesn't support on-device mask updating yet, so use a CPU allocator
auto& allocator = model_.device_type_ == DeviceType::DML ? model_.allocator_cpu_ : *model_.allocator_device_;
assert(attention_mask_shape_[1] == current_length - 1); // We should always be growing by 1
attention_mask_shape_[1] = current_length;
attention_mask_next_ = OrtValue::CreateTensor(*model_.allocator_device_, attention_mask_shape_, type_);
attention_mask_next_ = OrtValue::CreateTensor(allocator, attention_mask_shape_, type_);
}

switch (model_.device_type_) {
case DeviceType::DML:
// DML doesn't support on-device mask updating yet, so we fall back to the CPU
case DeviceType::CPU: {
if (type_ == Ort::TypeToTensorType<int32_t>::type)
UpdateAttentionMaskImpl(attention_mask_next_->GetTensorMutableData<int32_t>(),
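Because DML falls back to the CPU here, every decoding step allocates a mask one column wider and refills it on the host. A rough sketch of what that growth can look like, assuming a row-major {batch_beams, length} mask of 1s for valid positions; the real UpdateAttentionMaskImpl operates on the Ort tensors above, and this GrowAttentionMask helper is hypothetical:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Grow a row-major {batch_beams, current_length - 1} mask to
// {batch_beams, current_length}, marking the newly generated position valid.
std::vector<int32_t> GrowAttentionMask(const std::vector<int32_t>& mask, std::size_t batch_beams,
                                       std::size_t current_length) {
  const std::size_t old_length = current_length - 1;  // we always grow by exactly 1
  std::vector<int32_t> next(batch_beams * current_length);
  for (std::size_t b = 0; b < batch_beams; b++) {
    for (std::size_t j = 0; j < old_length; j++)
      next[b * current_length + j] = mask[b * old_length + j];  // copy the existing row
    next[b * current_length + old_length] = 1;  // the new token can be attended to
  }
  return next;
}
```
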
2 changes: 2 additions & 0 deletions src/python/CMakeLists.txt
@@ -32,6 +32,8 @@ if(BUILD_WHEEL)
message("Setting up wheel files in : ${WHEEL_FILES_DIR}")
if(USE_CUDA)
set(TARGET_NAME "onnxruntime-genai-cuda")
elseif(USE_DML)
set(TARGET_NAME "onnxruntime-genai-dml")
else()
set(TARGET_NAME "onnxruntime-genai")
endif()
9 changes: 5 additions & 4 deletions src/python/py/models/builder.py
@@ -162,12 +162,13 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
"use_rotemb_in_attn": False, # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op)
"use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
}
if self.ep == "cuda" and self.io_dtype == TensorProto.FLOAT16:
if self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16:
# Change model settings for GroupQueryAttention
self.attention_attrs["op_type"] = "GroupQueryAttention"
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 CUDA and FP16 CUDA.")
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")

self.attention_attrs["use_packed_matmul"] = self.num_attn_heads == self.num_kv_heads
# DML doesn't support stacked Q/K/V for GQA yet
self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads

# GQA + Rot.Emb. does not require `position ids` as input
self.attention_attrs["use_rotemb_in_attn"] = True
@@ -1767,7 +1768,7 @@ def get_args():
"-e",
"--execution_provider",
required=True,
choices=["cpu", "cuda"],
choices=["cpu", "cuda", "dml"],
help="Execution provider to target with precision of model (e.g. FP16 CUDA, INT4 CPU)",
)
