Cherry-pick for 1.17.1 patch release #19477

Merged Feb 21, 2024 (23 commits)

Commits (the diff below shows changes from 6 of the 23 commits)
506eddb
Whisper Crash Fix (#19345)
petermcaughan Jan 31, 2024
6e61306
Fix Split index bugs uncovered by QNN SDK 2.19 (#19381)
adrianlizarraga Feb 2, 2024
ad63507
[DML EP] Fix external data unpacking (#19415)
PatriceVignola Feb 7, 2024
a77ee4a
Add contrib Q/DQ ops to symbolic shape inference tool (#19340)
adrianlizarraga Jan 31, 2024
5269e93
[Quant tool] Ensure MSFT opset for Q/DQ models (#19335)
adrianlizarraga Jan 31, 2024
c1ce74d
Windows - Only set thread affinity on Server with auto affinity (#19318)
ivberg Jan 30, 2024
098ef2c
[js/web] fix types exports in package.json (#19458)
fs-eire Feb 8, 2024
f5f5cc8
Add capturestate / rundown ETW support logging for session and provid…
ivberg Feb 8, 2024
e02b783
Disable streams for the DML EP (#19481)
PatriceVignola Feb 10, 2024
14543de
Remove cuda gencode 90 to reduce onnxruntime-training package size (#…
baijumeswani Feb 12, 2024
605adb0
Ovep 1.17.1 (#19482)
preetha-intel Feb 12, 2024
27c0a2f
[QNN EP] Build x64 python wheel for QNN EP (#19499)
adrianlizarraga Feb 13, 2024
61730bd
Fix subgraph quantization regression in onnxruntime 1.17 (#19421)
fxmarty Feb 13, 2024
166488e
Restrict L2 Cache Core check to Intel devices (#19483)
smk2007 Feb 14, 2024
ad02db8
Update the default std flag used during torch extensions compilation …
baijumeswani Feb 14, 2024
4917fff
add ATen support for bicubic interpolation (#19380)
prathikr Feb 5, 2024
34c3623
Optimize KahnsTopologicalSort and PriorityNodeCompare (#19475)
smk2007 Feb 16, 2024
ad86d13
Support ONNX export of OpenAi Whisper model (#17316)
shubhambhokare1 Feb 9, 2024
485e17e
Whisper Timestamps and Temperature (#19509)
kunal-vaishnavi Feb 16, 2024
e79a06b
Enable DML on Windows and CUDA on Linux for Node.js binding (#19274)
jchen351 Feb 5, 2024
e96506e
add option DefaultTensorType to specify the default tensor type to qu…
xadupre Feb 20, 2024
1aa73b2
Disable __cpuid check on arm64 builds as intrinsic is not available (…
smk2007 Feb 20, 2024
d636587
Changed command line argpasrse to process '--symmetric [True|False]'.…
satyajandhyala Feb 21, 2024
@@ -258,7 +258,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}

@@ -214,7 +214,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}

@@ -226,7 +226,7 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}
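These three hunks make the same change in the GPT, T5 and Whisper beam-search paths: the host-to-device copy of the sequence buffer is now enqueued on the session's stream (this->ort_stream_) instead of a null stream, so the copy stays ordered with the kernels that consume it. A minimal CuPy sketch of that general pattern (CuPy is only an illustration here, not something ONNX Runtime uses):

```python
# Keep the H2D copy and the kernels that read it on the same stream, so the copy
# cannot race with the work that depends on the copied data.
import cupy as cp
import numpy as np

stream = cp.cuda.Stream(non_blocking=True)
host_sequences = np.arange(8, dtype=np.int32)

with stream:
    device_sequences = cp.asarray(host_sequences)  # copy enqueued on `stream`
    device_sequences += 1                          # kernel on the same stream, ordered after the copy

stream.synchronize()
print(cp.asnumpy(device_sequences))
```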

@@ -344,20 +344,25 @@ namespace Dml::GraphDescBuilder
dmlFusedNodeInputIndex < isConstGpuGraphInputCount &&
isConstGpuGraphInput[dmlFusedNodeInputIndex])
{
// This is a highly inefficient approach to generating constant nodes. It duplicates constant data
// across the graph input as well as every consumer's unique constant node. However it is currently
// This is a highly inefficient approach to generating constant nodes. It duplicates constant data
// across the graph input as well as every consumer's unique constant node. However it is currently
// only used for small inputs.
uint32_t c_maxConstNodeDataSize = 8;

ComPtr<OnnxTensorWrapper> constantInput = constantCpuGraphInputGetter(arg->Name());

auto& operatorGraphInputNode = graphNodeCreateInfo.nodesAsOperatorDesc[operatorGraphInputEdge.ToNodeIndex];
std::vector<DmlBufferTensorDesc*> toNodeInputTensorDescs = operatorGraphInputNode->GetInputTensors();
DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorGraphInputEdge.ToNodeInputIndex];
ComPtr<OnnxTensorWrapper> constantInput;

if (constantInput && tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize)
if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize)
{
// The tensor description's size should be no larger than the constant input unless it was rounded to
constantInput = constantCpuGraphInputGetter(arg->Name());
}

if (constantInput)
{
// The tensor description's size should be no larger than the constant input unless it was rounded to
// the required alignment.
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes);
size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast<size_t>(tensorDesc->totalTensorSizeInBytes));
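The reordering above defers the constantCpuGraphInputGetter call until after the size check, so constant data is only materialized for inputs small enough to become constant nodes. A rough Python sketch of that control flow (hypothetical names, not the DML EP API):

```python
MAX_CONST_NODE_DATA_SIZE = 8  # bytes, mirrors c_maxConstNodeDataSize above

def maybe_fetch_constant(tensor_size_bytes, fetch_constant):
    # Only pay for fetching (and possibly unpacking external data) when the tensor
    # is small enough to be folded into a constant node at all.
    if tensor_size_bytes < MAX_CONST_NODE_DATA_SIZE:
        return fetch_constant()
    return None

print(maybe_fetch_constant(4, lambda: b"\x2a\x00\x00\x00"))   # fetched
print(maybe_fetch_constant(1024, lambda: b"\x00" * 1024))     # skipped -> None
```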
@@ -1123,7 +1123,7 @@ namespace Windows::AI::MachineLearning::Adapter
}
ORT_CATCH_RETURN
}

template <class NodeInfoImpl_t, class Base1_t, class Base2_t>
HRESULT STDMETHODCALLTYPE OpNodeInfoWrapper<NodeInfoImpl_t, Base1_t, Base2_t>::GetConstantInputTensor(uint32_t inputIndex, IMLOperatorTensor** tensor) const noexcept
{
@@ -1168,7 +1168,7 @@ namespace Windows::AI::MachineLearning::Adapter
m_requiredConstantCpuInputs.begin(),
m_requiredConstantCpuInputs.end(),
inputIndex) != m_requiredConstantCpuInputs.end();

// This shouldn't happen since kernel creation is deferred and repeated when required constant inputs are not present.
ORT_THROW_HR_IF(E_UNEXPECTED, inputRequiredAsConstant);
}
@@ -1562,7 +1562,13 @@ namespace Windows::AI::MachineLearning::Adapter
OnnxTensorWrapper::OnnxTensorWrapper(onnx::TensorProto* impl, const onnxruntime::Path& modelPath) : m_impl(impl)
{
// The tensor may be stored as raw data or in typed fields.
if (impl->has_raw_data())
if (impl->data_location() == onnx::TensorProto_DataLocation_EXTERNAL)
{
THROW_IF_NOT_OK(onnxruntime::utils::UnpackInitializerData(*impl, modelPath, m_unpackedExternalTensor));
m_dataPtr = reinterpret_cast<std::byte*>(m_unpackedExternalTensor.data());
m_tensorByteSize = m_unpackedExternalTensor.size();
}
else if (impl->has_raw_data())
{
m_dataPtr = reinterpret_cast<std::byte*>(impl->mutable_raw_data()->data());
m_tensorByteSize = impl->raw_data().size();
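The wrapper now recognizes initializers whose bytes live outside the model file (data_location == EXTERNAL) and unpacks them via UnpackInitializerData before falling back to raw_data. For reference, the same distinction in the onnx Python API, as a sketch that only covers the raw_data path and assumes the onnx package:

```python
import onnx
from onnx.external_data_helper import uses_external_data, load_external_data_for_tensor

def tensor_bytes(tensor: onnx.TensorProto, model_dir: str) -> bytes:
    # External initializers reference a side file; the model directory is needed to
    # resolve them, after which the bytes are available in raw_data.
    if uses_external_data(tensor):
        load_external_data_for_tensor(tensor, model_dir)
    return tensor.raw_data
```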
@@ -309,6 +309,7 @@ class OnnxTensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
private:
size_t m_tensorByteSize = 0;
std::unique_ptr<std::byte[]> m_unpackedTensor;
std::vector<uint8_t> m_unpackedExternalTensor;
std::byte* m_dataPtr = nullptr;

// Lifetime is managed by the caller and guaranteed to outlive this class
@@ -55,6 +55,19 @@ Status SplitOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

// Converts an ONNX list of split lengths to a QNN list of split indices.
// Note that the first split index at 0 is implicit (QNN SDK >= 2.19 will raise a validation error if included).
static void ConvertSplitLengthsToSplitIndices(gsl::span<const int64_t> split_lengths,
std::vector<uint32_t>& split_indices) {
uint32_t split_it = 0;
for (size_t i = 0; i < split_lengths.size(); ++i) {
if (i > 0) { // Do not include the 0th split index.
split_indices.push_back(split_it);
}
split_it += SafeInt<uint32_t>(split_lengths[i]);
}
}

Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
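The helper added above is easy to check by hand: QNN wants the start index of each chunk, with the leading 0 left implicit. A standalone Python restatement, using the two cases exercised by the updated tests later in this diff:

```python
def split_lengths_to_indices(split_lengths):
    indices, offset = [], 0
    for i, length in enumerate(split_lengths):
        if i > 0:              # the 0th index stays implicit (QNN SDK >= 2.19 rejects it)
            indices.append(offset)
        offset += length
    return indices

print(split_lengths_to_indices([2, 2, 2]))  # [2, 4]
print(split_lengths_to_indices([3, 3, 1]))  # [3, 6]
```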
@@ -79,22 +92,15 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
const int64_t* tensor_data = reinterpret_cast<const int64_t*>(unpacked_tensor.data());
size_t tensor_byte_size = unpacked_tensor.size();
size_t size = tensor_byte_size / sizeof(int64_t);
split_index.push_back(0); // QNN need the start index of each range and starts from 0
std::transform(tensor_data, tensor_data + size, std::back_inserter(split_index),
[](int64_t item) { return SafeInt<uint32_t>(item); });
split_index.pop_back();
ConvertSplitLengthsToSplitIndices({tensor_data, size}, split_index);
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split");
}
} else {
NodeAttrHelper node_helper(node_unit);
if (node_helper.HasAttr("split")) {
auto split = node_helper.Get("split", std::vector<int32_t>{0});
uint32_t split_it = 0;
for (size_t i = 0; i < split.size(); ++i) {
split_index.push_back(split_it);
split_it += split[i];
}
auto split_lengths = node_helper.Get("split", std::vector<int64_t>{0});
ConvertSplitLengthsToSplitIndices(split_lengths, split_index);
}
}

@@ -105,11 +111,19 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
"Cannot get shape");
ORT_ENFORCE(static_cast<int32_t>(input_shape.size()) > axis_value, "axis not valid!");
ORT_RETURN_IF_NOT(input_shape.at(axis_value) > 0, "Shape value not valid!");
auto num_outputs = node_unit.Outputs().size();
auto step = SafeInt<uint32_t>(input_shape.at(axis_value) / num_outputs);

// ONNX spec states that if not evenly divisible by `num_outputs`, the last chunk is smaller.
// Therefore, we have to use ceil() when computing shape[axis] / num_outputs.
// See: core/providers/cpu/tensor/split.cc::PrepareForCompute()
const float num_outputs = static_cast<float>(node_unit.Outputs().size());
const float split_dim_size = static_cast<float>(input_shape[axis_value]);
const uint32_t step = SafeInt<uint32_t>(std::ceil(split_dim_size / num_outputs));
uint32_t split_it = 0;

for (size_t i = 0; i < num_outputs; ++i) {
split_index.push_back(split_it);
if (i > 0) { // 0th split index is implicit (QNN >= 2.19 raises validation error if included)
split_index.push_back(split_it);
}
split_it += step;
}
}
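To see why ceil() is the right rounding, take the uneven case from the new test further down: shape[axis] = 7 split into num_outputs = 3 must produce chunks of 3, 3 and 1 (the ONNX spec gives the remainder to the last chunk), so the step is ceil(7/3) = 3 and the split indices are [3, 6]. A quick check in plain Python:

```python
import math

def split_indices_from_num_outputs(dim_size, num_outputs):
    step = math.ceil(dim_size / num_outputs)
    return [i * step for i in range(1, num_outputs)]  # leading 0 stays implicit

print(split_indices_from_num_outputs(7, 3))  # [3, 6] -> chunks of 3, 3, 1
print(split_indices_from_num_outputs(6, 3))  # [2, 4] -> even split unchanged
```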
10 changes: 10 additions & 0 deletions onnxruntime/core/util/thread_utils.cc
@@ -7,6 +7,7 @@

#ifdef _WIN32
#include <Windows.h>
#include <versionhelpers.h>
#endif
#include <thread>
#include "core/session/ort_apis.h"
@@ -98,7 +99,16 @@ CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) {
}
options.thread_pool_size = static_cast<int>(default_affinities.size());
if (options.auto_set_affinity) {
#ifdef _WIN32
// Only set thread affinity on Server with auto affinity.
// On client best to let OS scheduler handle.
// On big (P-Core) / little (E-Core) CPU designs affinity overrides QoS and has high power usage
if (IsWindowsServer()) {
to.affinities = std::move(default_affinities);
}
#else
to.affinities = std::move(default_affinities);
#endif
}
}
if (options.thread_pool_size <= 1) {
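A rough Python analogue of the new gate, for reproducing the Server-vs-client distinction outside the C++ code (assumption: platform.win32_edition() is only a stand-in for the Win32 IsWindowsServer() call used above):

```python
import platform

def should_auto_set_affinity() -> bool:
    # Non-Windows keeps the previous behaviour; on Windows, pin threads only on Server
    # SKUs and let the OS scheduler decide on client machines (notably P-core/E-core hybrids).
    if platform.system() != "Windows":
        return True
    return "Server" in (platform.win32_edition() or "")

print(should_auto_set_affinity())
```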
2 changes: 2 additions & 0 deletions onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -270,6 +270,8 @@ def quantize_model(self):

self.model.model.producer_name = __producer__
self.model.model.producer_version = __version__
if self.qdq_op_domain == ms_domain:
self.model.set_opset_import(ms_domain, 1)

return self.model.model
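The two added lines ensure that a model containing com.microsoft Q/DQ ops also declares that domain in its opset imports; without the import, downstream tooling and ONNX Runtime may fail to resolve those ops. A standalone sketch of the same idea with the public onnx helpers (hypothetical function, not part of the quantizer):

```python
import onnx

MS_DOMAIN = "com.microsoft"

def ensure_msft_opset(model: onnx.ModelProto, version: int = 1) -> None:
    # Append an opset import for com.microsoft if the model does not declare one yet.
    if not any(op.domain == MS_DOMAIN for op in model.opset_import):
        model.opset_import.append(onnx.helper.make_opsetid(MS_DOMAIN, version))
```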

27 changes: 27 additions & 0 deletions onnxruntime/python/tools/symbolic_shape_infer.py
@@ -197,6 +197,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
"BiasGelu": self._infer_BiasGelu,
"BiasSplitGelu": self._infer_BiasSplitGelu,
"DecoderMaskedMultiHeadAttention": self._infer_DecoderMaskedMultiHeadAttention,
"DequantizeLinear": self._infer_DequantizeLinear,
"EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
"FastGelu": self._infer_FastGelu,
"GatedRelativePositionBias": self._infer_GatedRelativePositionBias,
@@ -212,6 +213,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
"PackedAttention": self._infer_PackedAttention,
"PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention,
"PythonOp": self._infer_PythonOp,
"QuantizeLinear": self._infer_QuantizeLinear,
"QuickGelu": self._infer_FastGelu,
"RelativePositionBias": self._infer_RelativePositionBias,
"RemovePadding": self._infer_RemovePadding,
@@ -457,6 +459,8 @@ def _onnx_infer_single_node(self, node):
"GemmFastGelu",
"LayerNormalization",
"LongformerAttention",
"DequantizeLinear",
"QuantizeLinear",
"RelativePositionBias",
"RemovePadding",
"RestorePadding",
@@ -979,6 +983,29 @@ def _infer_NhwcConv(self, node): # noqa: N802
)
)

def _infer_DequantizeLinear(self, node): # noqa: N802
# Get the output data type from the scale input (index 1, required).
output_dtype = self.known_vi_[node.input[1]].type.tensor_type.elem_type

# Get the output shape from the first input.
output_shape = self._get_shape(node, 0)

vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))

def _infer_QuantizeLinear(self, node): # noqa: N802
# Get the output data type from the zero-point input (index 2, optional).
# Otherwise, default to uint8
output_dtype = onnx.TensorProto.UINT8
if len(node.input) > 2 and node.input[2]:
output_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type

# Get the output shape from the first input.
output_shape = self._get_shape(node, 0)

vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))

def _infer_Einsum(self, node): # noqa: N802
# ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275
equation = get_attribute(node, "equation")
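In short, both new handlers take the output shape from input 0 and only differ in where the element type comes from: DequantizeLinear uses the scale input's type, while QuantizeLinear uses the zero-point's type when one is given and defaults to uint8 otherwise. The same rule restated as standalone Python:

```python
import onnx

def qdq_output_elem_type(op_type, input_elem_types):
    if op_type == "DequantizeLinear":
        return input_elem_types[1]       # scale dtype (e.g. FLOAT or FLOAT16)
    if op_type == "QuantizeLinear":
        if len(input_elem_types) > 2:
            return input_elem_types[2]   # zero-point dtype
        return onnx.TensorProto.UINT8    # default when no zero-point is provided
    raise ValueError(f"unexpected op type: {op_type}")

print(qdq_output_elem_type("QuantizeLinear", [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT]))
# 2 (TensorProto.UINT8)
```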
41 changes: 34 additions & 7 deletions onnxruntime/test/providers/qnn/split_op_test.cc
@@ -302,19 +302,46 @@ TEST_F(QnnHTPBackendTests, Split_Int32_Opset13) {
// Test 8-bit QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
// and 'split' input.
TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18) {
// Split 6 into 3 outputs of lengths [2, 2, 2]
TestInputDef<float> input_def({6, 2}, false,
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f, 9.0f, 10.0f, 11.0f});

// Use 'split' input (initializer).
RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
{2, 2}, // split
0, // axis
-1, // num_outputs
18, // opset
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{2, 2, 2}, // split
0, // axis
-1, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);

// Use 'num_outputs' attribute.
RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{}, // split (use num_outputs instead)
0, // axis
3, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);
}

// Test 8-bit QDQ Split opset 18 on HTP backend. Use an uneven split (last chunk should be smaller).
TEST_F(QnnHTPBackendTests, Split_NonEqual_Axis0_Opset18) {
// Split 7 into 3 outputs of lengths [3, 3, 1]
TestInputDef<float> input_def({7, 2}, false,
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f});

// Use a `split` input with uneven split lengths.
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{3, 3, 1}, // split
0, // axis
-1, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);

// Use a `num_outputs` attribute that does not evenly divide into shape[axis].
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{}, // split (use num_outputs instead)
0, // axis
2, // num_outputs
3, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);
}
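The expected shapes in the uneven case can be verified with plain NumPy slicing; note that ONNX's "last chunk is smaller" convention differs from numpy.array_split, which would spread the remainder as 3, 2, 2:

```python
import math
import numpy as np

x = np.arange(14, dtype=np.float32).reshape(7, 2)   # same 7x2 input as the test above
num_outputs, axis = 3, 0

step = math.ceil(x.shape[axis] / num_outputs)
chunks = [x[i * step:(i + 1) * step] for i in range(num_outputs)]
print([c.shape for c in chunks])  # [(3, 2), (3, 2), (1, 2)]
```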