Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin' into prathikrao/gather-elements…
Browse files Browse the repository at this point in the history
…-webgpu-ep
  • Loading branch information
prathikr committed Dec 18, 2024
2 parents 9ddda16 + 5d7030e commit 3831e22
Show file tree
Hide file tree
Showing 43 changed files with 94 additions and 456 deletions.
1 change: 0 additions & 1 deletion java/src/test/java/ai/onnxruntime/InferenceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,6 @@ public void testCoreML() throws OrtException {
runProvider(OrtProvider.CORE_ML);
}

@Disabled("DirectML Java API hasn't been supported yet")
@Test
@EnabledIfSystemProperty(named = "USE_DML", matches = "1")
public void testDirectML() throws OrtException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,13 @@
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;

public class ProviderOptionsTest {
private static final OrtEnvironment env = TestHelpers.getOrtEnvironment();

@Test
@EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testCUDAOptions() throws OrtException {
// Test standard options
OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
Expand Down Expand Up @@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException {

@Test
@EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testTensorRT() throws OrtException {
// Test standard options
OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
Expand Down
9 changes: 0 additions & 9 deletions onnxruntime/test/common/cuda_op_test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@

#include "test/util/include/default_providers.h"

// Skips the current GoogleTest test (via GTEST_SKIP) when
// DefaultCudaExecutionProvider() compares equal to nullptr.
// NOTE(review): the skip message attributes the null provider to DML being
// enabled; that relationship is not visible in this header - confirm against
// the DefaultCudaExecutionProvider() implementation.
// The macro expands to a bare `if` statement, so use it as a standalone
// statement (`SKIP_CUDA_TEST_WITH_DML;`) and do not follow it with `else`.
#define SKIP_CUDA_TEST_WITH_DML \
if (DefaultCudaExecutionProvider() == nullptr) { \
GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \
}

namespace onnxruntime {
namespace test {

Expand All @@ -18,10 +13,6 @@ namespace test {
int GetCudaArchitecture();

inline bool HasCudaEnvironment(int min_cuda_architecture) {
if (DefaultCudaExecutionProvider() == nullptr) {
return false;
}

if (DefaultCudaExecutionProvider().get() == nullptr) {
return false;
}
Expand Down
6 changes: 0 additions & 6 deletions onnxruntime/test/contrib_ops/beam_search_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
const char* const output_names[] = {"sequences"};

Ort::SessionOptions session_options;
#if defined(USE_CUDA) && defined(USE_DML)
SKIP_CUDA_TEST_WITH_DML;
#endif
#ifdef USE_CUDA
OrtCUDAProviderOptionsV2 cuda_options;
cuda_options.use_tf32 = false;
Expand Down Expand Up @@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
if (enable_cuda || enable_rocm) {
Ort::SessionOptions session_options;
#if defined(USE_CUDA) && defined(USE_DML)
SKIP_CUDA_TEST_WITH_DML;
#endif
#ifdef USE_CUDA
OrtCUDAProviderOptionsV2 cuda_options;
cuda_options.use_tf32 = false;
Expand Down
3 changes: 0 additions & 3 deletions onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
t.SetCustomOutputVerifier(output_verifier);
std::vector<std::unique_ptr<IExecutionProvider>> t_eps;
#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() == nullptr) {
return;
}
t_eps.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
t_eps.emplace_back(DefaultRocmExecutionProvider());
Expand Down
7 changes: 1 addition & 6 deletions onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ void RunTestForInference(const std::vector<int64_t>& input_dims, bool has_ratio

std::vector<std::unique_ptr<IExecutionProvider>> test_eps;
#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() != nullptr) {
test_eps.emplace_back(DefaultCudaExecutionProvider());
}
test_eps.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
test_eps.emplace_back(DefaultRocmExecutionProvider());
#endif
Expand Down Expand Up @@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector<int64_t>& input_dims) {

std::vector<std::unique_ptr<IExecutionProvider>> dropout_eps;
#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() == nullptr) {
return;
}
dropout_eps.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
dropout_eps.emplace_back(DefaultRocmExecutionProvider());
Expand Down
13 changes: 3 additions & 10 deletions onnxruntime/test/contrib_ops/layer_norm_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "test/providers/compare_provider_test_utils.h"
#include "test/util/include/default_providers.h"

namespace onnxruntime {
namespace test {
Expand Down Expand Up @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector<int64_t>& x_dims,
#endif

#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() != nullptr) {
test.CompareWithCPU(kCudaExecutionProvider);
}
test.CompareWithCPU(kCudaExecutionProvider);
#elif USE_ROCM
test.CompareWithCPU(kRocmExecutionProvider);
#elif USE_DML
test.CompareWithCPU(kDmlExecutionProvider);
#elif USE_WEBGPU
test.CompareWithCPU(kWebGpuExecutionProvider);
#endif

#ifdef USE_DML
if (DefaultDmlExecutionProvider() != nullptr) {
test.CompareWithCPU(kDmlExecutionProvider);
}
#endif
}

TEST(CudaKernelTest, LayerNorm_NullInput) {
Expand Down
28 changes: 8 additions & 20 deletions onnxruntime/test/contrib_ops/matmul_4bits_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
if (use_float16) {
#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() != nullptr) {
execution_providers.push_back(DefaultCudaExecutionProvider());
}
execution_providers.push_back(DefaultCudaExecutionProvider());
#endif
#ifdef USE_ROCM
execution_providers.push_back(DefaultRocmExecutionProvider());
#endif
#ifdef USE_DML
if (DefaultDmlExecutionProvider() != nullptr) {
execution_providers.push_back(DefaultDmlExecutionProvider());
}
execution_providers.push_back(DefaultDmlExecutionProvider());
#endif
#ifdef USE_WEBGPU
execution_providers.push_back(DefaultWebGpuExecutionProvider());
Expand All @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
} // namespace

TEST(MatMulNBits, Float16Cuda) {
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
std::vector<bool> has_gidx_options = {true, false};
if (DefaultDmlExecutionProvider() != nullptr) {
has_gidx_options.assign(1, false);
}
#if defined(USE_CUDA) || defined(USE_ROCM)
auto has_gidx_options = {true, false};
#else
auto has_gidx_options = {false};
#endif
Expand All @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) {
for (auto block_size : {16, 32, 64, 128}) {
for (auto has_gidx : has_gidx_options) {
#ifdef USE_DML
if (DefaultDmlExecutionProvider() != nullptr) {
RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
}
RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
#else
RunTest(M, N, K, block_size, 0, false, true, has_gidx);
RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);
Expand All @@ -548,16 +539,12 @@ TEST(MatMulNBits, Float16Cuda) {
}

TEST(MatMulNBits, Float16Large) {
#if defined(USE_CUDA) || defined(USE_DML)
#ifdef USE_DML
// For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail
// machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an
// absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number
// of elements in this test, ULPs should probably be used instead of absolute/relative tolerances.
float abs_error = 0.05f;
if (DefaultDmlExecutionProvider() != nullptr) {
// it means the ep is dml in runtime, the abs_error is changed to 0.3f
abs_error = 0.3f;
}
float abs_error = 0.3f;
#elif USE_WEBGPU
// See Intel A770 to pass these tests with an absolute error of 0.08.
float abs_error = 0.08f;
Expand All @@ -573,6 +560,7 @@ TEST(MatMulNBits, Float16Large) {
}
}
}

#endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
} // namespace test
} // namespace onnxruntime
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
}

// DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matrix for Float32 output
#if defined(USE_DML) && !defined(USE_CUDA)
#if defined(USE_DML)

TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, false>();
Expand Down
20 changes: 1 addition & 19 deletions onnxruntime/test/contrib_ops/tensor_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz
test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
test.AddInput<float>("input", {N, C, H, W}, X);
test.AddOutput<float>("output", {N, C, H, W}, result);
#if defined(USE_CUDA) && defined(USE_DML)
if (DefaultCudaExecutionProvider() == nullptr) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
} else if (DefaultDmlExecutionProvider() == nullptr) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
}
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
#endif
}

void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) {
Expand Down Expand Up @@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va
test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
test.AddInput<float>("input", {N, C, H, W}, X);
test.AddOutput<float>("output", {N, C, H, W}, result);
#if defined(USE_CUDA) && defined(USE_DML)
if (DefaultCudaExecutionProvider() == nullptr) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
} else if (DefaultDmlExecutionProvider() == nullptr) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
}
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
#endif
}

TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) {
Expand Down Expand Up @@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) {

std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
#ifdef USE_CUDA
if (DefaultCudaExecutionProvider() != nullptr) {
execution_providers.push_back(DefaultCudaExecutionProvider());
}
execution_providers.push_back(DefaultCudaExecutionProvider());
#endif
execution_providers.push_back(DefaultCpuExecutionProvider());
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
Expand Down
Loading

0 comments on commit 3831e22

Please sign in to comment.