diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 8138829b057f2..9fa1e155f0d7a 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -485,13 +485,17 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - execution_providers.push_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() != nullptr) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - execution_providers.push_back(DefaultDmlExecutionProvider()); + if (DefaultDmlExecutionProvider() != nullptr) { + execution_providers.push_back(DefaultDmlExecutionProvider()); + } #endif RunTest(opts, std::move(execution_providers)); @@ -506,8 +510,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) auto has_gidx_options = {true, false}; + if (DefaultDmlExecutionProvider() != nullptr) { + has_gidx_options = {false}; + } #else auto has_gidx_options = {false}; #endif @@ -518,7 +525,9 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); + if (DefaultDmlExecutionProvider() != nullptr) { + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); + } #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); @@ -531,12 +540,15 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#ifdef USE_DML +#if defined(USE_CUDA) || defined(USE_DML) // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. float abs_error = 0.3f; + if (DefaultDmlExecutionProvider() != nullptr) { + abs_error = 0.05f; + } #else float abs_error = 0.05f; #endif @@ -549,7 +561,6 @@ TEST(MatMulNBits, Float16Large) { } } } - #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 8d7629b5fda1c..d88c3131a4ca5 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { } // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output -#if defined(USE_DML) +#if defined(USE_DML) && !defined(USE_CUDA) TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { RunMatMulIntegerToFloatTest(); diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index fde603858f9a9..4155cb7abc279 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,6 +201,14 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { + if (DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "Skip This Test Due to this EP is null"; + } +#ifdef USE_DML + if (DefaultDmlExecutionProvider() == nullptr) { + GTEST_SKIP() << "It should not run with DML EP"; + } +#endif auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0]; @@ -234,6 +242,14 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { #ifdef USE_DML TEST(LoraAdapterTest, VerifyDmlDeviceCopy) { + if (DefaultDmlExecutionProvider() == nullptr) { + GTEST_SKIP() << "Skip This Test Due to this EP is null"; + } +#ifdef USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "It should not run with CUDA EP"; + } +#endif auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index e3c86a137484f..b46c253fb8ed9 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -491,6 +491,18 @@ ::std::vector<::std::basic_string> GetParameterStrings() { // the number of times these are run to reduce the CI time. provider_names.erase(provider_name_cpu); #endif + +#if defined(USE_CUDA) && defined(USE_DML) + const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); + if (no_cuda_ep_test == "1") { + provider_names.erase(provider_name_cuda); + } + const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); + if (no_dml_ep_test == "1") { + provider_names.erase(provider_name_dml); + } +#endif + std::vector> v; // Permanently exclude following tests because ORT support only opset starting from 7, // Please make no more changes to the list diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 2bc7be6b0115c..9624f9112c49f 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2072,7 +2072,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): executables.append("onnxruntime_global_thread_pools_test") executables.append("onnxruntime_customopregistration_test") for exe in executables: - test_output = f"--gtest_filter=*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml" + test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml" run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path) else: ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout]