From b2b39c9db1089626d292bdfe599c7264c09bfe4c Mon Sep 17 00:00:00 2001 From: mindest Date: Mon, 5 Aug 2024 10:01:58 +0000 Subject: [PATCH 01/36] Test new CI on MIGraphX CI. --- .../linux-migraphx-ci-pipeline.yml | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 6bf6324252fb9..b2287432d1053 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -101,13 +101,12 @@ jobs: ccache -s; \ python tools/ci_build/build.py \ --config Release \ - --enable_training \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ - onnxruntime_USE_COMPOSABLE_KERNEL=OFF \ + onnxruntime_USE_COMPOSABLE_KERNEL=ON \ --mpi_home /opt/ompi \ - --use_migraphx \ + --use_rocm \ --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ --nccl_home /opt/rocm \ @@ -165,7 +164,7 @@ jobs: Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)" - Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) + Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) - task: CmdLine@2 inputs: @@ -185,7 +184,26 @@ jobs: /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ - bash /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh" + python /onnxruntime_src/tools/ci_build/build.py \ + --config Release \ + --cmake_extra_defines \ + CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ + onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ + onnxruntime_USE_COMPOSABLE_KERNEL=ON \ + --mpi_home /opt/ompi \ + --use_rocm \ + --rocm_version=$(RocmVersion) \ + --rocm_home /opt/rocm \ + --nccl_home /opt/rocm \ + --enable_nccl \ + --update \ + --build_dir /build \ + --build \ + --parallel \ + --build_wheel \ + --skip_submodule_sync \ + --tests --enable_onnx_tests --enable_transformers_tool_test \ + --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run onnxruntime unit tests' From b9efcf0547e4f1df95cf290d95dcf63a0847d108 Mon Sep 17 00:00:00 2001 From: mindest Date: Mon, 5 Aug 2024 10:28:33 +0000 Subject: [PATCH 02/36] test: turn kernel explorer on --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index b2287432d1053..e4a683393659e 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -103,8 +103,7 @@ jobs: --config Release \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ - onnxruntime_USE_COMPOSABLE_KERNEL=ON \ + onnxruntime_BUILD_KERNEL_EXPLORER=ON \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ From 26607a81ed1e533dbca32efd1eef597ae95bc45b Mon Sep 17 00:00:00 2001 From: mindest Date: Mon, 5 Aug 2024 11:29:23 +0000 Subject: [PATCH 03/36] test: restore original settings --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index e4a683393659e..111ed7bedc6b9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -103,7 +103,8 @@ jobs: --config Release \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_KERNEL_EXPLORER=ON \ + onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ + onnxruntime_USE_COMPOSABLE_KERNEL=OFF \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ From 2ecbf492ba0bb5899e625ed10b7016c404880c20 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 03:33:48 +0000 Subject: [PATCH 04/36] Include nccl_service only in training --- onnxruntime/core/providers/rocm/rocm_provider_factory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc index a739fe0a5d193..d00aa92599762 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc @@ -13,7 +13,7 @@ #include "core/providers/rocm/gpu_data_transfer.h" #include "core/providers/rocm/math/unary_elementwise_ops_impl.h" -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) #include "orttraining/training_ops/rocm/communication/nccl_service.h" #endif From 22c3dd1ebf7446c02e639400009f4e9e09bf5ba5 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 03:50:19 +0000 Subject: [PATCH 05/36] Dockerfile: add git install --- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 98ea5e119c319..7f1500794ca71 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -81,3 +81,5 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi RUN apt update && apt install -y migraphx RUN pip install numpy packaging ml_dtypes==0.3.0 + +RUN apt install -y git From 4bedff8070b812edd64d7d8dcbe4d7878ac8cd19 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 03:56:31 +0000 Subject: [PATCH 06/36] Turn on ck and ke --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 111ed7bedc6b9..53fec98b33c35 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -103,8 +103,7 @@ jobs: --config Release \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ - onnxruntime_USE_COMPOSABLE_KERNEL=OFF \ + onnxruntime_BUILD_KERNEL_EXPLORER=ON \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ @@ -188,8 +187,7 @@ jobs: --config Release \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_KERNEL_EXPLORER=OFF \ - onnxruntime_USE_COMPOSABLE_KERNEL=ON \ + onnxruntime_BUILD_KERNEL_EXPLORER=ON \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ From f3fe61f80ce3d7984098041718ce0bea0b349f90 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 07:46:37 +0000 Subject: [PATCH 07/36] Correct flag --test. --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 53fec98b33c35..4bd7eb73ffb4f 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -200,7 +200,7 @@ jobs: --parallel \ --build_wheel \ --skip_submodule_sync \ - --tests --enable_onnx_tests --enable_transformers_tool_test \ + --test --enable_onnx_tests --enable_transformers_tool_test \ --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run onnxruntime unit tests' From 7c03f616444f85a266291b237b29e6fc5bf71d5b Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 08:32:37 +0000 Subject: [PATCH 08/36] Specify arch gfx90a only; enable_training macro --- onnxruntime/core/providers/rocm/rocm_provider_factory.cc | 4 ++-- onnxruntime/core/providers/rocm/rocm_provider_factory.h | 2 +- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc index d00aa92599762..fdf64d07e0a6c 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc @@ -21,7 +21,7 @@ using namespace onnxruntime; namespace onnxruntime { -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) namespace rocm { rocm::INcclService& GetINcclService(); } @@ -155,7 +155,7 @@ struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM { info = ROCMExecutionProviderInfo::FromProviderOptions(options); } -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) rocm::INcclService& GetINcclService() override { return rocm::GetINcclService(); } diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.h b/onnxruntime/core/providers/rocm/rocm_provider_factory.h index 80b887af4eb75..3238d66cee479 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.h +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.h @@ -39,7 +39,7 @@ struct ProviderInfo_ROCM { virtual int hipGetDeviceCount() = 0; virtual void ROCMExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::ROCMExecutionProviderInfo& info) = 0; -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) virtual onnxruntime::rocm::INcclService& GetINcclService() = 0; #endif diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 4bd7eb73ffb4f..f8cc9c911f275 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -104,6 +104,7 @@ jobs: --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ onnxruntime_BUILD_KERNEL_EXPLORER=ON \ + CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ @@ -188,6 +189,7 @@ jobs: --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ onnxruntime_BUILD_KERNEL_EXPLORER=ON \ + CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ --use_rocm \ --rocm_version=$(RocmVersion) \ From 16105adaef952f904e74ee4f125a7bb9155991d0 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 12:53:49 +0000 Subject: [PATCH 09/36] Disable test_kernels; cd to /tmp --- cmake/onnxruntime_kernel_explorer.cmake | 2 +- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_kernel_explorer.cmake b/cmake/onnxruntime_kernel_explorer.cmake index 4d3db9c949daf..7de4f7b3f926b 100644 --- a/cmake/onnxruntime_kernel_explorer.cmake +++ b/cmake/onnxruntime_kernel_explorer.cmake @@ -89,4 +89,4 @@ add_dependencies(kernel_explorer onnxruntime_pybind11_state) enable_testing() find_package(Python COMPONENTS Interpreter REQUIRED) -add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..) +# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f8cc9c911f275..4723bcc49e763 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -184,6 +184,7 @@ jobs: /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ + cd /tmp; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ From eca3ba7697f21ea5ee8a32793201fe2abb0aab64 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 14:44:22 +0000 Subject: [PATCH 10/36] test: some debug outputs; cache_dir --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 4723bcc49e763..efdaf2e50ff46 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -185,6 +185,8 @@ jobs: set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ cd /tmp; \ + echo $HOME; \ + export CACHE_DIR=/tmp; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ From 9152170fe2c179af3da3ad189136cdc1ae89b457 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 6 Aug 2024 16:40:50 +0000 Subject: [PATCH 11/36] test: revert --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index efdaf2e50ff46..f8cc9c911f275 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -184,9 +184,6 @@ jobs: /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ - cd /tmp; \ - echo $HOME; \ - export CACHE_DIR=/tmp; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ From 5f5cccfee98aedb13e11e0251fce355065ba3899 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 05:06:23 +0000 Subject: [PATCH 12/36] Correct image name --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f8cc9c911f275..47458868546fd 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -94,7 +94,7 @@ jobs: --volume $(CCACHE_DIR):/cache \ -e CCACHE_DIR=/cache \ --workdir /onnxruntime_src \ - onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ env; \ @@ -180,7 +180,7 @@ jobs: --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --workdir /build/Release \ - onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ From dd54945b1991cf5b3714d7ab48967d30df4d0c0f Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 05:48:32 +0000 Subject: [PATCH 13/36] Correct image name 2 --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 47458868546fd..8c5067f7fdedf 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -66,7 +66,7 @@ jobs: Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)" - Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) + Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) - task: Cache@2 inputs: From 10febab4339e9cd5fde47b2757979967defb47c3 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 07:14:57 +0000 Subject: [PATCH 14/36] test: cache_dir --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 8c5067f7fdedf..3bfdf4b0b9f67 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -184,6 +184,7 @@ jobs: /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ + export CACHE_DIR=/tmp; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ From 62101f752ea1a6b67dd2a5fa4762c283f09da212 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 07:27:56 +0000 Subject: [PATCH 15/36] Remove --update in test part. --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 3bfdf4b0b9f67..dab0d340b31eb 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -184,7 +184,6 @@ jobs: /bin/bash -c " set -ex; \ cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ - export CACHE_DIR=/tmp; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ @@ -197,7 +196,6 @@ jobs: --rocm_home /opt/rocm \ --nccl_home /opt/rocm \ --enable_nccl \ - --update \ --build_dir /build \ --build \ --parallel \ From 1d0b082a2cc0db5e8a0b9a04593e85b083a98059 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 07:31:50 +0000 Subject: [PATCH 16/36] Remove --build in test part. --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index dab0d340b31eb..f22fc91ff5202 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -197,7 +197,6 @@ jobs: --nccl_home /opt/rocm \ --enable_nccl \ --build_dir /build \ - --build \ --parallel \ --build_wheel \ --skip_submodule_sync \ From d54bb42de797c793000ae331ffebd20d7030ea59 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 15:21:52 +0000 Subject: [PATCH 17/36] Add --build_shared_lib --enable_onnx_tests --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f22fc91ff5202..c2c1dd9fd4257 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -114,8 +114,10 @@ jobs: --update \ --build_dir /build \ --build \ + --build_shared_lib \ --parallel \ --build_wheel \ + --enable_onnx_tests \ --skip_submodule_sync \ --use_cache \ --skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \ @@ -197,6 +199,7 @@ jobs: --nccl_home /opt/rocm \ --enable_nccl \ --build_dir /build \ + --build_shared_lib \ --parallel \ --build_wheel \ --skip_submodule_sync \ From f451a9f7e36d9c2183d8179180ba2f319b8a2785 Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 7 Aug 2024 15:33:33 +0000 Subject: [PATCH 18/36] Fix scatter op test for ROCm EP. --- onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc index 2a7a7158b5f62..d5da9a7631b42 100644 --- a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) { test.AddOutput("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f}); test.Run(OpTester::ExpectResult::kExpectFailure, "indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]", - {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(Scatter, InvalidIndex) { From 1b9f19af3b9a61de2f8d8f598f681b1c846b9d1f Mon Sep 17 00:00:00 2001 From: mindest Date: Thu, 8 Aug 2024 05:40:55 +0000 Subject: [PATCH 19/36] test: mount /data/build dir --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index c2c1dd9fd4257..e9d955dab74b9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -181,6 +181,7 @@ jobs: --user $UID:$(id -g $USER) \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ --workdir /build/Release \ onnxruntimerocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " From 68eec095954c764ca88e45f4d2c4f67f84425af9 Mon Sep 17 00:00:00 2001 From: mindest Date: Thu, 8 Aug 2024 19:34:07 +0000 Subject: [PATCH 20/36] test: extend test time --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index e9d955dab74b9..a72d946ddff85 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -147,7 +147,7 @@ jobs: pool: AMD-GPU dependsOn: - Linux_Build - timeoutInMinutes: 120 + timeoutInMinutes: 240 steps: - task: DownloadPipelineArtifact@2 From 2f4f7db426f76949a0b93a9a5ac226c7a4e50b3c Mon Sep 17 00:00:00 2001 From: mindest Date: Fri, 9 Aug 2024 01:52:00 +0000 Subject: [PATCH 21/36] test: overwrite test timeout to 4h --- cmake/onnxruntime_unittests.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index d5c3af748e528..0d0a3eac0977c 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -222,14 +222,14 @@ function(AddTest) ) endif() # Set test timeout to 3 hours. - set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 14400) else() add_test(NAME ${_UT_TARGET} COMMAND ${_UT_TARGET} ${TEST_ARGS} WORKING_DIRECTORY $ ) # Set test timeout to 3 hours. - set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 14400) endif() endif() endfunction(AddTest) From 705a45666f98380a7208db3fa036e21623cc47d7 Mon Sep 17 00:00:00 2001 From: mindest Date: Mon, 12 Aug 2024 08:47:43 +0000 Subject: [PATCH 22/36] Add disabled tests list; add ke test --- onnxruntime/test/providers/cpu/model_tests.cc | 27 +++++++++++++++++-- .../linux-migraphx-ci-pipeline.yml | 25 +++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cb9887314eb66..da58c5183113a 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) { // when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure if (model_path.find(ORT_TSTR("_MNIST")) > 0) { - if (provider_name == "cuda" || provider_name == "openvino") { + if (provider_name == "cuda" || provider_name == "openvino" || provider_name =="rocm") { per_sample_tolerance = 2.5e-2; relative_per_sample_tolerance = 1e-2; } @@ -537,6 +537,21 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), + ORT_TSTR("bvlc_reference_caffenet"), + ORT_TSTR("bvlc_reference_rcnn_ilsvrc13"), + ORT_TSTR("coreml_Resnet50_ImageNet"), + ORT_TSTR("mlperf_resnet"), + ORT_TSTR("mobilenetv2-1.0"), + ORT_TSTR("shufflenet"), + // models from model zoo + ORT_TSTR("AlexNet"), + ORT_TSTR("CaffeNet"), + ORT_TSTR("MobileNet v2-7"), + ORT_TSTR("R-CNN ILSVRC13"), + ORT_TSTR("ShuffleNet-v1"), + ORT_TSTR("version-RFB-320"), + ORT_TSTR("version-RFB-640")}; static const ORTCHAR_T* openvino_disabled_tests[] = { ORT_TSTR("tf_mobilenet_v1_1.0_224"), ORT_TSTR("bertsquad"), @@ -663,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { std::unordered_set> all_disabled_tests(std::begin(immutable_broken_tests), std::end(immutable_broken_tests)); - if (provider_name == provider_name_cuda) { + bool provider_cuda_or_rocm = provider_name == provider_name_cuda; +#ifdef USE_ROCM + if (provider_name == provider_name_rocm) { + provider_cuda_or_rocm = true; + all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests)); + } +#endif + if (provider_cuda_or_rocm) { all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests)); } else if (provider_name == provider_name_dml) { all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests)); @@ -735,6 +757,7 @@ ::std::vector<::std::basic_string> GetParameterStrings() { continue; } std::basic_string test_case_name = path.parent_path().filename().native(); + // std::cout << "Adding test: " << test_case_name << std::endl; if (test_case_name.compare(0, 5, ORT_TSTR("test_")) == 0) test_case_name = test_case_name.substr(5); if (all_disabled_tests.find(test_case_name) != all_disabled_tests.end()) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index a72d946ddff85..0576021d07b78 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -209,4 +209,29 @@ jobs: workingDirectory: $(Build.SourcesDirectory) displayName: 'Run onnxruntime unit tests' + - task: CmdLine@2 + inputs: + script: |- + docker run --rm \ + --security-opt seccomp=unconfined \ + --shm-size=1024m \ + --device=/dev/kfd \ + --device=/dev/dri/renderD$DRIVER_RENDER \ + --group-add $(video) \ + --group-add $(render) \ + --user onnxruntimedev \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + -e OPENBLAS_NUM_THREADS=1 \ + -e OPENMP_NUM_THREADS=1 \ + -e MKL_NUM_THREADS=1 \ + -e KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig) \ + -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ + -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ + onnxruntimerocm-cibuild-rocm$(RocmVersion)-test \ + pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Run kernel explorer tests' + condition: succeededOrFailed() + - template: templates/clean-agent-build-directory-step.yml From a6d9b5c818ccf6137b62ce9e1f3c7e5e322aa4c4 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 04:13:28 +0000 Subject: [PATCH 23/36] Fix EP name. --- onnxruntime/test/python/onnxruntime_test_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 32eac6f7638c1..4a197001c3d2a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1689,7 +1689,7 @@ def test_register_custom_e_ps_library(self): available_eps = C.get_available_providers() # skip amd gpu build - if "kRocmExecutionProvider" in available_eps: + if "ROCMExecutionProvider" in available_eps: return if sys.platform.startswith("win"): shared_library = "test_execution_provider.dll" From f263bf9543bef68e5b3b2f20a41e9b5aa7e97dda Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 04:18:03 +0000 Subject: [PATCH 24/36] Fix ke test setting --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 0576021d07b78..b1a4f8ca2e4d2 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -225,10 +225,10 @@ jobs: -e OPENBLAS_NUM_THREADS=1 \ -e OPENMP_NUM_THREADS=1 \ -e MKL_NUM_THREADS=1 \ - -e KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig) \ + -e KERNEL_EXPLORER_BUILD_DIR=/build/Release \ -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ - onnxruntimerocm-cibuild-rocm$(RocmVersion)-test \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' From a4ad7b8c92174b1a7b774fe158098581bbc659ae Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 05:28:08 +0000 Subject: [PATCH 25/36] Fix lint, unused variable; ke user. --- onnxruntime/test/providers/cpu/model_tests.cc | 4 +--- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index da58c5183113a..20510f7ab937b 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) { // when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure if (model_path.find(ORT_TSTR("_MNIST")) > 0) { - if (provider_name == "cuda" || provider_name == "openvino" || provider_name =="rocm") { + if (provider_name == "cuda" || provider_name == "openvino" || provider_name == "rocm") { per_sample_tolerance = 2.5e-2; relative_per_sample_tolerance = 1e-2; } @@ -679,12 +679,10 @@ ::std::vector<::std::basic_string> GetParameterStrings() { std::unordered_set> all_disabled_tests(std::begin(immutable_broken_tests), std::end(immutable_broken_tests)); bool provider_cuda_or_rocm = provider_name == provider_name_cuda; -#ifdef USE_ROCM if (provider_name == provider_name_rocm) { provider_cuda_or_rocm = true; all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests)); } -#endif if (provider_cuda_or_rocm) { all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests)); } else if (provider_name == provider_name_dml) { diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index b1a4f8ca2e4d2..353227612ded3 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -219,7 +219,7 @@ jobs: --device=/dev/dri/renderD$DRIVER_RENDER \ --group-add $(video) \ --group-add $(render) \ - --user onnxruntimedev \ + --user $UID:$(id -g $USER) \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ -e OPENBLAS_NUM_THREADS=1 \ From 664122fb83ce053ee3cf09e33caa979410a5e381 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 05:42:47 +0000 Subject: [PATCH 26/36] Remove ifdef for ROCm --- onnxruntime/test/providers/cpu/model_tests.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 20510f7ab937b..e2e926ba34715 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -407,9 +407,7 @@ static constexpr ORT_STRING_VIEW provider_name_migraphx = ORT_TSTR("migraphx"); #endif static constexpr ORT_STRING_VIEW provider_name_openvino = ORT_TSTR("openvino"); static constexpr ORT_STRING_VIEW provider_name_cuda = ORT_TSTR("cuda"); -#ifdef USE_ROCM static constexpr ORT_STRING_VIEW provider_name_rocm = ORT_TSTR("rocm"); -#endif static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl"); // For any non-Android system, NNAPI will only be used for ort model converter #if defined(USE_NNAPI) && defined(__ANDROID__) @@ -449,9 +447,7 @@ ::std::vector<::std::basic_string> GetParameterStrings() { #ifdef USE_CUDA provider_names[provider_name_cuda] = {opset7, opset8, opset9, opset10, opset11, opset12, opset13, opset14, opset15, opset16, opset17, opset18}; #endif -#ifdef USE_ROCM provider_names[provider_name_rocm] = {opset7, opset8, opset9, opset10, opset11, opset12, opset13, opset14, opset15, opset16, opset17, opset18}; -#endif #ifdef USE_DNNL provider_names[provider_name_dnnl] = {opset10}; #endif From 4ff9fe93f1ff22c13958b0a5bf35b92d88305979 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 07:25:41 +0000 Subject: [PATCH 27/36] Add pytest in Dockerfile --- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 7f1500794ca71..16e3deb1dba39 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -83,3 +83,4 @@ RUN apt update && apt install -y migraphx RUN pip install numpy packaging ml_dtypes==0.3.0 RUN apt install -y git +RUN pip install pytest From a9e9be01042bd0981f976a423c9327bda4e3c149 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 09:24:45 +0000 Subject: [PATCH 28/36] Fix error --- onnxruntime/test/providers/cpu/model_tests.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index e2e926ba34715..affd5406f3438 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -447,7 +447,9 @@ ::std::vector<::std::basic_string> GetParameterStrings() { #ifdef USE_CUDA provider_names[provider_name_cuda] = {opset7, opset8, opset9, opset10, opset11, opset12, opset13, opset14, opset15, opset16, opset17, opset18}; #endif +#ifdef USE_ROCM provider_names[provider_name_rocm] = {opset7, opset8, opset9, opset10, opset11, opset12, opset13, opset14, opset15, opset16, opset17, opset18}; +#endif #ifdef USE_DNNL provider_names[provider_name_dnnl] = {opset10}; #endif @@ -751,7 +753,6 @@ ::std::vector<::std::basic_string> GetParameterStrings() { continue; } std::basic_string test_case_name = path.parent_path().filename().native(); - // std::cout << "Adding test: " << test_case_name << std::endl; if (test_case_name.compare(0, 5, ORT_TSTR("test_")) == 0) test_case_name = test_case_name.substr(5); if (all_disabled_tests.find(test_case_name) != all_disabled_tests.end()) From 25079dedb439e94e701ea97f750dc3085608ed70 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 11:19:16 +0000 Subject: [PATCH 29/36] Fix pytest: -n, --reruns --- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 16e3deb1dba39..af70eed62368f 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -83,4 +83,4 @@ RUN apt update && apt install -y migraphx RUN pip install numpy packaging ml_dtypes==0.3.0 RUN apt install -y git -RUN pip install pytest +RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures From ff87a0e48f7a9c1f10d63d31302a514cf279da03 Mon Sep 17 00:00:00 2001 From: mindest Date: Tue, 13 Aug 2024 14:14:03 +0000 Subject: [PATCH 30/36] Restore test timeout; add missing pkgs in docker. --- cmake/onnxruntime_unittests.cmake | 4 ++-- .../linux-migraphx-ci-pipeline.yml | 4 ++-- .../migraphx-ci-pipeline-env.Dockerfile | 20 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 0d0a3eac0977c..d5c3af748e528 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -222,14 +222,14 @@ function(AddTest) ) endif() # Set test timeout to 3 hours. - set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 14400) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) else() add_test(NAME ${_UT_TARGET} COMMAND ${_UT_TARGET} ${TEST_ARGS} WORKING_DIRECTORY $ ) # Set test timeout to 3 hours. - set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 14400) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) endif() endif() endfunction(AddTest) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 353227612ded3..a9224ab26edcd 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -147,7 +147,7 @@ jobs: pool: AMD-GPU dependsOn: - Linux_Build - timeoutInMinutes: 240 + timeoutInMinutes: 120 steps: - task: DownloadPipelineArtifact@2 @@ -229,7 +229,7 @@ jobs: -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ onnxruntimerocm-cibuild-rocm$(RocmVersion) \ - pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 + "set -x; pip list; pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' condition: succeededOrFailed() diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index af70eed62368f..31c1d4f2d2903 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -16,15 +16,15 @@ RUN apt-get update && \ curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\ printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \ printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \ - apt-get update && apt-get install -y --no-install-recommends \ - sudo \ - libelf1 \ - kmod \ - file \ - python3 \ - python3-pip \ - rocm-dev \ - rocm-libs \ + apt-get update && apt-get install -y --no-install-recommends \ + sudo \ + libelf1 \ + kmod \ + file \ + python3 \ + python3-pip \ + rocm-dev \ + rocm-libs \ build-essential && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -83,4 +83,4 @@ RUN apt update && apt install -y migraphx RUN pip install numpy packaging ml_dtypes==0.3.0 RUN apt install -y git -RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures +RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures scipy==1.10.0 numpy=1.24.1 From f778f6337a1780df67a2377fa0b5f7511622be9e Mon Sep 17 00:00:00 2001 From: mindest Date: Wed, 14 Aug 2024 12:54:52 +0000 Subject: [PATCH 31/36] Fix error "==" --- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 31c1d4f2d2903..ea11fcf339913 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -83,4 +83,4 @@ RUN apt update && apt install -y migraphx RUN pip install numpy packaging ml_dtypes==0.3.0 RUN apt install -y git -RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures scipy==1.10.0 numpy=1.24.1 +RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures scipy==1.10.0 numpy==1.24.1 From d0e10dd10f1f5348a3e01118d3472e982965e545 Mon Sep 17 00:00:00 2001 From: mindest Date: Thu, 15 Aug 2024 00:56:08 +0000 Subject: [PATCH 32/36] Fix multiple cmd in docker run --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index a9224ab26edcd..17118e611a5af 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -229,7 +229,7 @@ jobs: -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ onnxruntimerocm-cibuild-rocm$(RocmVersion) \ - "set -x; pip list; pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100" + /bin/bash -c "set -x; pip list; pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' condition: succeededOrFailed() From 52f2aa2fb82bf7898ed39da9d40de2418aa6d5f6 Mon Sep 17 00:00:00 2001 From: mindest Date: Thu, 15 Aug 2024 03:45:44 +0000 Subject: [PATCH 33/36] Add cupy in docker --- .../linux/docker/migraphx-ci-pipeline-env.Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index ea11fcf339913..40dbf4ee1d9bc 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -84,3 +84,11 @@ RUN pip install numpy packaging ml_dtypes==0.3.0 RUN apt install -y git RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures scipy==1.10.0 numpy==1.24.1 + +RUN git clone https://github.com/ROCm/cupy && cd cupy && \ + git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \ + export CUPY_INSTALL_USE_HIP=1 && \ + export ROCM_HOME=/opt/rocm && \ + export HCC_AMDGPU_TARGET=gfx906,gfx908,gfx90a && \ + git submodule update --init && \ + pip install -e . --no-cache-dir -vvvv From 29be69a07cb201531bf2265f6e9e81c8f588260e Mon Sep 17 00:00:00 2001 From: mindest Date: Fri, 16 Aug 2024 03:04:01 +0000 Subject: [PATCH 34/36] Change workdir; add --use_migraphx --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 17118e611a5af..ce3564f19ec90 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -106,6 +106,7 @@ jobs: onnxruntime_BUILD_KERNEL_EXPLORER=ON \ CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ + --use_migraphx \ --use_rocm \ --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ @@ -186,7 +187,7 @@ jobs: onnxruntimerocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ - cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ + xargs -a /build/Release/perms.txt chmod a+x; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ @@ -194,6 +195,7 @@ jobs: onnxruntime_BUILD_KERNEL_EXPLORER=ON \ CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ + --use_migraphx \ --use_rocm \ --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ @@ -228,8 +230,9 @@ jobs: -e KERNEL_EXPLORER_BUILD_DIR=/build/Release \ -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ + --workdir /build/Release \ onnxruntimerocm-cibuild-rocm$(RocmVersion) \ - /bin/bash -c "set -x; pip list; pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100" + pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' condition: succeededOrFailed() From ca5caf083145251c9e59db1e10571e77d12ae1c0 Mon Sep 17 00:00:00 2001 From: mindest Date: Fri, 16 Aug 2024 08:59:53 +0000 Subject: [PATCH 35/36] Remove --use_migraphx --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index ce3564f19ec90..5a37f7eeca5bb 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -106,7 +106,6 @@ jobs: onnxruntime_BUILD_KERNEL_EXPLORER=ON \ CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ - --use_migraphx \ --use_rocm \ --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ @@ -195,7 +194,6 @@ jobs: onnxruntime_BUILD_KERNEL_EXPLORER=ON \ CMAKE_HIP_ARCHITECTURES=gfx90a \ --mpi_home /opt/ompi \ - --use_migraphx \ --use_rocm \ --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ From 2cc4d0e891fa88e81fdec3545c3f63149789989a Mon Sep 17 00:00:00 2001 From: mindest Date: Mon, 19 Aug 2024 04:14:32 +0000 Subject: [PATCH 36/36] Set cupy cache dir to avoid permission error. --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 5a37f7eeca5bb..0b7ce03b46339 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -228,7 +228,7 @@ jobs: -e KERNEL_EXPLORER_BUILD_DIR=/build/Release \ -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ - --workdir /build/Release \ + -e CUPY_CACHE_DIR=/build/Release \ onnxruntimerocm-cibuild-rocm$(RocmVersion) \ pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 workingDirectory: $(Build.SourcesDirectory)