diff --git a/cmake/onnxruntime_kernel_explorer.cmake b/cmake/onnxruntime_kernel_explorer.cmake index 4d3db9c949daf..7de4f7b3f926b 100644 --- a/cmake/onnxruntime_kernel_explorer.cmake +++ b/cmake/onnxruntime_kernel_explorer.cmake @@ -89,4 +89,4 @@ add_dependencies(kernel_explorer onnxruntime_pybind11_state) enable_testing() find_package(Python COMPONENTS Interpreter REQUIRED) -add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..) +# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..) diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc index a739fe0a5d193..fdf64d07e0a6c 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc @@ -13,7 +13,7 @@ #include "core/providers/rocm/gpu_data_transfer.h" #include "core/providers/rocm/math/unary_elementwise_ops_impl.h" -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) #include "orttraining/training_ops/rocm/communication/nccl_service.h" #endif @@ -21,7 +21,7 @@ using namespace onnxruntime; namespace onnxruntime { -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) namespace rocm { rocm::INcclService& GetINcclService(); } @@ -155,7 +155,7 @@ struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM { info = ROCMExecutionProviderInfo::FromProviderOptions(options); } -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) rocm::INcclService& GetINcclService() override { return rocm::GetINcclService(); } diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.h 
b/onnxruntime/core/providers/rocm/rocm_provider_factory.h index 80b887af4eb75..3238d66cee479 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.h +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.h @@ -39,7 +39,7 @@ struct ProviderInfo_ROCM { virtual int hipGetDeviceCount() = 0; virtual void ROCMExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::ROCMExecutionProviderInfo& info) = 0; -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) virtual onnxruntime::rocm::INcclService& GetINcclService() = 0; #endif diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cb9887314eb66..23867f2c7cba7 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) { // when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure if (model_path.find(ORT_TSTR("_MNIST")) > 0) { - if (provider_name == "cuda" || provider_name == "openvino") { + if (provider_name == "cuda" || provider_name == "openvino" || provider_name == "rocm") { per_sample_tolerance = 2.5e-2; relative_per_sample_tolerance = 1e-2; } @@ -407,9 +407,7 @@ static constexpr ORT_STRING_VIEW provider_name_migraphx = ORT_TSTR("migraphx"); #endif static constexpr ORT_STRING_VIEW provider_name_openvino = ORT_TSTR("openvino"); static constexpr ORT_STRING_VIEW provider_name_cuda = ORT_TSTR("cuda"); -#ifdef USE_ROCM static constexpr ORT_STRING_VIEW provider_name_rocm = ORT_TSTR("rocm"); -#endif static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl"); // For any non-Android system, NNAPI will only be used for ort model converter #if defined(USE_NNAPI) && defined(__ANDROID__) @@ -521,22 +519,39 @@ ::std::vector<::std::basic_string> 
GetParameterStrings() { ORT_TSTR("operator_pow"), }; - static const ORTCHAR_T* cuda_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"), - ORT_TSTR("fp16_shufflenet"), - ORT_TSTR("fp16_tiny_yolov2"), - ORT_TSTR("candy"), - ORT_TSTR("tinyyolov3"), - ORT_TSTR("mlperf_ssd_mobilenet_300"), - ORT_TSTR("mlperf_ssd_resnet34_1200"), - ORT_TSTR("tf_inception_v1"), - ORT_TSTR("faster_rcnn"), - ORT_TSTR("split_zero_size_splits"), - ORT_TSTR("convtranspose_3d"), - ORT_TSTR("fp16_test_tiny_yolov2-Candy"), - ORT_TSTR("fp16_coreml_FNS-Candy"), - ORT_TSTR("fp16_test_tiny_yolov2"), - ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + static const ORTCHAR_T* cuda_rocm_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"), + ORT_TSTR("fp16_shufflenet"), + ORT_TSTR("fp16_tiny_yolov2"), + ORT_TSTR("candy"), + ORT_TSTR("tinyyolov3"), + ORT_TSTR("mlperf_ssd_mobilenet_300"), + ORT_TSTR("mlperf_ssd_resnet34_1200"), + ORT_TSTR("tf_inception_v1"), + ORT_TSTR("faster_rcnn"), + ORT_TSTR("split_zero_size_splits"), + ORT_TSTR("convtranspose_3d"), + ORT_TSTR("fp16_test_tiny_yolov2-Candy"), + ORT_TSTR("fp16_coreml_FNS-Candy"), + ORT_TSTR("fp16_test_tiny_yolov2"), + ORT_TSTR("fp16_test_shufflenet"), + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + // For ROCm EP, also disable the following tests due to flakiness, + // mainly with precision issue and random memory access fault. 
+ static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), + ORT_TSTR("bvlc_reference_caffenet"), + ORT_TSTR("bvlc_reference_rcnn_ilsvrc13"), + ORT_TSTR("coreml_Resnet50_ImageNet"), + ORT_TSTR("mlperf_resnet"), + ORT_TSTR("mobilenetv2-1.0"), + ORT_TSTR("shufflenet"), + // models from model zoo + ORT_TSTR("AlexNet"), + ORT_TSTR("CaffeNet"), + ORT_TSTR("MobileNet v2-7"), + ORT_TSTR("R-CNN ILSVRC13"), + ORT_TSTR("ShuffleNet-v1"), + ORT_TSTR("version-RFB-320"), + ORT_TSTR("version-RFB-640")}; static const ORTCHAR_T* openvino_disabled_tests[] = { ORT_TSTR("tf_mobilenet_v1_1.0_224"), ORT_TSTR("bertsquad"), @@ -663,8 +678,13 @@ ::std::vector<::std::basic_string> GetParameterStrings() { std::unordered_set> all_disabled_tests(std::begin(immutable_broken_tests), std::end(immutable_broken_tests)); - if (provider_name == provider_name_cuda) { - all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests)); + bool provider_cuda_or_rocm = provider_name == provider_name_cuda; + if (provider_name == provider_name_rocm) { + provider_cuda_or_rocm = true; + all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests)); + } + if (provider_cuda_or_rocm) { + all_disabled_tests.insert(std::begin(cuda_rocm_flaky_tests), std::end(cuda_rocm_flaky_tests)); } else if (provider_name == provider_name_dml) { all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests)); } else if (provider_name == provider_name_dnnl) { diff --git a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc index 2a7a7158b5f62..d5da9a7631b42 100644 --- a/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) { test.AddOutput("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f}); 
test.Run(OpTester::ExpectResult::kExpectFailure, "indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]", - {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(Scatter, InvalidIndex) { diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 32eac6f7638c1..4a197001c3d2a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1689,7 +1689,7 @@ def test_register_custom_e_ps_library(self): available_eps = C.get_available_providers() # skip amd gpu build - if "kRocmExecutionProvider" in available_eps: + if "ROCMExecutionProvider" in available_eps: return if sys.platform.startswith("win"): shared_library = "test_execution_provider.dll" diff --git a/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml new file mode 100644 index 0000000000000..7b77281b0efe2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml @@ -0,0 +1,238 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)' + +# gid of video and render group on gcramdrr1-mi100-085 and -86 +variables: + - name: video + value: 44 + - name: render + value: 109 + - name: RocmVersion + value: 6.1 + - name: RocmVersionPatchSuffix + value: ".3" + +jobs: +- job: 
Linux_Build + variables: + skipComponentGovernanceDetection: true + CCACHE_DIR: $(Pipeline.Workspace)/ccache + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-Ubuntu2204-AMD-CPU + timeoutInMinutes: 240 + + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: recursive + + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)" + Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) + + - task: Cache@2 + inputs: + key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"' + path: $(CCACHE_DIR) + cacheHitVar: CACHE_RESTORED + restoreKeys: | + "$(TODAY)" | "$(Build.SourceBranch)" + "$(TODAY)" | + displayName: Cache Task + + - script: mkdir -p $(CCACHE_DIR) + condition: ne(variables.CACHE_RESTORED, 'true') + displayName: Create Cache Dir + + - task: CmdLine@2 + inputs: + script: | + docker run --rm \ + --security-opt seccomp=unconfined \ + --shm-size=1024m \ + --user $UID:$(id -g $USER) \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume $(CCACHE_DIR):/cache \ + -e CCACHE_DIR=/cache \ + --workdir /onnxruntime_src \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + /bin/bash -c " + set -ex; \ + env; \ + ccache -s; \ + python tools/ci_build/build.py \ + --config Release \ + --cmake_extra_defines \ + CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ + onnxruntime_BUILD_KERNEL_EXPLORER=ON \ + CMAKE_HIP_ARCHITECTURES=gfx90a \ + --mpi_home /opt/ompi \ + --use_rocm \ + --rocm_version=$(RocmVersion) \ + --rocm_home /opt/rocm \ + --nccl_home /opt/rocm \ + --enable_nccl \ + --update \ + 
--build_dir /build \ + --build \ + --build_shared_lib \ + --parallel \ + --build_wheel \ + --enable_onnx_tests \ + --skip_submodule_sync \ + --use_cache \ + --skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \ + ccache -sv; \ + ccache -z" + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Build onnxruntime' + + - task: CmdLine@2 + inputs: + script: | + cd $(Build.BinariesDirectory)/Release + find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt + displayName: 'Find Executable Files' + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Artifact' + inputs: + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' + + - template: templates/explicitly-defined-final-tasks.yml + +- job: Linux_Test + workspace: + clean: all + pool: AMD-GPU + dependsOn: + - Linux_Build + timeoutInMinutes: 120 + + steps: + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact' + inputs: + buildType: 'current' + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' + + - checkout: self + clean: true + submodules: recursive + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)" + Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) + + - task: CmdLine@2 + inputs: + script: | + docker run --rm \ + --security-opt seccomp=unconfined \ + --shm-size=1024m \ + --device=/dev/kfd \ + --device=/dev/dri/renderD$DRIVER_RENDER \ + --group-add $(video) \ + --group-add $(render) \ + --user $UID:$(id -g $USER) \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --workdir /build/Release \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + /bin/bash -c " + 
set -ex; \ + xargs -a /build/Release/perms.txt chmod a+x; \ + python /onnxruntime_src/tools/ci_build/build.py \ + --config Release \ + --cmake_extra_defines \ + CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ + onnxruntime_BUILD_KERNEL_EXPLORER=ON \ + CMAKE_HIP_ARCHITECTURES=gfx90a \ + --mpi_home /opt/ompi \ + --use_rocm \ + --rocm_version=$(RocmVersion) \ + --rocm_home /opt/rocm \ + --nccl_home /opt/rocm \ + --enable_nccl \ + --build_dir /build \ + --build_shared_lib \ + --parallel \ + --build_wheel \ + --skip_submodule_sync \ + --test --enable_onnx_tests --enable_transformers_tool_test \ + --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest" + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Run onnxruntime unit tests' + + - task: CmdLine@2 + inputs: + script: |- + docker run --rm \ + --security-opt seccomp=unconfined \ + --shm-size=1024m \ + --device=/dev/kfd \ + --device=/dev/dri/renderD$DRIVER_RENDER \ + --group-add $(video) \ + --group-add $(render) \ + --user $UID:$(id -g $USER) \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + -e OPENBLAS_NUM_THREADS=1 \ + -e OMP_NUM_THREADS=1 \ + -e MKL_NUM_THREADS=1 \ + -e KERNEL_EXPLORER_BUILD_DIR=/build/Release \ + -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ + -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ + -e CUPY_CACHE_DIR=/build/Release \ + onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100 + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Run kernel explorer tests' + condition: succeededOrFailed() + + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile new file mode 100644 index 0000000000000..749e222aff499 --- /dev/null +++ 
b/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile @@ -0,0 +1,96 @@ +# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete +FROM ubuntu:22.04 + +ARG ROCM_VERSION=6.0 +ARG AMDGPU_VERSION=${ROCM_VERSION} +ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' + +CMD ["/bin/bash"] + +RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg && \ + curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\ + printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \ + printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \ + apt-get update && apt-get install -y --no-install-recommends \ + sudo \ + libelf1 \ + kmod \ + file \ + python3 \ + python3-pip \ + rocm-dev \ + rocm-libs \ + build-essential && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN groupadd -g 109 render + +# Upgrade to meet security requirements +RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \ + apt-get install -y locales cifs-utils wget half libnuma-dev lsb-release && \ + apt-get clean -y + +RUN locale-gen en_US.UTF-8 +RUN update-locale LANG=en_US.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +WORKDIR /stage + +# CMake (release selected by CMAKE_VERSION; download and unpack both use the variable) +ENV CMAKE_VERSION=3.30.1 +RUN cd /usr/local && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \ + tar -zxf /usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz --strip=1 -C /usr + +# ccache +RUN mkdir -p /tmp/ccache && \ + cd /tmp/ccache && \ + wget -q -O - https://github.com/ccache/ccache/releases/download/v4.7.4/ccache-4.7.4-linux-x86_64.tar.xz | tar --strip 1 -J -xf - && \ + 
cp /tmp/ccache/ccache /usr/bin && \ + rm -rf /tmp/ccache + +# Install Conda (the installer runs as root, so it is fetched with TLS verification enabled) +ENV PATH=/opt/miniconda/bin:${PATH} +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/miniconda && \ + conda init bash && \ + conda config --set auto_activate_base false && \ + conda update -y --all && \ + rm ~/miniconda.sh && conda clean -ya + +# Create rocm-ci environment +ENV CONDA_ENVIRONMENT_PATH=/opt/miniconda/envs/rocm-ci +ENV CONDA_DEFAULT_ENV=rocm-ci +RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9 +ENV PATH=${CONDA_ENVIRONMENT_PATH}/bin:${PATH} + +# Enable rocm-ci environment +SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"] + +# ln -sf is needed to make sure that version `GLIBCXX_3.4.30' is found +RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6 + +RUN pip install packaging \ + ml_dtypes==0.3.0 \ + pytest==7.4.4 \ + pytest-xdist \ + pytest-rerunfailures \ + scipy==1.10.0 \ + numpy==1.24.1 + +RUN apt-get update && apt-get install -y git + +# Install Cupy to decrease CPU utilization +RUN git clone https://github.com/ROCm/cupy && cd cupy && \ + git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \ + export CUPY_INSTALL_USE_HIP=1 && \ + export ROCM_HOME=/opt/rocm && \ + export HCC_AMDGPU_TARGET=gfx906,gfx908,gfx90a && \ + git submodule update --init && \ + pip install -e . --no-cache-dir -vvvv