Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Linux ROCm CI Pipeline #21798

Merged
merged 5 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/onnxruntime_kernel_explorer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ add_dependencies(kernel_explorer onnxruntime_pybind11_state)

enable_testing()
find_package(Python COMPONENTS Interpreter REQUIRED)
add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_provider_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
#include "orttraining/training_ops/rocm/communication/nccl_service.h"
#endif

using namespace onnxruntime;

namespace onnxruntime {

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
namespace rocm {
rocm::INcclService& GetINcclService();
}
Expand Down Expand Up @@ -155,7 +155,7 @@ struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM {
info = ROCMExecutionProviderInfo::FromProviderOptions(options);
}

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
rocm::INcclService& GetINcclService() override {
return rocm::GetINcclService();
}
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/rocm_provider_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct ProviderInfo_ROCM {
virtual int hipGetDeviceCount() = 0;
virtual void ROCMExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::ROCMExecutionProviderInfo& info) = 0;

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
virtual onnxruntime::rocm::INcclService& GetINcclService() = 0;
#endif

Expand Down
62 changes: 41 additions & 21 deletions onnxruntime/test/providers/cpu/model_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) {

// when cuda, openvino or rocm is enabled, use larger tolerances to avoid random MNIST test failures
if (model_path.find(ORT_TSTR("_MNIST")) > 0) {
if (provider_name == "cuda" || provider_name == "openvino") {
if (provider_name == "cuda" || provider_name == "openvino" || provider_name == "rocm") {
per_sample_tolerance = 2.5e-2;
relative_per_sample_tolerance = 1e-2;
}
Expand Down Expand Up @@ -407,9 +407,7 @@ static constexpr ORT_STRING_VIEW provider_name_migraphx = ORT_TSTR("migraphx");
#endif
static constexpr ORT_STRING_VIEW provider_name_openvino = ORT_TSTR("openvino");
static constexpr ORT_STRING_VIEW provider_name_cuda = ORT_TSTR("cuda");
#ifdef USE_ROCM
static constexpr ORT_STRING_VIEW provider_name_rocm = ORT_TSTR("rocm");
#endif
static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl");
// For any non-Android system, NNAPI will only be used for ort model converter
#if defined(USE_NNAPI) && defined(__ANDROID__)
Expand Down Expand Up @@ -521,22 +519,39 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
ORT_TSTR("operator_pow"),
};

static const ORTCHAR_T* cuda_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"),
ORT_TSTR("fp16_shufflenet"),
ORT_TSTR("fp16_tiny_yolov2"),
ORT_TSTR("candy"),
ORT_TSTR("tinyyolov3"),
ORT_TSTR("mlperf_ssd_mobilenet_300"),
ORT_TSTR("mlperf_ssd_resnet34_1200"),
ORT_TSTR("tf_inception_v1"),
ORT_TSTR("faster_rcnn"),
ORT_TSTR("split_zero_size_splits"),
ORT_TSTR("convtranspose_3d"),
ORT_TSTR("fp16_test_tiny_yolov2-Candy"),
ORT_TSTR("fp16_coreml_FNS-Candy"),
ORT_TSTR("fp16_test_tiny_yolov2"),
ORT_TSTR("fp16_test_shufflenet"),
ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
static const ORTCHAR_T* cuda_rocm_flaky_tests[] = {ORT_TSTR("fp16_inception_v1"),
ORT_TSTR("fp16_shufflenet"),
ORT_TSTR("fp16_tiny_yolov2"),
ORT_TSTR("candy"),
ORT_TSTR("tinyyolov3"),
ORT_TSTR("mlperf_ssd_mobilenet_300"),
ORT_TSTR("mlperf_ssd_resnet34_1200"),
ORT_TSTR("tf_inception_v1"),
ORT_TSTR("faster_rcnn"),
ORT_TSTR("split_zero_size_splits"),
ORT_TSTR("convtranspose_3d"),
ORT_TSTR("fp16_test_tiny_yolov2-Candy"),
ORT_TSTR("fp16_coreml_FNS-Candy"),
ORT_TSTR("fp16_test_tiny_yolov2"),
ORT_TSTR("fp16_test_shufflenet"),
ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
// For the ROCm EP, also disable the following tests due to flakiness,
// mainly caused by precision issues and random memory access faults.
static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"),
mindest marked this conversation as resolved.
Show resolved Hide resolved
ORT_TSTR("bvlc_reference_caffenet"),
ORT_TSTR("bvlc_reference_rcnn_ilsvrc13"),
ORT_TSTR("coreml_Resnet50_ImageNet"),
ORT_TSTR("mlperf_resnet"),
ORT_TSTR("mobilenetv2-1.0"),
ORT_TSTR("shufflenet"),
// models from model zoo
ORT_TSTR("AlexNet"),
ORT_TSTR("CaffeNet"),
ORT_TSTR("MobileNet v2-7"),
ORT_TSTR("R-CNN ILSVRC13"),
ORT_TSTR("ShuffleNet-v1"),
ORT_TSTR("version-RFB-320"),
ORT_TSTR("version-RFB-640")};
static const ORTCHAR_T* openvino_disabled_tests[] = {
ORT_TSTR("tf_mobilenet_v1_1.0_224"),
ORT_TSTR("bertsquad"),
Expand Down Expand Up @@ -663,8 +678,13 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {

std::unordered_set<std::basic_string<ORTCHAR_T>> all_disabled_tests(std::begin(immutable_broken_tests),
std::end(immutable_broken_tests));
if (provider_name == provider_name_cuda) {
all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests));
bool provider_cuda_or_rocm = provider_name == provider_name_cuda;
if (provider_name == provider_name_rocm) {
provider_cuda_or_rocm = true;
all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests));
}
if (provider_cuda_or_rocm) {
all_disabled_tests.insert(std::begin(cuda_rocm_flaky_tests), std::end(cuda_rocm_flaky_tests));
} else if (provider_name == provider_name_dml) {
all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests));
} else if (provider_name == provider_name_dnnl) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) {
test.AddOutput<float>("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f});
test.Run(OpTester::ExpectResult::kExpectFailure,
"indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]",
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider});
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider});
}

TEST(Scatter, InvalidIndex) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/python/onnxruntime_test_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_register_custom_e_ps_library(self):

available_eps = C.get_available_providers()
# skip amd gpu build
if "kRocmExecutionProvider" in available_eps:
if "ROCMExecutionProvider" in available_eps:
return
if sys.platform.startswith("win"):
shared_library = "test_execution_provider.dll"
Expand Down
238 changes: 238 additions & 0 deletions tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
trigger:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
pr:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
#### end trigger ####

name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)'

# gid of video and render group on gcramdrr1-mi100-085 and -86
variables:
- name: video
value: 44
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"

jobs:
- job: Linux_Build
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
workspace:
clean: all
pool: onnxruntime-Ubuntu2204-AMD-CPU
timeoutInMinutes: 240

steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()

- checkout: self
clean: true
submodules: recursive


- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: Cache@2
inputs:
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
path: $(CCACHE_DIR)
cacheHitVar: CACHE_RESTORED
restoreKeys: |
"$(TODAY)" | "$(Build.SourceBranch)"
"$(TODAY)" |
displayName: Cache Task

- script: mkdir -p $(CCACHE_DIR)
condition: ne(variables.CACHE_RESTORED, 'true')
displayName: Create Cache Dir

- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $(CCACHE_DIR):/cache \
-e CCACHE_DIR=/cache \
--workdir /onnxruntime_src \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
env; \
ccache -s; \
python tools/ci_build/build.py \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--update \
--build_dir /build \
--build \
--build_shared_lib \
--parallel \
--build_wheel \
--enable_onnx_tests \
--skip_submodule_sync \
--use_cache \
--skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \
mindest marked this conversation as resolved.
Show resolved Hide resolved
ccache -sv; \
ccache -z"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Build onnxruntime'

- task: CmdLine@2
inputs:
script: |
cd $(Build.BinariesDirectory)/Release
find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt
displayName: 'Find Executable Files'

- task: PublishPipelineArtifact@0
displayName: 'Publish Pipeline Artifact'
inputs:
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'

- template: templates/explicitly-defined-final-tasks.yml

- job: Linux_Test
workspace:
clean: all
pool: AMD-GPU
dependsOn:
- Linux_Build
timeoutInMinutes: 120

steps:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Artifact'
inputs:
buildType: 'current'
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'

- checkout: self
clean: true
submodules: recursive

- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--workdir /build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
xargs -a /build/Release/perms.txt chmod a+x; \
python /onnxruntime_src/tools/ci_build/build.py \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--build_dir /build \
--build_shared_lib \
--parallel \
--build_wheel \
--skip_submodule_sync \
--test --enable_onnx_tests --enable_transformers_tool_test \
--cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'

- task: CmdLine@2
inputs:
script: |-
mindest marked this conversation as resolved.
Show resolved Hide resolved
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
-e OPENBLAS_NUM_THREADS=1 \
-e OPENMP_NUM_THREADS=1 \
-e MKL_NUM_THREADS=1 \
-e KERNEL_EXPLORER_BUILD_DIR=/build/Release \
-e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
-e CUPY_CACHE_DIR=/build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: succeededOrFailed()

- template: templates/clean-agent-build-directory-step.yml
Loading
Loading