Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Test] Test new ROCm CI on MIGraphX CI. #21614

Closed
mindest wants to merge 37 commits into main from linmin/test_new_ci
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b2b39c9
Test new CI on MIGraphX CI.
mindest Aug 5, 2024
b9efcf0
test: turn kernel explorer on
mindest Aug 5, 2024
26607a8
test: restore original settings
mindest Aug 5, 2024
2ecbf49
Include nccl_service only in training
mindest Aug 6, 2024
22c3dd1
Dockerfile: add git install
mindest Aug 6, 2024
4bedff8
Turn on ck and ke
mindest Aug 6, 2024
f3fe61f
Correct flag --test.
mindest Aug 6, 2024
7c03f61
Specify arch gfx90a only; enable_training macro
mindest Aug 6, 2024
16105ad
Disable test_kernels; cd to /tmp
mindest Aug 6, 2024
eca3ba7
test: some debug outputs; cache_dir
mindest Aug 6, 2024
9152170
test: revert
mindest Aug 6, 2024
5f5cccf
Correct image name
mindest Aug 7, 2024
dd54945
Correct image name 2
mindest Aug 7, 2024
10febab
test: cache_dir
mindest Aug 7, 2024
62101f7
Remove --update in test part.
mindest Aug 7, 2024
1d0b082
Remove --build in test part.
mindest Aug 7, 2024
d54bb42
Add --build_shared_lib --enable_onnx_tests
mindest Aug 7, 2024
f451a9f
Fix scatter op test for ROCm EP.
mindest Aug 7, 2024
1b9f19a
test: mount /data/build dir
mindest Aug 8, 2024
68eec09
test: extend test time
mindest Aug 8, 2024
2f4f7db
test: overwrite test timeout to 4h
mindest Aug 9, 2024
705a456
Add disabled tests list; add ke test
mindest Aug 12, 2024
a6d9b5c
Fix EP name.
mindest Aug 13, 2024
f263bf9
Fix ke test setting
mindest Aug 13, 2024
437edab
Merge branch 'main' into linmin/test_new_ci
mindest Aug 13, 2024
a4ad7b8
Fix lint, unused variable; ke user.
mindest Aug 13, 2024
664122f
Remove ifdef for ROCm
mindest Aug 13, 2024
4ff9fe9
Add pytest in Dockerfile
mindest Aug 13, 2024
a9e9be0
Fix error
mindest Aug 13, 2024
25079de
Fix pytest: -n, --reruns
mindest Aug 13, 2024
ff87a0e
Restore test timeout; add missing pkgs in docker.
mindest Aug 13, 2024
f778f63
Fix error "=="
mindest Aug 14, 2024
d0e10dd
Fix multiple cmd in docker run
mindest Aug 15, 2024
52f2aa2
Add cupy in docker
mindest Aug 15, 2024
29be69a
Change workdir; add --use_migraphx
mindest Aug 16, 2024
ca5caf0
Remove --use_migraphx
mindest Aug 16, 2024
2cc4d0e
Set cupy cache dir to avoid permission error.
mindest Aug 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/onnxruntime_kernel_explorer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ add_dependencies(kernel_explorer onnxruntime_pybind11_state)

enable_testing()
find_package(Python COMPONENTS Interpreter REQUIRED)
add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_provider_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
#include "orttraining/training_ops/rocm/communication/nccl_service.h"
#endif

using namespace onnxruntime;

namespace onnxruntime {

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
namespace rocm {
rocm::INcclService& GetINcclService();
}
Expand Down Expand Up @@ -155,7 +155,7 @@ struct ProviderInfo_ROCM_Impl final : ProviderInfo_ROCM {
info = ROCMExecutionProviderInfo::FromProviderOptions(options);
}

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
rocm::INcclService& GetINcclService() override {
return rocm::GetINcclService();
}
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/rocm_provider_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct ProviderInfo_ROCM {
virtual int hipGetDeviceCount() = 0;
virtual void ROCMExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::ROCMExecutionProviderInfo& info) = 0;

#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING)
virtual onnxruntime::rocm::INcclService& GetINcclService() = 0;
#endif

Expand Down
26 changes: 22 additions & 4 deletions onnxruntime/test/providers/cpu/model_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ TEST_P(ModelTest, Run) {

// when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure
if (model_path.find(ORT_TSTR("_MNIST")) > 0) {
if (provider_name == "cuda" || provider_name == "openvino") {
if (provider_name == "cuda" || provider_name == "openvino" || provider_name == "rocm") {
per_sample_tolerance = 2.5e-2;
relative_per_sample_tolerance = 1e-2;
}
Expand Down Expand Up @@ -407,9 +407,7 @@ static constexpr ORT_STRING_VIEW provider_name_migraphx = ORT_TSTR("migraphx");
#endif
static constexpr ORT_STRING_VIEW provider_name_openvino = ORT_TSTR("openvino");
static constexpr ORT_STRING_VIEW provider_name_cuda = ORT_TSTR("cuda");
#ifdef USE_ROCM
static constexpr ORT_STRING_VIEW provider_name_rocm = ORT_TSTR("rocm");
#endif
static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl");
// For any non-Android system, NNAPI will only be used for ort model converter
#if defined(USE_NNAPI) && defined(__ANDROID__)
Expand Down Expand Up @@ -537,6 +535,21 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
ORT_TSTR("fp16_test_tiny_yolov2"),
ORT_TSTR("fp16_test_shufflenet"),
ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"),
ORT_TSTR("bvlc_reference_caffenet"),
ORT_TSTR("bvlc_reference_rcnn_ilsvrc13"),
ORT_TSTR("coreml_Resnet50_ImageNet"),
ORT_TSTR("mlperf_resnet"),
ORT_TSTR("mobilenetv2-1.0"),
ORT_TSTR("shufflenet"),
// models from model zoo
ORT_TSTR("AlexNet"),
ORT_TSTR("CaffeNet"),
ORT_TSTR("MobileNet v2-7"),
ORT_TSTR("R-CNN ILSVRC13"),
ORT_TSTR("ShuffleNet-v1"),
ORT_TSTR("version-RFB-320"),
ORT_TSTR("version-RFB-640")};
static const ORTCHAR_T* openvino_disabled_tests[] = {
ORT_TSTR("tf_mobilenet_v1_1.0_224"),
ORT_TSTR("bertsquad"),
Expand Down Expand Up @@ -663,7 +676,12 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {

std::unordered_set<std::basic_string<ORTCHAR_T>> all_disabled_tests(std::begin(immutable_broken_tests),
std::end(immutable_broken_tests));
if (provider_name == provider_name_cuda) {
bool provider_cuda_or_rocm = provider_name == provider_name_cuda;
if (provider_name == provider_name_rocm) {
provider_cuda_or_rocm = true;
all_disabled_tests.insert(std::begin(rocm_disabled_tests), std::end(rocm_disabled_tests));
}
if (provider_cuda_or_rocm) {
all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests));
} else if (provider_name == provider_name_dml) {
all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests));
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/tensor/scatter_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ static void scatter_invalid_index(const char* op_name, int op_version) {
test.AddOutput<float>("y", {4, 2, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 5.0f, 0.0f});
test.Run(OpTester::ExpectResult::kExpectFailure,
"indices element out of data bounds, idx=4 must be within the inclusive range [-4,3]",
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider});
{kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider});
}

TEST(Scatter, InvalidIndex) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/python/onnxruntime_test_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_register_custom_e_ps_library(self):

available_eps = C.get_available_providers()
# skip amd gpu build
if "kRocmExecutionProvider" in available_eps:
if "ROCMExecutionProvider" in available_eps:
return
if sys.platform.startswith("win"):
shared_library = "test_execution_provider.dll"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: Cache@2
inputs:
Expand Down Expand Up @@ -94,29 +94,30 @@ jobs:
--volume $(CCACHE_DIR):/cache \
-e CCACHE_DIR=/cache \
--workdir /onnxruntime_src \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
env; \
ccache -s; \
python tools/ci_build/build.py \
--config Release \
--enable_training \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=OFF \
onnxruntime_USE_COMPOSABLE_KERNEL=OFF \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_migraphx \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--update \
--build_dir /build \
--build \
--build_shared_lib \
--parallel \
--build_wheel \
--enable_onnx_tests \
--skip_submodule_sync \
--use_cache \
--skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \
Expand Down Expand Up @@ -165,7 +166,7 @@ jobs:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
inputs:
Expand All @@ -180,13 +181,58 @@ jobs:
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--workdir /build/Release \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \
bash /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh"
xargs -a /build/Release/perms.txt chmod a+x; \
python /onnxruntime_src/tools/ci_build/build.py \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
CMAKE_HIP_ARCHITECTURES=gfx90a \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--build_dir /build \
--build_shared_lib \
--parallel \
--build_wheel \
--skip_submodule_sync \
--test --enable_onnx_tests --enable_transformers_tool_test \
--cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'

- task: CmdLine@2
inputs:
script: |-
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
-e OPENBLAS_NUM_THREADS=1 \
-e OPENMP_NUM_THREADS=1 \
-e MKL_NUM_THREADS=1 \
-e KERNEL_EXPLORER_BUILD_DIR=/build/Release \
-e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
-e CUPY_CACHE_DIR=/build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: succeededOrFailed()

- template: templates/clean-agent-build-directory-step.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ RUN apt-get update && \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\
printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
apt-get update && apt-get install -y --no-install-recommends \
sudo \
libelf1 \
kmod \
file \
python3 \
python3-pip \
rocm-dev \
rocm-libs \
apt-get update && apt-get install -y --no-install-recommends \
sudo \
libelf1 \
kmod \
file \
python3 \
python3-pip \
rocm-dev \
rocm-libs \
build-essential && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -81,3 +81,14 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi
RUN apt update && apt install -y migraphx

RUN pip install numpy packaging ml_dtypes==0.3.0

RUN apt install -y git
RUN pip install pytest==7.4.4 pytest-xdist pytest-rerunfailures scipy==1.10.0 numpy==1.24.1

RUN git clone https://github.com/ROCm/cupy && cd cupy && \
git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \
export CUPY_INSTALL_USE_HIP=1 && \
export ROCM_HOME=/opt/rocm && \
export HCC_AMDGPU_TARGET=gfx906,gfx908,gfx90a && \
git submodule update --init && \
pip install -e . --no-cache-dir -vvvv
Loading