Skip to content

Commit

Permalink
[ROCm] Python 3.10 in ROCm CI, and ROCm 6.2.3 in MigraphX CI (#22527)
Browse files Browse the repository at this point in the history
### Description
Upgrade python from 3.9 to 3.10 in ROCm and MigraphX docker files and CI
pipelines. Upgrade ROCm version to 6.2.3 in most places except ROCm CI,
see comment below.

Some improvements/upgrades on ROCm/Migraphx docker or pipeline:
* rocm 6.0/6.1.3 => 6.2.3
* python 3.9 => 3.10
* Ubuntu 20.04 => 22.04
* Also upgrade ml_dtypes, numpy and scipy packages.
* Fix message "ROCm version from ..." with correct file path in
CMakeLists.txt
* Exclude some NHWC tests since ROCm EP lacks support for NHWC
convolution.

#### ROCm CI Pipeline:
ROCm 6.1.3 is kept in the pipeline for now.
- Failed after upgrading to ROCm 6.2.3: `HIPBLAS_STATUS_INVALID_VALUE ;
GPU=0 ; hostname=76123b390aed ;
file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
; line=170 ; expr=hipblasSetStream(hipblas_handle_, stream);`. It needs
further investigation.
- cupy issues:
(1) It currently supports numpy < 1.27 and might not work with numpy 2.x,
so we pinned numpy==1.26.4 for now.
(2) cupy support of ROCm 6.2 is still in progress:
cupy/cupy#8606.

Note on miniconda: its bundled libstdc++.so.6 and libgcc_s.so.1 might
conflict with the system ones, so we created symbolic links that point
to the system versions.

#### MigraphX CI pipeline

MigraphX CI does not use cupy, and we are able to use ROCm 6.2.3 and
numpy 2.x in the pipeline.

#### Other attempts

Other things that I've tried which might help in the future: 

Attempt to use a single docker file for both ROCm and Migraphx:
#22478

Upgrade to Ubuntu 24.04 and Python 3.12, and use venv like
[this](https://github.com/microsoft/onnxruntime/blob/27903e7ff1dd7256cd2b277c03766b4f2ad9e2f1/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile).

### Motivation and Context
In 1.20 release, ROCm nuget packaging pipeline will use 6.2:
#22461.
This upgrades rocm to 6.2.3 in CI pipelines to be consistent.
  • Loading branch information
tianleiwu authored Oct 25, 2024
1 parent 28efacf commit b4afc62
Show file tree
Hide file tree
Showing 11 changed files with 70 additions and 61 deletions.
69 changes: 39 additions & 30 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,50 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "ROCM does not support build with CUDA!")
endif()

# Detect the installed ROCm version and publish it as ROCM_VERSION_DEV*.
#
# Replicates the strategy used by pytorch to get ROCM_VERSION, with modification:
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
#
# Candidate files are probed in this order; the first one that exists is used:
#   1. ${onnxruntime_ROCM_HOME}/.info/version
#        must start with "<major>.<minor>.<patch>-"
#   2. ${onnxruntime_ROCM_HOME}/include/rocm_version.h
#   3. ${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h
#        both headers must contain a double-quoted "<major>.<minor>.<patch>..." string
#
# Variables set on success:
#   ROCM_VERSION_DEV_MAJOR / ROCM_VERSION_DEV_MINOR / ROCM_VERSION_DEV_PATCH
#   ROCM_VERSION_DEV      - "<major>.<minor>.<patch>"
#   ROCM_VERSION_DEV_INT  - major*10000 + minor*100 + patch
# Configuration stops with FATAL_ERROR when no version string can be parsed.
#
# Two details matter for correctness:
#   * A literal dot is written as "\\." below. Inside a quoted CMake argument a
#     single "\." is an identity escape that collapses to ".", which the regex
#     engine would then treat as "any character".
#   * The file contents are passed as quoted arguments so that semicolons in the
#     file do not split the input and an empty file does not drop the argument.
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
  message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
  file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
  string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)-.*$" ROCM_VERSION_MATCH "${ROCM_VERSION_DEV_RAW}")
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
  message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm_version.h ****\n")
  file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
  string(REGEX MATCH "\"([0-9]+)\\.([0-9]+)\\.([0-9]+).*\"" ROCM_VERSION_MATCH "${ROCM_VERSION_H_RAW}")
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
  message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h ****\n")
  file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
  string(REGEX MATCH "\"([0-9]+)\\.([0-9]+)\\.([0-9]+).*\"" ROCM_VERSION_MATCH "${ROCM_VERSION_H_RAW}")
endif()

if (ROCM_VERSION_MATCH)
  # CMAKE_MATCH_<n> still hold the capture groups of the string(REGEX MATCH) call above.
  set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
  set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
  set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
  set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
  math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")

  message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
  message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
  message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
  message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
  message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
else()
  message(FATAL_ERROR "Cannot determine ROCm version string")
endif()


if (NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER "${onnxruntime_ROCM_HOME}/llvm/bin/clang++")
endif()

if (NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
if (ROCM_VERSION_DEV VERSION_LESS "6.2")
message(FATAL_ERROR "CMAKE_HIP_ARCHITECTURES is not set when ROCm version < 6.2")
else()
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
endif()
endif()

file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
Expand Down Expand Up @@ -328,35 +366,6 @@ if (onnxruntime_USE_ROCM)
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
endif()

# replicate strategy used by pytorch to get ROCM_VERSION
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
# with modification
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
endif()

if (ROCM_VERSION_MATCH)
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
else()
message(FATAL_ERROR "Cannot determine ROCm version string")
endif()
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.migraphx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Dockerfile to run ONNXRuntime with MIGraphX integration
#--------------------------------------------------------------------------

FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0

ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Dockerfile to run ONNXRuntime with ROCm integration
#--------------------------------------------------------------------------

FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0

ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
Expand Down
4 changes: 2 additions & 2 deletions dockerfiles/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).

## MIGraphX
**Ubuntu 20.04, ROCm6.0, MIGraphX**
**Ubuntu 22.04, ROCm6.2.3, MIGraphX**

1. Build the docker image from the Dockerfile in this repository.
```
Expand All @@ -306,7 +306,7 @@ Note: When running the container you built in Docker, please either use 'nvidia-
```

## ROCm
**Ubuntu 20.04, ROCm6.0**
**Ubuntu 22.04, ROCm6.2.3**

1. Build the docker image from the Dockerfile in this repository.
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {

// the internal NHWC operators are only included as part of contrib ops currently. as the EP requests the NHWC
// version of the ONNX operator when matching a static kernel, those are required.
#if !defined(DISABLE_CONTRIB_OPS)
#if !defined(DISABLE_CONTRIB_OPS) && !defined(USE_ROCM)
TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx";

Expand Down Expand Up @@ -256,10 +256,6 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
run_test(ort_model_path);
}

// This test can be deprecated now as the code logic has been changed so the model is not applicable
// TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) {
//}

// make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator
// is used. if something is off InferenceSession::Initialize will fail.
TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorage) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ variables:
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"
value: 6.2.3

jobs:
- job: Linux_Build
Expand All @@ -66,7 +64,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)

- task: Cache@2
Expand Down Expand Up @@ -165,7 +163,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
Expand Down
14 changes: 8 additions & 6 deletions tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ variables:
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"
value: 6.1.3

jobs:
- job: Linux_Build
Expand All @@ -66,7 +64,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: Cache@2
Expand Down Expand Up @@ -166,7 +164,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
Expand Down Expand Up @@ -231,7 +229,11 @@ jobs:
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
-e CUPY_CACHE_DIR=/build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
/bin/bash -c "
set -ex; \
python --version; \
ls /opt/miniconda/envs/rocm-ci/lib/; \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: succeededOrFailed()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG LD_LIBRARY_PATH_ARG=${DEVTOOLSET_ROOTPATH}/usr/lib64:${DEVTOOLSET_ROOTPATH}/
ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin:

FROM $BASEIMAGE AS base_image
ARG ROCM_VERSION=5.5
ARG ROCM_VERSION=6.2.3

#Add our own dependencies
ADD scripts /tmp/scripts
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
FROM ubuntu:22.04

ARG ROCM_VERSION=6.0
ARG ROCM_VERSION=6.2.3
ARG AMDGPU_VERSION=${ROCM_VERSION}
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'

Expand Down Expand Up @@ -68,7 +68,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
# Create migraphx-ci environment
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci
ENV CONDA_DEFAULT_ENV migraphx-ci
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}

# Enable migraphx-ci environment
Expand All @@ -80,4 +80,4 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi
# Install migraphx
RUN apt update && apt install -y migraphx

RUN pip install numpy packaging ml_dtypes==0.3.0
RUN pip install numpy packaging ml_dtypes==0.5.0
16 changes: 10 additions & 6 deletions tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
FROM ubuntu:22.04

ARG ROCM_VERSION=6.0
ARG ROCM_VERSION=6.1.3
ARG AMDGPU_VERSION=${ROCM_VERSION}
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'

Expand Down Expand Up @@ -67,26 +67,30 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
# Create rocm-ci environment
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/rocm-ci
ENV CONDA_DEFAULT_ENV rocm-ci
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}

# Enable rocm-ci environment
SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"]

# ln -sf is needed to make sure that version `GLIBCXX_3.4.30' is found
# Some shared libraries in the conda environment conflict with the ones installed in the Ubuntu system.
# For example, the GCC version in the conda environment is 12.x, while the one in the Ubuntu 22.04 is 11.x.
# ln -sf to make sure we always use libstdc++.so.6 and libgcc_s.so.1 in the system.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6
RUN ln -sf /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libgcc_s.so.1

RUN pip install packaging \
ml_dtypes==0.3.0 \
ml_dtypes==0.5.0 \
pytest==7.4.4 \
pytest-xdist \
pytest-rerunfailures \
scipy==1.10.0 \
numpy==1.24.1
scipy==1.14.1 \
numpy==1.26.4

RUN apt install -y git

# Install Cupy to decrease CPU utilization
# Note that the version of Cupy requires numpy < 1.27
RUN git clone https://github.com/ROCm/cupy && cd cupy && \
git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \
export CUPY_INSTALL_USE_HIP=1 && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -e -x

# version
ROCM_VERSION=6.0
ROCM_VERSION=6.2.3

while getopts "r:" parameter_Option
do case "${parameter_Option}"
Expand Down

0 comments on commit b4afc62

Please sign in to comment.