From 5d4d88c3529e477cd55aabeedd71a6b3afe9d87c Mon Sep 17 00:00:00 2001 From: peixuanzuo Date: Fri, 15 Sep 2023 06:16:00 +0000 Subject: [PATCH 1/2] try use manylinux build --- .../orttraining-pai-ci-pipeline.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 523390debc887..55ad049d1e5e1 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -41,11 +41,15 @@ jobs: clean: true submodules: recursive - - template: templates/get-docker-image-steps.yml + - template: set-python-manylinux-variables-step.yml + - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)" + DockerBuildArgs: >- + --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur + --network=host + --build-arg ROCM_VERSION=$(RocmVersion) Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build #- script: |- @@ -72,19 +76,23 @@ jobs: script: |- export ROCM_HOME=/opt/rocm docker run --rm \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ --security-opt seccomp=unconfined \ --shm-size=1024m \ --user $UID:$(id -g $USER) \ + -e CC=/opt/rh/devtoolset-10/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-10/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + -e CCACHE_DIR=/cache \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --volume $(CCACHE_DIR):/cache \ - -e CCACHE_DIR=/cache \ --workdir /onnxruntime_src \ onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build \ /bin/bash -c " set -ex; \ ccache -s; \ - python tools/ci_build/build.py \ + /opt/python/cp38-cp38/bin/python3 tools/ci_build/build.py \ --config $(BuildConfig) \ --enable_training \ --mpi_home /opt/ompi \ From 31dc91a1039429d7c77aaf3fdc717999a23d6b66 Mon Sep 17 00:00:00 2001 From: peixuanzuo Date: Fri, 15 Sep 2023 06:19:11 +0000 Subject: [PATCH 2/2] update --- .../orttraining-pai-ci-pipeline.yml | 117 ++++++++++++++++-- .../docker/Dockerfile.manylinux2_28_rocm | 7 ++ 2 files changed, 112 insertions(+), 12 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 55ad049d1e5e1..3333a7d22a41b 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -11,6 +11,14 @@ pr: - 'onnxruntime/core/providers/js' name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)' +resources: + repositories: + - repository: manylinux + type: Github + endpoint: Microsoft + name: pypa/manylinux + ref: 5eda9aded5462201e6310105728d33016e637ea7 + variables: - name: video value: 44 @@ -22,7 +30,7 @@ variables: value: Release jobs: -- job: Linux_Build +- job: Linux_Build_manylinux variables: skipComponentGovernanceDetection: true CCACHE_DIR: $(Pipeline.Workspace)/ccache @@ -41,15 +49,105 @@ jobs: clean: true submodules: recursive - - template: set-python-manylinux-variables-step.yml - - template: get-docker-image-steps.yml + - template: templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm Context: tools/ci_build/github/linux/docker DockerBuildArgs: >- --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur - --network=host + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 + --build-arg BUILD_UID=$(id -u) --build-arg ROCM_VERSION=$(RocmVersion) + --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root + --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: + --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib + Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-manylinux-build + + - task: Cache@2 + inputs: + key: '"manylinux" | "$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"' + path: $(CCACHE_DIR) + cacheHitVar: CACHE_RESTORED + restoreKeys: | + "manylinux" | "$(TODAY)" | "$(Build.SourceBranch)" + "manylinux" | "$(TODAY)" | + displayName: Cache Task + + - script: mkdir -p $(CCACHE_DIR) + condition: ne(variables.CACHE_RESTORED, 'true') + displayName: Create Cache Dir + + - task: CmdLine@2 + inputs: + script: |- + export ROCM_HOME=/opt/rocm + docker run --rm \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size=1024m \ + --user $UID:$(id -g $USER) \ + -e CC=/opt/rh/gcc-toolset-12/root/usr/bin/cc -e CXX=/opt/rh/gcc-toolset-12/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + -e CCACHE_DIR=/cache \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume $(CCACHE_DIR):/cache \ + --workdir /onnxruntime_src \ + onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-manylinux-build \ + /bin/bash -c " + set -ex; \ + ccache -s; \ + /opt/python/cp38-cp38/bin/python3 tools/ci_build/build.py \ + --config $(BuildConfig) \ + --enable_training \ + --mpi_home /opt/ompi \ + --cmake_extra_defines \ + CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \ + onnxruntime_BUILD_UNIT_TESTS=OFF \ + FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER \ + --use_cache \ + --use_rocm \ + --rocm_version=$(RocmVersion) \ + --rocm_home ${ROCM_HOME} \ + --nccl_home ${ROCM_HOME}\ + --update \ + --build_dir /build \ + --build \ + --parallel \ + --build_wheel \ + --skip_submodule_sync \ + --skip_tests; \ + ccache -sv; \ + ccache -z" + displayName: 'Build onnxruntime' + + - template: templates/explicitly-defined-final-tasks.yml + +- job: Linux_Build_ubuntu + variables: + skipComponentGovernanceDetection: true + CCACHE_DIR: $(Pipeline.Workspace)/ccache + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-Ubuntu2004-AMD-CPU + timeoutInMinutes: 120 + + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: recursive + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)" Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build #- script: |- @@ -76,23 +174,19 @@ jobs: script: |- export ROCM_HOME=/opt/rocm docker run --rm \ - --ipc=host \ - --network=host \ - --cap-add=SYS_PTRACE \ --security-opt seccomp=unconfined \ --shm-size=1024m \ --user $UID:$(id -g $USER) \ - -e CC=/opt/rh/devtoolset-10/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-10/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ - -e CCACHE_DIR=/cache \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --volume $(CCACHE_DIR):/cache \ + -e CCACHE_DIR=/cache \ --workdir /onnxruntime_src \ onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build \ /bin/bash -c " set -ex; \ ccache -s; \ - /opt/python/cp38-cp38/bin/python3 tools/ci_build/build.py \ + python tools/ci_build/build.py \ --config $(BuildConfig) \ --enable_training \ --mpi_home /opt/ompi \ @@ -123,8 +217,7 @@ jobs: - template: templates/explicitly-defined-final-tasks.yml - -- job: Linux_Test +- job: Linux_Test_ubuntu workspace: clean: all pool: AMD-GPU diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 10ce8f0ed65f7..19599c9f613d4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -185,6 +185,13 @@ RUN cd /tmp/scripts && \ rm -rf /tmp/scripts +# Install ccache to reuse this dockerfile for CI +RUN mkdir -p /tmp/ccache && \ + cd /tmp/ccache && \ + wget -q -O - https://github.com/ccache/ccache/releases/download/v4.7.4/ccache-4.7.4-linux-x86_64.tar.xz | tar --strip 1 -J -xf - && \ + cp /tmp/ccache/ccache /usr/bin && \ + rm -rf /tmp/ccache + ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER