From fb085c09d051e8308070d4fc42c4344dbcadbd7a Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Fri, 17 Nov 2023 12:25:54 -0800 Subject: [PATCH 1/6] build ort training packaging pipeline for CUDA 12 --- ...ttraining-py-packaging-pipeline-cuda12.yml | 22 +++ ...Dockerfile.manylinux2_28_training_cuda12_2 | 180 ++++++++++++++++++ .../requirements.txt | 8 + 3 files changed, 210 insertions(+) create mode 100644 tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 create mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml new file mode 100644 index 0000000000000..422fb33eec5de --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml @@ -0,0 +1,22 @@ +trigger: none + +resources: + repositories: + - repository: manylinux + type: Github + endpoint: Microsoft + name: pypa/manylinux + ref: 5eda9aded5462201e6310105728d33016e637ea7 + +stages: +- template: templates/py-packaging-training-cuda-stage.yml + parameters: + build_py_parameters: --enable_training --update --build + torch_version: '2.1.0' + opset_version: '15' + cuda_version: '12.2' + cmake_cuda_architectures: 70;75;80;86;90 + docker_file: Dockerfile.manylinux2_28_training_cuda12_2 + agent_pool: Onnxruntime-Linux-GPU + upload_wheel: 'yes' + debug_build: false diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 new file mode 100644 index 0000000000000..a36f60b87768d --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 @@ -0,0 +1,180 @@ +ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubi8 +ARG POLICY=manylinux2014 +ARG PLATFORM=x86_64 +ARG DEVTOOLSET_ROOTPATH= +ARG LD_LIBRARY_PATH_ARG= +ARG PREPEND_PATH= + +#We need both CUDA and manylinux. But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: +#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and +#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. +#So we use CUDA as the base image then add manylinux on top of it. + +#Build manylinux2014 docker image begin +FROM $BASEIMAGE AS runtime_base +ARG POLICY +ARG PLATFORM +ARG DEVTOOLSET_ROOTPATH +ARG LD_LIBRARY_PATH_ARG +ARG PREPEND_PATH +LABEL maintainer="The ManyLinux project" + +ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} +ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 +ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} +ENV PATH=${PREPEND_PATH}${PATH} +ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig + +# first copy the fixup mirrors script, keep the script around +COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors + +# setup entrypoint, this will wrap commands with `linux32` with i686 images +COPY build_scripts/install-entrypoint.sh \ + build_scripts/build_utils.sh \ + /build_scripts/ + +RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts +COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint +ENTRYPOINT ["manylinux-entrypoint"] + +COPY build_scripts/install-runtime-packages.sh \ + build_scripts/build_utils.sh \ + /build_scripts/ +RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ + +COPY build_scripts/build_utils.sh /build_scripts/ + +COPY build_scripts/install-autoconf.sh /build_scripts/ +RUN export AUTOCONF_ROOT=autoconf-2.71 && \ + export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ + export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ + manylinux-entrypoint /build_scripts/install-autoconf.sh + +COPY build_scripts/install-automake.sh /build_scripts/ +RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ + export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ + export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ + manylinux-entrypoint /build_scripts/install-automake.sh + +COPY build_scripts/install-libtool.sh /build_scripts/ +RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ + export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ + export LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ + manylinux-entrypoint /build_scripts/install-libtool.sh + +COPY build_scripts/install-libxcrypt.sh /build_scripts/ +RUN export LIBXCRYPT_VERSION=4.4.28 && \ + export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ + export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ + export PERL_ROOT=perl-5.34.0 && \ + export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ + export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ + manylinux-entrypoint /build_scripts/install-libxcrypt.sh + +FROM runtime_base AS build_base +COPY build_scripts/install-build-packages.sh /build_scripts/ +RUN manylinux-entrypoint /build_scripts/install-build-packages.sh + + +FROM build_base AS build_git +COPY build_scripts/build-git.sh /build_scripts/ +RUN export GIT_ROOT=git-2.36.2 && \ + export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ + export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ + manylinux-entrypoint /build_scripts/build-git.sh + + +FROM build_base AS build_cpython +COPY build_scripts/build-sqlite3.sh /build_scripts/ +RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ + export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ + export SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ + manylinux-entrypoint /build_scripts/build-sqlite3.sh + +COPY build_scripts/build-openssl.sh /build_scripts/ +RUN export OPENSSL_ROOT=openssl-1.1.1q && \ + export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ + export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ + manylinux-entrypoint /build_scripts/build-openssl.sh + +COPY build_scripts/build-cpython.sh /build_scripts/ + + +FROM build_cpython AS build_cpython38 +COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 + + +FROM build_cpython AS build_cpython39 +COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 + + +FROM build_cpython AS build_cpython310 +COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 + +FROM build_cpython AS build_cpython311 +COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 + +FROM build_cpython AS all_python +COPY build_scripts/install-pypy.sh \ + build_scripts/pypy.sha256 \ + build_scripts/finalize-python.sh \ + /build_scripts/ +RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 +RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 +COPY --from=build_cpython38 /opt/_internal /opt/_internal/ +COPY --from=build_cpython39 /opt/_internal /opt/_internal/ +COPY --from=build_cpython310 /opt/_internal /opt/_internal/ +COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +RUN manylinux-entrypoint /build_scripts/finalize-python.sh + + +FROM runtime_base +COPY --from=build_git /manylinux-rootfs / +COPY --from=build_cpython /manylinux-rootfs / +COPY --from=all_python /opt/_internal /opt/_internal/ +COPY build_scripts/finalize.sh \ + build_scripts/python-tag-abi-tag.py \ + build_scripts/requirements3.8.txt \ + build_scripts/requirements3.9.txt \ + build_scripts/requirements3.10.txt \ + build_scripts/requirements3.11.txt \ + build_scripts/requirements-base-tools.txt \ + /build_scripts/ +COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ +RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +CMD ["/bin/bash"] + +#Build manylinux2014 docker image end +ARG PYTHON_VERSION=3.9 +ARG TORCH_VERSION=2.1.0 +ARG OPSET_VERSION=15 +ARG INSTALL_DEPS_EXTRA_ARGS + +#Add our own dependencies +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && \ + /tmp/scripts/manylinux/install_centos.sh && \ + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_rust.sh + +ENV PATH="/root/.cargo/bin/:$PATH" + +RUN /tmp/scripts/install_ninja.sh && \ + /tmp/scripts/install_python_deps.sh -d gpu -v 12.2 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER +ENV PATH /usr/local/dotnet:$PATH +ENV ORTMODULE_ONNX_OPSET_VERSION=$OPSET_VERSION diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt new file mode 100644 index 0000000000000..d0b94ba6e3628 --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt @@ -0,0 +1,8 @@ +--pre +-f https://download.pytorch.org/whl/torch_stable.html +torch==2.1.0+cu121 +torchvision==0.16.1+cu121 +torchtext==0.16.1 +# TODO(bmeswani): packaging 22.0 removes support for LegacyVersion leading to errors because transformers 4.4.2 uses LegacyVersion +packaging==21.3 +setuptools>=68.2.2 From a24010c62be7e965218060e1798115fa118bd566 Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Fri, 17 Nov 2023 15:35:32 -0800 Subject: [PATCH 2/6] update requirements --- .../stage1/requirements_torch2.1.0_cu12.2/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt index d0b94ba6e3628..056438a79c6f8 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt @@ -1,8 +1,8 @@ --pre -f https://download.pytorch.org/whl/torch_stable.html torch==2.1.0+cu121 -torchvision==0.16.1+cu121 -torchtext==0.16.1 +torchvision==0.16.0+cu121 +torchtext==0.16.0 # TODO(bmeswani): packaging 22.0 removes support for LegacyVersion leading to errors because transformers 4.4.2 uses LegacyVersion packaging==21.3 setuptools>=68.2.2 From 87d95003163ab91e1a42004d4fa2080ea46b3c1a Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Mon, 20 Nov 2023 13:47:59 -0800 Subject: [PATCH 3/6] add package name for cuda 12 --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 1c04433c9a7ca..4df48239c8cbd 100644 --- a/setup.py +++ b/setup.py @@ -536,6 +536,9 @@ def finalize_options(self): # Training ROCM package for ADO feeds is called onnxruntime-training-rocm package_name = "onnxruntime-training-rocm" + if cuda_version and cuda_version.split(".")[0] == "12": + package_name = "onnxruntime-training-cuda12" + if package_name == "onnxruntime-tvm": packages += ["onnxruntime.providers.tvm"] From 937551d27f3a9984231cba1f960c7f74e997e1a0 Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Mon, 20 Nov 2023 16:03:33 -0800 Subject: [PATCH 4/6] remove cuda 12 package version name --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index 608bffc082e07..798c8c4b2895b 100644 --- a/setup.py +++ b/setup.py @@ -535,9 +535,6 @@ def finalize_options(self): # Training ROCM package for ADO feeds is called onnxruntime-training-rocm package_name = "onnxruntime-training-rocm" - if cuda_version and cuda_version.split(".")[0] == "12": - package_name = "onnxruntime-training-cuda12" - if package_name == "onnxruntime-tvm": packages += ["onnxruntime.providers.tvm"] From 964efa3f05d458643d9f529d07252ef6505bc323 Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Mon, 20 Nov 2023 16:20:47 -0800 Subject: [PATCH 5/6] test latest packaging --- .../stage1/requirements_torch2.1.0_cu12.2/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt index 056438a79c6f8..25658a2c8a2f2 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt @@ -4,5 +4,5 @@ torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchtext==0.16.0 # TODO(bmeswani): packaging 22.0 removes support for LegacyVersion leading to errors because transformers 4.4.2 uses LegacyVersion -packaging==21.3 +packaging==23.1 setuptools>=68.2.2 From 312295d18ea1434c00e2254bed4c677622315de7 Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Mon, 20 Nov 2023 21:41:14 -0800 Subject: [PATCH 6/6] remove comment which is no longer required --- .../stage1/requirements_torch2.1.0_cu12.2/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt index 25658a2c8a2f2..152a17db90366 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt @@ -3,6 +3,5 @@ torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchtext==0.16.0 -# TODO(bmeswani): packaging 22.0 removes support for LegacyVersion leading to errors because transformers 4.4.2 uses LegacyVersion packaging==23.1 setuptools>=68.2.2