Refactor Python Packaging Pipeline (Training CUDA 11.8) (#19910)
### Description
1. Use stages to organize the pipeline and split building from testing.
2. Move compilation to a CPU machine.
3. The test stage can reuse existing build artifacts.
4. Check the wheel size; a warning is given if the size is above 300 MB.
5. The docker image name did not change even when the build arguments changed, which caused the docker image to always be rebuilt. Updating the docker image name according to the arguments saves docker build time.

Pipeline duration reduced by 60% (2 hours -> 50 minutes).
Compilation time reduced by 75% (1.5 hours -> 20 minutes).
GPU time reduced by 87% (8 hours -> 1 hour). For debugging, GPU time can be reduced by more than 95%, because we can choose to run only one test stage and skip building.

### Motivation and Context
Make the pipeline efficient.
Optimized: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=424177&view=results
Current: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=422393&view=results
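For orientation, the split described above gives each configuration a Build stage on a CPU pool that publishes the wheel as a pipeline artifact, and a dependent Test stage on a GPU pool that downloads that artifact instead of rebuilding. The sketch below only illustrates that shape; the stage, pool, and artifact names are placeholders, and the actual template added by this commit (shown further down) is the source of truth.

```yaml
# Illustrative sketch only -- stage/artifact names are placeholders;
# the real template added in this commit is authoritative.
stages:
- stage: Build_example
  dependsOn: []
  jobs:
  - job: Build
    pool: onnxruntime-Ubuntu2204-AMD-CPU          # compilation happens on a CPU agent
    steps:
    - script: echo "run tools/ci_build/build.py --build_wheel here"
    - task: PublishBuildArtifacts@1                # hand the wheel to the test stage
      inputs:
        ArtifactName: onnxruntime_gpu_Release_example

- stage: Test_example
  dependsOn: Build_example                         # GPU stage starts only after the wheel exists
  jobs:
  - job: Test_GPU
    pool: Onnxruntime-Linux-GPU
    steps:
    - download: current                            # reuse the built wheel instead of rebuilding
      artifact: onnxruntime_gpu_Release_example
    - script: echo "pip install the wheel and run the ortmodule tests here"
```

Because the GPU stage consumes a published artifact, a debugging run can point the test stage at a wheel from an earlier pipeline run (the SpecificArtifact/BuildId parameters in the real template) and skip building entirely, which is where the quoted >95% GPU-time saving for debugging comes from.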
Showing 3 changed files with 279 additions and 178 deletions.
tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml (229 additions, 0 deletions)
parameters:
  build_py_parameters: ''
  torch_version: ''
  opset_version: ''
  cuda_version: ''
  cmake_cuda_architectures: ''
  docker_file: ''
  upload_wheel: ''
  debug_build: ''
  python_version: ''
  stage_name: ''
  SpecificArtifact: false
  BuildId: '0'

stages:
- stage: Build_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: []

  jobs:
  - job: Build
    pool: onnxruntime-Ubuntu2204-AMD-CPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - task: CmdLine@2
      displayName: 'check variables'
      inputs:
        script: |
          echo "Branch is "${{ variables['Build.SourceBranch'] }} && \
          echo "isMain is "${{ variables['isMain'] }} && \
          echo "final_storage is "${{ variables['finalStorage'] }}

    - checkout: self
      clean: true
      submodules: recursive

    - template: set-python-manylinux-variables-step.yml

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - task: CmdLine@2
      displayName: 'build onnxruntime'
      inputs:
        script: |
          set -e -x
          mkdir -p $HOME/.onnx
          docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \
            --volume /data/onnx:/data/onnx:ro \
            --volume $(Build.SourcesDirectory):/onnxruntime_src \
            --volume $(Build.BinariesDirectory):/build \
            --volume /data/models:/build/models:ro \
            --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
            -e NVIDIA_VISIBLE_DEVICES=all \
            -e NIGHTLY_BUILD \
            -e DEFAULT_TRAINING_PACKAGE_DEVICE \
            -e BUILD_BUILDNUMBER \
            -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \
            $(Repository) \
            $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
              --build_dir /build \
              --config ${{ variables['buildConfig'] }} \
              --skip_submodule_sync \
              --parallel --use_binskim_compliant_compile_flags \
              --build_wheel \
              --enable_onnx_tests \
              ${{ parameters.build_py_parameters }} \
              --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \
              --use_cuda --cuda_version=${{ parameters.cuda_version }} --cuda_home=/usr/local/cuda-${{ parameters.cuda_version }} --cudnn_home=/usr/local/cuda-${{ parameters.cuda_version }};
        workingDirectory: $(Build.SourcesDirectory)

    - task: CopyFiles@2
      displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
      inputs:
        SourceFolder: '$(Build.BinariesDirectory)'
        Contents: "${{ variables['buildConfig'] }}/dist/*.whl"
        TargetFolder: '$(Build.ArtifactStagingDirectory)'

    - task: PublishBuildArtifacts@1
      displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
      inputs:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"

    - template: component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: clean-agent-build-directory-step.yml

- stage: Test_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: Build_${{ parameters.stage_name }}
  jobs:
  - job: Test_GPU
    pool: Onnxruntime-Linux-GPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - checkout: self
      clean: true
      submodules: none

    - template: set-python-manylinux-variables-step.yml

    - template: flex-downloadPipelineArtifact.yml
      parameters:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"
        StepName: 'Download Pipeline Artifact - Linux Training Build'
        TargetPath: '$(Build.ArtifactStagingDirectory)'
        SpecificArtifact: ${{ parameters.SpecificArtifact }}
        BuildId: ${{ parameters.BuildId }}

    - script: |
        set -e -x
        whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
        echo $whlfilename ; du -sh $whlfilename ; \
        (( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 )) || ( echo 'Wheel size bigger than 300M'; exit 1)
      displayName: 'Check wheel size'
      continueOnError: true

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
      displayName: 'Mount MNIST'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
      displayName: 'Mount bert-data'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
      displayName: 'Mount hf-models-cache'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'test ortmodule'
      inputs:
        script: |
          set -ex ; \
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          echo $whlfilename ; \
          basefilename=$(basename $whlfilename) ; \
          docker run --rm \
            --gpus all \
            -e NVIDIA_VISIBLE_DEVICES=all \
            --volume $(Build.ArtifactStagingDirectory):/build \
            --volume /mnist:/mnist \
            --volume /bert_data:/bert_data \
            --volume /hf_models_cache:/hf_models_cache \
            $(Repository) \
            bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ;
        workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'Upload wheel'
      condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true')))
      inputs:
        script: |
          set -e -x
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          python3 tools/ci_build/upload_python_package_to_azure_storage.py \
            --python_wheel_path $whlfilename ${{ variables['finalStorage'] }}
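For context on how a stage template like this is typically wired in, a parent pipeline includes it under stages: and supplies the parameters declared at the top of the file. The snippet below is a hypothetical usage example only; every value (torch/opset/CUDA versions, dockerfile name, stage name, template path) is an illustrative placeholder and is not taken from this commit.

```yaml
# Hypothetical consumer pipeline -- all parameter values are illustrative placeholders.
stages:
- template: templates/py-packaging-training-cuda-stage-steps.yml
  parameters:
    build_py_parameters: '--enable_training'
    torch_version: '2.0.0'
    opset_version: '17'
    cuda_version: '11.8'
    cmake_cuda_architectures: '70;75;80;86'
    docker_file: Dockerfile.manylinux2_28_training_cuda11_8   # placeholder dockerfile name
    python_version: '3.10'
    stage_name: 'torch200_cu118_py310'
    upload_wheel: 'yes'
    debug_build: false
    SpecificArtifact: false
    BuildId: '0'
```

Presumably, setting SpecificArtifact to true together with a previous run's BuildId lets the Test_* stage fetch that run's wheel via flex-downloadPipelineArtifact.yml, matching the debugging workflow mentioned in the description.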