From 5a18818e1d5b284f85bb07f3493228d42d2bd70d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 10 May 2024 06:44:29 +0800 Subject: [PATCH] Migrate training storage from SAS to managed identity (#20618) ### Description The orttrainingtestdatascus account stores only the MNIST dataset, whose size is only 64 MB, in Azure File. To meet security requirements and reduce maintenance cost, move the test data to lotusscus and store it in Azure Blob. --- ...ortmodule-distributed-test-ci-pipeline.yml | 8 +++---- .../jobs/download_training_test_data.yml | 8 +++++++ ...orttraining-linux-gpu-test-ci-pipeline.yml | 16 ++------------ ...py-packaging-training-cuda-stage-steps.yml | 21 +++---------------- 4 files changed, 16 insertions(+), 37 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/templates/jobs/download_training_test_data.yml diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index 654bc0921556a..2c6b6183a9aa0 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -41,6 +41,8 @@ stages: clean: true submodules: recursive + - template: templates/jobs/download_training_test_data.yml + - template: templates/run-docker-build-steps.yml parameters: RunDockerBuildArgs: | @@ -58,10 +60,6 @@ stages: -e DisplayName: 'Build' - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - # Entry point for all ORTModule distributed tests # Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for 
guidelines on how to add new tests to this pipeline. - script: | @@ -71,7 +69,7 @@ stages: --rm \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ - --volume /mnist:/mnist \ + --volume $(Agent.TempDirectory)/mnist:/mnist \ onnxruntime_ortmodule_distributed_tests_image \ bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ displayName: 'Run orttraining_ortmodule_distributed_tests.py' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_training_test_data.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_training_test_data.yml new file mode 100644 index 0000000000000..8f6434f7ac40d --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_training_test_data.yml @@ -0,0 +1,8 @@ +steps: + - script: | + azcopy cp --recursive https://lotusscus.blob.core.windows.net/orttrainingtestdatascus/mnist/ $(Agent.TempDirectory) + displayName: 'Download Training Test Data MNIST' + + - script: | + ls -al $(Agent.TempDirectory)/mnist + displayName: 'Print contents of Training Test Data MNIST' diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml index 5dc156e301357..f832315c1f0df 100644 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml @@ -6,17 +6,7 @@ parameters: steps: -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p 
$(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" - displayName: 'Mount bert-data' - condition: succeededOrFailed() - -- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" - displayName: 'Mount hf-models-cache' - condition: succeededOrFailed() +- template: jobs/download_training_test_data.yml # Entry point for all ORTModule tests # The onnxruntime folder is deleted in the build directory @@ -29,9 +19,7 @@ steps: --rm \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ - --volume /mnist:/mnist \ - --volume /bert_data:/bert_data \ - --volume /hf_models_cache:/hf_models_cache \ + --volume $(Agent.TempDirectory)/mnist:/mnist \ ${{ parameters.DockerImageTag }} \ bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \ displayName: 'Run orttraining_ortmodule_tests.py' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml index f6b36733ebdd8..0890883bbe1e2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml @@ -153,6 +153,8 @@ stages: clean: true submodules: none + - template: jobs/download_training_test_data.yml + - template: set-python-manylinux-variables-step.yml - template: flex-downloadPipelineArtifact.yml @@ -188,21 +190,6 @@ stages: --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 Repository: $(Repository) - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - workingDirectory: $(Build.SourcesDirectory) - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" - displayName: 'Mount bert-data' - condition: succeededOrFailed() - workingDirectory: $(Build.SourcesDirectory) - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" - displayName: 'Mount hf-models-cache' - condition: succeededOrFailed() - workingDirectory: $(Build.SourcesDirectory) - - task: CmdLine@2 displayName: 'test ortmodule' inputs: @@ -215,9 +202,7 @@ stages: --gpus all \ -e NVIDIA_VISIBLE_DEVICES=all \ --volume $(Build.ArtifactStagingDirectory):/build \ - --volume /mnist:/mnist \ - --volume /bert_data:/bert_data \ - --volume /hf_models_cache:/hf_models_cache \ + --volume $(Agent.TempDirectory)/MNIST:/mnist \ $(Repository) \ bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ; workingDirectory: 
$(Build.SourcesDirectory)