diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/asset.yaml b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/asset.yaml
deleted file mode 100644
index fc712ecd8b..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/asset.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-name: acpt-pytorch-2.2-cuda12.1-profiler
-version: auto
-type: environment
-spec: spec.yaml
-extra_config: environment.yaml
-test:
-  pytest:
-    enabled: true
-    pip_requirements: tests/requirements.txt
-    tests_dir: tests
-categories: ["PyTorch", "Training"]
\ No newline at end of file
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/Dockerfile b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/Dockerfile
deleted file mode 100644
index 7cdf5fc9b6..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/Dockerfile
+++ /dev/null
@@ -1,55 +0,0 @@
-FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu121-py310-torch22x:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}
-
-# Install pip dependencies
-COPY requirements.txt .
-RUN pip install -r requirements.txt --no-cache-dir
-
-# Inference requirements
-COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
-RUN /var/requirements/install_system_requirements.sh && \
-    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
-    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
-    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
-    rm -f /etc/nginx/sites-enabled/default
-ENV SVDIR=/var/runit
-ENV WORKER_TIMEOUT=400
-EXPOSE 5001 8883 8888
-
-# support Deepspeed launcher requirement of passwordless ssh login
-RUN apt-get update
-RUN apt-get install -y openssh-server openssh-client
-
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt install -y curl tmux wget systemd
-
-# install dcgm
-RUN distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') && wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.0-1_all.deb
-RUN dpkg -i cuda-keyring_1.0-1_all.deb
-RUN apt-get update
-RUN apt-get install -y datacenter-gpu-manager
-
-
-# install dynolog
-RUN wget https://github.com/facebookincubator/dynolog/releases/download/v0.2.2/dynolog_0.2.2-0-amd64.deb
-RUN dpkg -i dynolog_0.2.2-0-amd64.deb
-RUN echo "--enable_ipc_monitor" | tee -a /etc/dynolog.gflags
-RUN echo "--enable_gpu_monitor" | tee -a /etc/dynolog.gflags
-RUN echo "--dcgm_lib_path=/usr/lib/x86_64-linux-gnu/libdcgm.so" | tee -a /etc/dynolog.gflags
-RUN touch /var/log/dynolog.log
-RUN echo "--use_JSON" | tee -a /etc/dynolog.gflags
-
-# run profiler
-#dyno gputrace --log-file kineto-trace.json
-
-# install HTA
-RUN pip install HolisticTraceAnalysis
-WORKDIR /workspace
-ENV KINETO_USE_DAEMON=1
-
-CMD ["sudo", "systemctl", "--now enable nvidia-dcgm"]
-
-CMD ["sudo", "nv-hostengine", "-n --service-account nvidia-dcgm"]
-
-# Run dynolog with sudo
-CMD ["sudo", "dynolog", "--flagfile=/etc/dynolog.gflags"]
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/requirements.txt b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/requirements.txt
deleted file mode 100644
index 6484457661..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/context/requirements.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-azureml-core=={{latest-pypi-version}}
-azureml-dataset-runtime=={{latest-pypi-version}}
-azureml-defaults=={{latest-pypi-version}}
-azure-ml=={{latest-pypi-version}}
-azure-ml-component=={{latest-pypi-version}}
-azureml-mlflow=={{latest-pypi-version}}
-azureml-contrib-services=={{latest-pypi-version}}
-azureml-contrib-services=={{latest-pypi-version}}
-azureml-automl-common-tools=={{latest-pypi-version}}
-torch-tb-profiler~=0.4.0
-azureml-inference-server-http
-inference-schema
-MarkupSafe==2.1.2
-regex
-pybind11
-urllib3>=1.26.18
-cryptography>=42.0.4
-aiohttp>=3.8.5
\ No newline at end of file
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/environment.yaml b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/environment.yaml
deleted file mode 100644
index 5d18418eb9..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/environment.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-image:
-  name: azureml/curated/acpt-pytorch-2.2-cuda12.1-profiler
-  os: linux
-  context:
-    dir: context
-    dockerfile: Dockerfile
-    template_files:
-    - Dockerfile
-    - requirements.txt
-  publish:
-    location: mcr
-    visibility: public
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/spec.yaml b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/spec.yaml
deleted file mode 100644
index 9df6c14ba2..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/spec.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
-
-description: >-
-  Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA\RocM, combined with optimizers like ORT Training,+DeepSpeed+MSCCL+ORT MoE and more. The image introduces newly released PyTorch 2.1 for early testing, and preview of new fastcheckpointing capability called Nebula.
-  Azure Container Registry:acptdev.azurecr.io/test/public/aifx/acpt/stable-ubuntu2004-cu121-py310-torch212
-
-name: "{{asset.name}}"
-version: "{{asset.version}}"
-
-build:
-  path: "{{image.context.path}}"
-  dockerfile_path: "{{image.dockerfile.path}}"
-
-os_type: linux
-
-tags:
-  PyTorch: "2.2"
-  GPU: Cuda12
-  OS: Ubuntu20.04
-  Training: ""
-  Preview: ""
-  Python: "3.10"
-  DeepSpeed: "0.13.1"
-  ONNXRuntime: "1.17.1"
-  torch_ORT: "1.17.0"
-  Checkpointing:Nebula: "0.16.10"
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/pytorch2_2_sample_test.py b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/pytorch2_2_sample_test.py
deleted file mode 100644
index 369f8feda3..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/pytorch2_2_sample_test.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-"""Tests running a sample job in the pytorch 2.0 environment."""
-import os
-import time
-from pathlib import Path
-from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
-from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
-from azure.identity import AzureCliCredential
-import subprocess
-
-BUILD_CONTEXT = Path("../context")
-JOB_SOURCE_CODE = "../../acpt-tests/src"
-TIMEOUT_MINUTES = os.environ.get("timeout_minutes", 60)
-STD_LOG = Path("artifacts/user_logs/std_log.txt")
-
-
-def test_pytorch_2_2():
-    """Tests a sample job using pytorch 2.0 as the environment."""
-    this_dir = Path(__file__).parent
-
-    subscription_id = os.environ.get("subscription_id")
-    resource_group = os.environ.get("resource_group")
-    workspace_name = os.environ.get("workspace")
-
-    ml_client = MLClient(
-        AzureCliCredential(), subscription_id, resource_group, workspace_name
-    )
-
-    env_name = "acpt-pytorch-2_2-cuda12_1-profiler"
-
-    env_docker_context = Environment(
-        build=BuildContext(path=this_dir / BUILD_CONTEXT),
-        name=env_name,
-        description="Pytorch 2.2 environment created from a Docker context.",
-    )
-    ml_client.environments.create_or_update(env_docker_context)
-
-    # create the command
-    job = command(
-        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
-        command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
-                " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
-                " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
-                " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
-                " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
-                " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
-                " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
-                " --model_checkpoint \"bert-large-uncased\"",
-        outputs={
-            "output": Output(
-                type="uri_folder",
-                mode="rw_mount",
-                path="azureml://datastores/workspaceblobstore/paths/outputs"
-            )
-        },
-        environment=f"{env_name}@latest",
-        compute=os.environ.get("gpu_v100_cluster"),
-        display_name="bert-pretrain-GLUE",
-        description="Pretrain the BERT model on the GLUE dataset.",
-        experiment_name="pytorch22_Cuda121_py310_profiler_Experiment",
-        distribution=PyTorchDistribution(process_count_per_instance=1),
-        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
-    )
-
-    returned_job = ml_client.create_or_update(job)
-    assert returned_job is not None
-
-    # Poll until final status is reached or timed out
-    timeout = time.time() + (TIMEOUT_MINUTES * 60)
-    while time.time() <= timeout:
-        current_status = ml_client.jobs.get(returned_job.name).status
-        if current_status in ["Completed", "Failed"]:
-            break
-        time.sleep(30)  # sleep 30 seconds
-
-    bashCommand = "ls"
-    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
-    output, error = process.communicate()
-    print(output)
-    print(error)
-
-    if current_status == "Failed" or current_status == "Cancelled":
-        ml_client.jobs.download(returned_job.name)
-        if STD_LOG.exists():
-            print(f"*** BEGIN {STD_LOG} ***")
-            with open(STD_LOG, "r") as f:
-                print(f.read(), end="")
-            print(f"*** END {STD_LOG} ***")
-        else:
-            ml_client.jobs.stream(returned_job.name)
-
-    assert current_status == "Completed"
diff --git a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/requirements.txt b/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/requirements.txt
deleted file mode 100644
index 5c92b83ebd..0000000000
--- a/assets/training/general/environments/acpt-pytorch-2.2-cuda12.1-profiler/tests/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-azure-ai-ml==1.2.0
-azure.identity==1.10.0
\ No newline at end of file
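
The removed Dockerfile wires the image for on-demand Kineto tracing: it sets ENV KINETO_USE_DAEMON=1, configures dynolog with --enable_ipc_monitor and --enable_gpu_monitor, leaves a commented "dyno gputrace --log-file kineto-trace.json" hint, and installs torch-tb-profiler and HolisticTraceAnalysis to consume the resulting traces. As a rough, illustrative sketch only (the toy model, step count, and ./traces output directory below are assumptions for demonstration, not anything defined in this repository), a training script typically emits such a trace like this:

# Illustrative sketch: emit a Kineto/Chrome trace in the layout that
# torch-tb-profiler and HolisticTraceAnalysis (both installed by the deleted
# image) can consume. Model, inputs, and ./traces are placeholder assumptions.
import torch
from torch import nn
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

model = nn.Linear(512, 512)
inputs = torch.randn(8, 512)

# Profile CUDA activity only when a GPU is actually present.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(
    activities=activities,
    schedule=schedule(wait=1, warmup=1, active=3),
    on_trace_ready=tensorboard_trace_handler("./traces"),  # writes JSON traces under ./traces
    record_shapes=True,
) as prof:
    for _ in range(5):
        model(inputs).sum().backward()
        prof.step()  # advance the profiler schedule once per training step

# Quick look at the hottest ops from the recorded steps.
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

With the daemon path enabled (KINETO_USE_DAEMON=1 plus the dynolog service started by the image's CMD), the same kind of trace could instead be requested out-of-process, as the commented "dyno gputrace --log-file kineto-trace.json" line in the removed Dockerfile suggests.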