Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/megatron-timers
NVIDIA · Jan 16, 2025 · 8c81e07 · 8c81e07
2 parents 8b49c07 + fe2ae82
commit 8c81e07
Show file tree

Hide file tree

Showing 76 changed files with 1,218 additions and 378 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -47,7 +47,7 @@ jobs:
     steps:
         - name: Docker system cleanup
           run: |
-            docker system prune -a --filter "until=48h" --force || true
+            docker system prune -a --filter "until=24h" --force || true
 
         - name: Docker pull image
           run: |

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -128,14 +128,15 @@ jobs:
        SCRIPT: |
          NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads
 
-  L0_Unit_Tests_GPU_LLM:
+  OPTIONAL_L0_Unit_Tests_GPU_LLM:
      needs: [cicd-test-container-setup]
      uses: ./.github/workflows/_test_template.yml
-     if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
+     if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
      with:
        RUNNER: self-hosted-azure
        SCRIPT: |
          NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads
+       IS_OPTIONAL: true
 
   L0_Unit_Tests_GPU_Multimodal:
      needs: [cicd-test-container-setup]
@@ -2937,7 +2938,7 @@ jobs:
     with:
       RUNNER: self-hosted-azure-gpus-2-h100
       SCRIPT: |
-        CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
+        CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -2965,6 +2966,7 @@ jobs:
         +model.tp_comm_overlap_ag=False \
         +model.tp_comm_overlap_rs=False \
         +model.tp_comm_overlap_disable_qkv=True \
+        +model.attention_backend="unfused" \
         model.peft.peft_scheme="lora" \
         model.peft.lora_tuning.adapter_dim=16 \
         model.peft.lora_tuning.alpha=32 \
@@ -4329,11 +4331,24 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python tests/collections/vlm/neva_train.py \
+        python tests/collections/vlm/test_neva_train.py \
         --devices=1 \
         --max-steps=5 \
         --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }}
 
+  L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python tests/collections/vlm/test_neva_train.py \
+        --devices=1 \
+        --max-steps=5 \
+        --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \
+        --use_packed_sequence
+
   L2_NeMo_2_MLLAMA_MOCK_TRAINING:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
@@ -4342,7 +4357,7 @@ jobs:
       RUNNER: self-hosted-azure
       SCRIPT: |
         TRANSFORMERS_OFFLINE=1 \
-        python tests/collections/vlm/mllama_train.py \
+        python tests/collections/vlm/test_mllama_train.py \
         --devices=1 \
         --max-steps=5 \
         --experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }}
@@ -4354,7 +4369,7 @@ jobs:
       with:
         RUNNER: self-hosted-azure
         SCRIPT: |
-          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
+          python3 tests/collections/llm/megatron_mixtral_pretraining.py \
           --experiment-dir=/tmp/mixtral_pretrain_results \
           --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
 
@@ -4915,6 +4930,36 @@ jobs:
       AFTER_SCRIPT: |
         rm -rf /tmp/nemo2_llava_next_results
 
+  L2_NeMo_2_VLLM_EXPORT:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python tests/setup/models/create_hf_model.py \
+          --model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
+          --output_dir /tmp/llama_head64 \
+          --config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"numx_hidden_layers\": 2, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\" }"
+
+        python tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt
+
+        /opt/venv/bin/python tests/export/nemo_export.py \
+          --min_tps 1 \
+          --max_tps 1 \
+          --use_vllm True \
+          --model_type llama \
+          --max_output_len 128 \
+          --test_deployment True \
+          --model_name nemo2_ckpt \
+          --model_dir /tmp/vllm_from_nemo2 \
+          --checkpoint_dir /tmp/nemo2_ckpt
+
+      AFTER_SCRIPT: |
+        rm -rf /tmp/llama_head64
+        rm -rf /tmp/nemo2_ckpt
+        rm -rf /tmp/vllm_from_nemo2
+
   Nemo_CICD_Test:
     needs:
       - pre-flight
@@ -4923,7 +4968,7 @@ jobs:
       - L0_Unit_Tests_GPU_ASR
       - L0_Unit_Tests_GPU_Audio
       - L0_Unit_Tests_GPU_Common
-      - L0_Unit_Tests_GPU_LLM
+      #- OPTIONAL_L0_Unit_Tests_GPU_LLM
       - L0_Unit_Tests_GPU_Multimodal
       - L0_Unit_Tests_GPU_NLP
       - L0_Unit_Tests_GPU_TTS
@@ -5030,6 +5075,7 @@ jobs:
       - Speech_Checkpoints_tests
       - L2_Stable_Diffusion_Training
       - L2_NeMo_2_NEVA_MOCK_TRAINING
+      - L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING
       - L2_NeMo_2_MLLAMA_MOCK_TRAINING
       - L2_NeMo_2_GPT_Pretraining_no_transformer_engine
       - L2_NeMo_2_GPT_DDP_Param_Parity_check
@@ -5102,6 +5148,7 @@ jobs:
       - L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
       - L2_HF_Transformer_SFT_FSDP2_2gpu
       - L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
+      - L2_NeMo_2_VLLM_EXPORT
     if: always()
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml
@@ -1,73 +1,52 @@
 name: CI-Import-Check
 
 on:
-  push:
   pull_request:
     paths:
       - "**"
 
 # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
 jobs:
-
-  test-asr-imports:
-    runs-on: ubuntu-latest
-    container:
-      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+  test-imports:
+    name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        collection: 
+          - asr
+          # - nlp # Currently broken
+          - tts
+        python: ['3.10', '3.11', '3.12']
     steps:
     - name: Checkout repo
       uses: actions/checkout@v2
-    - name: Update base dependencies
-      run: |
-        apt-get update && apt-get install -y build-essential
-        apt-get install -y libsndfile1 make
-    - name: Install nemo dependencies
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '${{ matrix.python }}' 
+    - name: Build wheel
       id: nemo-wheel
       run:  |
-        pip install Cython
-        # install test requirements
-        pip install -r requirements/requirements_test.txt
         # Build nemo as a wheel
         pip install build
-        python -m build --no-isolation --wheel
+        python -m build --wheel
+        
         # Preserve wheel location
         DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-        echo "::set-output name=DIST_FILE::${DIST_FILE}"
-    - name: Test ASR Domain Imports
-      run: |
-        # Install NeMo Domain
-        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
-        # Run import checks
-        python tests/core_ptl/check_imports.py --domain "asr"
-        # Uninstall NeMo
-        pip uninstall -y nemo_toolkit
-  test-tts-imports:
-    runs-on: ubuntu-latest
-    container:
-      image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
-    steps:
-    - name: Checkout repo
-      uses: actions/checkout@v2
-    - name: Update base dependencies
+        echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
+    
+    - name: Install NeMo + test dependencies
       run: |
-        apt-get update && apt-get install -y build-essential
-        apt-get install -y libsndfile1 make
-    - name: Install nemo dependencies
-      id: nemo-wheel
-      run:  |
-        pip install Cython
         # install test requirements
         pip install -r requirements/requirements_test.txt
-        # Build nemo as a wheel
-        pip install build
-        python -m build --no-isolation --wheel
-        # Preserve wheel location
-        DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
-        echo "::set-output name=DIST_FILE::${DIST_FILE}"
-    - name: Test TTS Domain Imports
-      run: |
+        
         # Install NeMo Domain
-        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
+        pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
+    
+    - name: Run ${{ matrix.collection }} checks
+      run: |
         # Run import checks
-        python tests/core_ptl/check_imports.py --domain "tts"
-        # Uninstall NeMo
-        pip uninstall -y nemo_toolkit
+        python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"
+  
+
diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -34,17 +34,12 @@ EOF
 WORKDIR /workspace
 
 # Install Mamba Dependancy
-ARG CAUSAL_CONV_TAG=v1.2.2.post1
+ARG CAUSAL_CONV_TAG=v1.2.2.post1 
+ARG MAMBA_TAG=v2.2.0
 
 RUN <<"EOF" bash -ex
 # Mamba dependancy installation
-
-git clone --depth 1 --branch ${CAUSAL_CONV_TAG} https://github.com/Dao-AILab/causal-conv1d && \
-  cd causal-conv1d && \
-  python setup.py install && \
-  cd .. && \
-  rm -rf causal-conv1d
-
+MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG}
 EOF
 
 RUN pip install hatchling   # needed to install nemo-run
@@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.21.0
-ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
-
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
   --mount=type=bind,source=requirements,target=requirements \
@@ -65,23 +58,22 @@ RUN \
   --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
 pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
 "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
-"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
 "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
 "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
 "unstructured==0.14.9" \
 "llama-index==0.10.43" \
 "onnxscript @ git+https://github.com/microsoft/onnxscript" \
 -r tools/ctc_segmentation/requirements.txt \
 ".[all]"
+EOF
 
-# Megatron Core installation
-git clone https://github.com/NVIDIA/Megatron-LM.git && \
-pushd Megatron-LM && \
-git checkout ${MCORE_TAG} && \
-  pushd megatron/core/datasets && \
-  make && \
-  popd && \
-popd
+ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031
+RUN <<"EOF" bash -ex
+# Megatron-LM installation
+git clone https://github.com/NVIDIA/Megatron-LM.git
+pushd Megatron-LM
+git checkout ${MCORE_TAG} 
+pip install -e .
 export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
 
 # Install nvidia-resiliency-ext
@@ -98,4 +90,11 @@ pip install --no-cache-dir --no-build-isolation ".[all]"
 chmod 777 -R /workspace
 EOF
 
+# Install vLLM in virtualenv
+RUN pip install --no-cache-dir --no-build-isolation virtualenv && \
+  virtualenv /opt/venv && \
+  /opt/venv/bin/pip install --no-cache-dir --no-build-isolation \
+      -r /workspace/requirements/requirements_vllm.txt \
+      -r /workspace/requirements/requirements_infer.txt
+
 ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst
@@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
     VALIDATION_DATASET_PATH= # Path to validation dataset 
     SAVE_DIR= # where the checkpoint and logs are saved
     mkdir -p $SAVE_DIR
-    export NVTE_FLASH_ATTN=0
     export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-    export NVTE_FUSED_ATTN=0
     
     python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
     --config-path=${CONFIG_PATH} \
@@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
     model.post_process=False \
     model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size
     model.micro_batch_size=8 \
+    model.attention_backend="unfused" \
     model.optim.lr=0.000005 \
     model.optim.sched.min_lr=0.00000001 \
     model.optim.sched.warmup_steps=100 \