Skip to content

Commit

Permalink
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/megatron-timers
Browse files Browse the repository at this point in the history
  • Loading branch information
ashors1 committed Jan 16, 2025
2 parents 8b49c07 + fe2ae82 commit 8c81e07
Show file tree
Hide file tree
Showing 76 changed files with 1,218 additions and 378 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
steps:
- name: Docker system cleanup
run: |
docker system prune -a --filter "until=48h" --force || true
docker system prune -a --filter "until=24h" --force || true
- name: Docker pull image
run: |
Expand Down
61 changes: 54 additions & 7 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,15 @@ jobs:
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads
L0_Unit_Tests_GPU_LLM:
OPTIONAL_L0_Unit_Tests_GPU_LLM:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

L0_Unit_Tests_GPU_Multimodal:
needs: [cicd-test-container-setup]
Expand Down Expand Up @@ -2937,7 +2938,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
Expand Down Expand Up @@ -2965,6 +2966,7 @@ jobs:
+model.tp_comm_overlap_ag=False \
+model.tp_comm_overlap_rs=False \
+model.tp_comm_overlap_disable_qkv=True \
+model.attention_backend="unfused" \
model.peft.peft_scheme="lora" \
model.peft.lora_tuning.adapter_dim=16 \
model.peft.lora_tuning.alpha=32 \
Expand Down Expand Up @@ -4329,11 +4331,24 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/vlm/neva_train.py \
python tests/collections/vlm/test_neva_train.py \
--devices=1 \
--max-steps=5 \
--experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }}
# Runs tests/collections/vlm/test_neva_train.py for 5 steps on 1 device with --use_packed_sequence.
# NOTE(review): --experiment-dir is the same path the preceding NeVA training job uses
# (/tmp/nemo2_neva_results/<run_id>); confirm the two jobs cannot collide on a shared runner.
L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
# Run when this job is named in the setup job's test_to_run output, or when its `all` output is 'true'.
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/vlm/test_neva_train.py \
--devices=1 \
--max-steps=5 \
--experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \
--use_packed_sequence
L2_NeMo_2_MLLAMA_MOCK_TRAINING:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
Expand All @@ -4342,7 +4357,7 @@ jobs:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 \
python tests/collections/vlm/mllama_train.py \
python tests/collections/vlm/test_mllama_train.py \
--devices=1 \
--max-steps=5 \
--experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }}
Expand All @@ -4354,7 +4369,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
python3 tests/collections/llm/megatron_mixtral_pretraining.py \
--experiment-dir=/tmp/mixtral_pretrain_results \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
Expand Down Expand Up @@ -4915,6 +4930,36 @@ jobs:
AFTER_SCRIPT: |
rm -rf /tmp/nemo2_llava_next_results
# Three-step export check: (1) create_hf_model.py writes a reduced-size HF model to /tmp/llama_head64,
# (2) test_hf_import.py turns it into /tmp/nemo2_ckpt, (3) nemo_export.py is run with --use_vllm True
# using the /opt/venv interpreter rather than the default python. AFTER_SCRIPT removes all three temp dirs.
# NOTE(review): --config_updates sets both "numx_hidden_layers" and "num_hidden_layers" to 2; the former
# looks like a typo of the latter — confirm and drop it if create_hf_model.py does not need it.
L2_NeMo_2_VLLM_EXPORT:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
# Run when this job is named in the setup job's test_to_run output, or when its `all` output is 'true'.
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/setup/models/create_hf_model.py \
--model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
--output_dir /tmp/llama_head64 \
--config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"numx_hidden_layers\": 2, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\" }"
python tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt
/opt/venv/bin/python tests/export/nemo_export.py \
--min_tps 1 \
--max_tps 1 \
--use_vllm True \
--model_type llama \
--max_output_len 128 \
--test_deployment True \
--model_name nemo2_ckpt \
--model_dir /tmp/vllm_from_nemo2 \
--checkpoint_dir /tmp/nemo2_ckpt
AFTER_SCRIPT: |
rm -rf /tmp/llama_head64
rm -rf /tmp/nemo2_ckpt
rm -rf /tmp/vllm_from_nemo2
Nemo_CICD_Test:
needs:
- pre-flight
Expand All @@ -4923,7 +4968,7 @@ jobs:
- L0_Unit_Tests_GPU_ASR
- L0_Unit_Tests_GPU_Audio
- L0_Unit_Tests_GPU_Common
- L0_Unit_Tests_GPU_LLM
#- OPTIONAL_L0_Unit_Tests_GPU_LLM
- L0_Unit_Tests_GPU_Multimodal
- L0_Unit_Tests_GPU_NLP
- L0_Unit_Tests_GPU_TTS
Expand Down Expand Up @@ -5030,6 +5075,7 @@ jobs:
- Speech_Checkpoints_tests
- L2_Stable_Diffusion_Training
- L2_NeMo_2_NEVA_MOCK_TRAINING
- L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING
- L2_NeMo_2_MLLAMA_MOCK_TRAINING
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
Expand Down Expand Up @@ -5102,6 +5148,7 @@ jobs:
- L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
- L2_HF_Transformer_SFT_FSDP2_2gpu
- L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
- L2_NeMo_2_VLLM_EXPORT
if: always()
runs-on: ubuntu-latest
steps:
Expand Down
79 changes: 29 additions & 50 deletions .github/workflows/import-test.yml
Original file line number Diff line number Diff line change
@@ -1,73 +1,52 @@
name: CI-Import-Check

on:
push:
pull_request:
paths:
- "**"

# Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
jobs:

test-asr-imports:
runs-on: ubuntu-latest
container:
image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
test-imports:
name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
collection:
- asr
# - nlp # Currently broken
- tts
python: ['3.10', '3.11', '3.12']
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Update base dependencies
run: |
apt-get update && apt-get install -y build-essential
apt-get install -y libsndfile1 make
- name: Install nemo dependencies
- uses: actions/setup-python@v5
with:
python-version: '${{ matrix.python }}'
- name: Build wheel
id: nemo-wheel
run: |
pip install Cython
# install test requirements
pip install -r requirements/requirements_test.txt
# Build nemo as a wheel
pip install build
python -m build --no-isolation --wheel
python -m build --wheel
# Preserve wheel location
DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
echo "::set-output name=DIST_FILE::${DIST_FILE}"
- name: Test ASR Domain Imports
run: |
# Install NeMo Domain
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
# Run import checks
python tests/core_ptl/check_imports.py --domain "asr"
# Uninstall NeMo
pip uninstall -y nemo_toolkit
test-tts-imports:
runs-on: ubuntu-latest
container:
image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Update base dependencies
echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
- name: Install NeMo + test dependencies
run: |
apt-get update && apt-get install -y build-essential
apt-get install -y libsndfile1 make
- name: Install nemo dependencies
id: nemo-wheel
run: |
pip install Cython
# install test requirements
pip install -r requirements/requirements_test.txt
# Build nemo as a wheel
pip install build
python -m build --no-isolation --wheel
# Preserve wheel location
DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
echo "::set-output name=DIST_FILE::${DIST_FILE}"
- name: Test TTS Domain Imports
run: |
# Install NeMo Domain
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
- name: Run ${{ matrix.collection }} checks
run: |
# Run import checks
python tests/core_ptl/check_imports.py --domain "tts"
# Uninstall NeMo
pip uninstall -y nemo_toolkit
python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"

37 changes: 18 additions & 19 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,12 @@ EOF
WORKDIR /workspace

# Install Mamba Dependency
ARG CAUSAL_CONV_TAG=v1.2.2.post1
ARG CAUSAL_CONV_TAG=v1.2.2.post1
ARG MAMBA_TAG=v2.2.0

RUN <<"EOF" bash -ex
# Mamba dependency installation

git clone --depth 1 --branch ${CAUSAL_CONV_TAG} https://github.com/Dao-AILab/causal-conv1d && \
cd causal-conv1d && \
python setup.py install && \
cd .. && \
rm -rf causal-conv1d

MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG}
EOF

RUN pip install hatchling # needed to install nemo-run
Expand All @@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.21.0
ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
Expand All @@ -65,23 +58,22 @@ RUN \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
"unstructured==0.14.9" \
"llama-index==0.10.43" \
"onnxscript @ git+https://github.com/microsoft/onnxscript" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"
EOF

# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
git checkout ${MCORE_TAG} && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031
RUN <<"EOF" bash -ex
# Megatron-LM installation
git clone https://github.com/NVIDIA/Megatron-LM.git
pushd Megatron-LM
git checkout ${MCORE_TAG}
pip install -e .
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Install nvidia-resiliency-ext
Expand All @@ -98,4 +90,11 @@ pip install --no-cache-dir --no-build-isolation ".[all]"
chmod 777 -R /workspace
EOF

# Install vLLM in virtualenv
RUN pip install --no-cache-dir --no-build-isolation virtualenv && \
virtualenv /opt/venv && \
/opt/venv/bin/pip install --no-cache-dir --no-build-isolation \
-r /workspace/requirements/requirements_vllm.txt \
-r /workspace/requirements/requirements_infer.txt

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
3 changes: 1 addition & 2 deletions docs/source/nlp/information_retrieval.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
VALIDATION_DATASET_PATH= # Path to validation dataset
SAVE_DIR= # where the checkpoint and logs are saved
mkdir -p $SAVE_DIR
export NVTE_FLASH_ATTN=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
export NVTE_FUSED_ATTN=0
python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
--config-path=${CONFIG_PATH} \
Expand All @@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
model.post_process=False \
model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size
model.micro_batch_size=8 \
model.attention_backend="unfused" \
model.optim.lr=0.000005 \
model.optim.sched.min_lr=0.00000001 \
model.optim.sched.warmup_steps=100 \
Expand Down
Loading

0 comments on commit 8c81e07

Please sign in to comment.