Merge branch 'refs/heads/main' into chcui/nemotron5_support
cuichenx committed Jan 7, 2025
2 parents ecbdc5c + 79363ac commit 0e7f92b
Showing 158 changed files with 8,920 additions and 1,300 deletions.
64 changes: 45 additions & 19 deletions .github/workflows/changelog-build.yml
@@ -2,26 +2,25 @@ name: 'Changelog Build (Release)'

on:
workflow_dispatch:
push:
tags:
- '*'
inputs:
last-release-tag:
description: Last Git tag to start from (exclusive) (e.g. `v2.0.0`)
type: string
required: true
release-branch:
description: Release branch to build changelog on (e.g. `r2.1.0`)
type: string
required: true

jobs:
changelog:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Checkout branch
uses: actions/checkout@v4
with:
fetch-depth: 0 # Required due to the way Git works, without it this action won't be able to find any or the correct tags

- name: Get Previous tag
id: previous_tag
# git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date
# awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag
# set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name
run: |
echo "::set-output name=tag_name::$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}')"
echo ${{ steps.previous_tag.outputs.tag_name }}
ref: ko3n1g/ci/fix-changelog-generator
fetch-depth: 0

- name: Build Changelog
id: github_tag
@@ -38,10 +37,37 @@ jobs:
repo: "NeMo"
ignorePreReleases: "false"
failOnError: "false"
fromTag: ${{ steps.previous_tag.outputs.tag_name }}
toTag: ${{ github.ref_name || github.sha }}
fromTag: ${{ inputs.last-release-tag }}
toTag: ${{ inputs.release-branch }}

- name: Print Changelog
- name: Update changelog file
env:
RELEASE_BRANCH: ${{ inputs.release-branch }}
CHANGELOG: ${{ steps.github_tag.outputs.changelog }}
shell: bash -x -e -u -o pipefail {0}
run: |
echo "${{steps.github_tag.outputs.changelog}}"
echo "--- DONE ---"
RELEASE_VERSION=${RELEASE_BRANCH#r}
CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/')
RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION
### Detailed Changelogs:
$CHANGELOG"
printf "%s\n" "$RELEASE_NOTES" | sed '/<!-- Next changelog -->/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md
mv CHANGELOG.tmp.md CHANGELOG.md
- name: Inspect new changelog file
run: cat CHANGELOG.md

- name: Create Pull Request
uses: peter-evans/create-pull-request@v7
with:
commit-message: "beep boop: Update changelog"
title: "Update changelog for `${{ inputs.release-branch }}`"
signoff: true
sign-commits: true
base: main
branch: bot/chore/update-changelog-into-${{ inputs.release-branch }}
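
Note on the splice above: the new "Update changelog file" step demotes the generated headings by two levels (`sed '/^[[:blank:]]*#/s/#/###/'`) and then uses sed's `r /dev/stdin` command to insert the release notes directly below the `<!-- Next changelog -->` marker in CHANGELOG.md. A minimal standalone sketch of that insertion mechanism, with toy file contents (everything except the marker and the sed invocation is illustrative):

```bash
#!/usr/bin/env bash
# Sketch: insert text below a marker line, as the workflow step does.
# The CHANGELOG.md contents here are toy data for illustration.
set -euo pipefail

printf '%s\n' '# Changelog' '<!-- Next changelog -->' '## Older release' > CHANGELOG.md

RELEASE_NOTES='## NVIDIA Neural Modules 2.1.0
### Detailed Changelogs:
- example entry'

# sed's `r /dev/stdin` reads the piped notes and emits them
# after every line matching the marker pattern.
printf '%s\n' "$RELEASE_NOTES" \
  | sed '/<!-- Next changelog -->/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md
mv CHANGELOG.tmp.md CHANGELOG.md

cat CHANGELOG.md
# # Changelog
# <!-- Next changelog -->
# ## NVIDIA Neural Modules 2.1.0
# ### Detailed Changelogs:
# - example entry
# ## Older release
```

Because the marker line itself is preserved, the workflow can be re-run for each release and the newest notes always land at the top.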
120 changes: 116 additions & 4 deletions .github/workflows/cicd-main.yml
@@ -571,9 +571,24 @@ jobs:
prune.num_attention_heads=2 \
prune.num_query_groups=2 \
prune.hidden_size=128 \
export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/ci_prune_width.nemo
export.save_path=/tmp/ci_prune_width.nemo
L2_Prune_Depth_Llama2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Prune_Depth_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_prune.py \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
'prune.drop_layers=[1]' \
export.save_path=/tmp/ci_prune_depth.nemo
# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
@@ -3611,6 +3626,26 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_VLM_HF_Transformer_PEFT_FSDP:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_FSDP') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --strategy fsdp --devices 2
L2_VLM_HF_Transformer_PEFT_4bit:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --use-4bit
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PEFT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3666,6 +3701,17 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_2gpu:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_2gpu_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3677,6 +3723,39 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_2gpu_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3698,7 +3777,7 @@ jobs:
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3710,6 +3789,17 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
# L2: Megatron Mock Data Generation
L2_Megatron_Mock_Data_Generation_MockGPTDataset:
needs: [cicd-test-container-setup]
@@ -4705,6 +4795,18 @@ jobs:
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
L2_NEMO_2_LoRA_Export:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NEMO_2_LoRA_Export') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
python tests/collections/llm/peft/lora_export.py \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
L2_NEMO_2_LoRA_Inference:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4891,8 +4993,16 @@ jobs:
- L2_HF_Transformer_SFT_nemorun
- L2_HF_Transformer_SFT_2gpu
- L2_VLM_HF_Transformer_PEFT
- L2_VLM_HF_Transformer_PEFT_FSDP
- L2_VLM_HF_Transformer_PEFT_4bit
- L2_HF_Transformer_SFT_2gpu_nemorun
- L2_HF_Transformer_SFT_TE_Acceleration
- L2_HF_Transformer_PT
- L2_HF_Transformer_PT_nemorun
- L2_HF_Transformer_PT_2gpu
- L2_HF_Transformer_PT_2gpu_nemorun
- L2_HF_Transformer_PT_TE_Acceleration
- L2_VLM_HF_Transformer_PEFT
- L2_NeMo_2_SSM_Pretraining
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
@@ -4917,12 +5027,14 @@ jobs:
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
- L2_NEMO_2_LoRA_MERGE
- L2_NEMO_2_LoRA_Export
- L2_NEMO_2_LoRA_Inference
- L2_NeMo_2_Mixtral_Pretraining
- L2_PTQ_Llama2_FP8
- L2_Community_LLM_Checkpoints_tests_Llama3
- L2_Distill_Llama2
- L2_Prune_Width_Llama2
- L2_Prune_Depth_Llama2
- L2_Speech_to_Text_AED
- L2_Speech_Estimate_Duration_Bins
- L2_Speech_Batch_Size_OOMptimizer
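
Every job added in this file follows the same pattern: it is gated on `contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), '<job id>') || ...all == 'true'`, meaning the setup job emits a JSON array of selected test names and each job runs only if it is listed (or if everything is selected), and the job id must also be appended to the aggregate list at the bottom of the file, as the hunks above do. A rough local approximation of that gate, sketched with jq (the array contents are hypothetical):

```bash
#!/usr/bin/env bash
# Sketch: local equivalent of the per-job gate in cicd-main.yml.
# TEST_TO_RUN mirrors the JSON array the setup job would output;
# the values here are hypothetical.
TEST_TO_RUN='["L2_Prune_Depth_Llama2","L2_HF_Transformer_PT_2gpu"]'
ALL='false'

JOB='L2_Prune_Depth_Llama2'
# jq's index() returns the element's position in the array, or null
# if absent; -e makes the exit status reflect the boolean result.
if [ "$ALL" = 'true' ] || echo "$TEST_TO_RUN" | jq -e --arg j "$JOB" 'index($j) != null' >/dev/null; then
  echo "$JOB would run"
fi
```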
7 changes: 6 additions & 1 deletion .github/workflows/release.yml
@@ -20,6 +20,10 @@ on:
description: Ref (SHA or branch name) to release
required: true
type: string
version-bump-branch:
description: Branch for version bump
required: true
type: string
dry-run:
description: Do not publish a wheel and GitHub release.
required: true
@@ -28,7 +32,7 @@

jobs:
release:
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.3
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.3
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -41,6 +45,7 @@
container-workdir: /workspace
library-name: Neural Modules
dry-run: ${{ inputs.dry-run }}
version-bump-branch: ${{ inputs.version-bump-branch }}
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
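
With both workflows now driven by `workflow_dispatch` inputs rather than tag pushes, a release engineer triggers them by hand. A hedged sketch using the GitHub CLI (`gh workflow run` with `-f` input fields is standard; the tag and branch values below are illustrative placeholders, and the `v2.0.0`/`r2.1.0` examples follow the input descriptions in changelog-build.yml):

```bash
# Sketch: dispatching the updated workflows with their new inputs.
# Tag and branch names are illustrative placeholders.

# Build the changelog for a release branch and open the bot PR:
gh workflow run changelog-build.yml \
  -f last-release-tag=v2.0.0 \
  -f release-branch=r2.1.0

# Cut a release, supplying the new version-bump-branch input:
gh workflow run release.yml \
  -f release-ref=r2.1.0 \
  -f version-bump-branch=bot/chore/version-bump-2.1.0 \
  -f dry-run=true
```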