Commit
Merge branch 'main' into alit/griffin
Showing 13 changed files with 455 additions and 99 deletions.
Dockerfile:
@@ -18,21 +18,21 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG BIGNLP_BACKEND=pytorch
ARG BIGNLP_BACKEND_BRANCH_TAG=24.01
ARG LAUNCHER_BACKEND=pytorch
ARG LAUNCHER_BACKEND_BRANCH_TAG=24.03

FROM nvcr.io/nvidia/${BIGNLP_BACKEND}:${BIGNLP_BACKEND_BRANCH_TAG}-py3 as pytorch
FROM nvcr.io/nvidia/${LAUNCHER_BACKEND}:${LAUNCHER_BACKEND_BRANCH_TAG}-py3 as pytorch

##################################
#### Build training container ####
##################################
FROM pytorch as training

ENV NVIDIA_PRODUCT_NAME="NeMo Megatron"
ENV NVIDIA_PRODUCT_NAME="NeMo Framework"

ARG NVIDIA_BIGNLP_VERSION
ENV NVIDIA_BIGNLP_VERSION=$NVIDIA_BIGNLP_VERSION
LABEL com.nvidia.bignlp.version="${NVIDIA_BIGNLP_VERSION}"
ARG NVIDIA_LAUNCHER_VERSION
ENV NVIDIA_LAUNCHER_VERSION=$NVIDIA_LAUNCHER_VERSION
LABEL com.nvidia.launcher.version="${NVIDIA_LAUNCHER_VERSION}"

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \

@@ -43,17 +43,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libb64-dev && \
    rm -rf /var/lib/apt/lists/*

# Install bits from BigNLP sources here...
WORKDIR /opt
### Note: if you don't want to ship the source code,
### you can do this COPY and RUN building in a separate build stage using multistage docker,
### and just install the resulting binary here using COPY --from or RUN --mount=from=
### experimental syntax
#COPY bignlp-scripts/src dst
#RUN ...

# Get fastertransformer_backend
#RUN git clone https://github.com/triton-inference-server/fastertransformer_backend.git

# Install SentencePiece
RUN git clone https://github.com/google/sentencepiece.git && \

@@ -101,9 +95,6 @@ RUN git clone https://github.com/NVIDIA/NeMo-Aligner.git && \
    fi && \
    pip install --no-deps -e .

# HF cache
RUN python -c "from transformers import AutoTokenizer; tok_gpt=AutoTokenizer.from_pretrained('gpt2'); tok_bert=AutoTokenizer.from_pretrained('bert-base-cased'); tok_large_bert=AutoTokenizer.from_pretrained('bert-large-cased'); tok_large_uncased_bert=AutoTokenizer.from_pretrained('bert-large-uncased'); AutoTokenizer.from_pretrained('mistralai/Mixtral-8x7B-v0.1'); AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1');"

# Install TE
ARG TE_COMMIT
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \

@@ -123,7 +114,10 @@ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
    git fetch origin $MEGATRONCORE_COMMIT && \
    git checkout FETCH_HEAD; \
    fi && \
    pip install -e .
    pip install -e . && \
    cd megatron/core/datasets && \
    make && \
    pip install git+https://github.com/fanshiqing/[email protected]

# Install launch scripts
ARG LAUNCHER_COMMIT

@@ -139,55 +133,8 @@ RUN git clone https://github.com/NVIDIA/NeMo-Megatron-Launcher.git && \
ENV LAUNCHER_SCRIPTS_PATH=/opt/NeMo-Megatron-Launcher/launcher_scripts
ENV PYTHONPATH=/opt/NeMo-Megatron-Launcher/launcher_scripts:${PYTHONPATH}

# pyenv setup for pytriton
RUN apt-get update && apt-get install -y --no-install-recommends \
    libffi-dev \
    libreadline-dev \
    libsqlite3-dev && \
    rm -rf /var/lib/apt/lists/*
RUN curl https://pyenv.run | bash && \
    export PYENV_ROOT="$HOME/.pyenv" && \
    command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH" && \
    eval "$(pyenv init -)" && \
    pyenv install 3.8 && \
    pyenv global 3.8 && \
    pip3 install virtualenv && \
    mkdir -p ~/.cache/pytriton/ && \
    python -mvenv ~/.cache/pytriton/python_backend_interpreter --copies --clear && \
    source ~/.cache/pytriton/python_backend_interpreter/bin/activate && \
    pip3 install numpy~=1.21 pyzmq~=23.0 && \
    deactivate && \
    pyenv global system

# pip install required python packages
RUN pip install --no-cache-dir wandb==0.15.3 \
    black==20.8b1 \
    'click>=8.0.1' \
    'datasets>=1.2.1' \
    jsonlines==2.0.0 \
    lm_dataformat==0.0.19 \
    mock==4.0.3 \
    'numba>=0.57.1' \
    'numexpr>=2.7.2' \
    pybind11==2.8.0 \
    pycountry==20.7.3 \
    pytest==6.2.5 \
    sacrebleu==1.5.0 \
    'scikit-learn>=0.24.1' \
    spacy==3.1.3 \
    sqlitedict==1.6.0 \
    'transformers>=4.1' \
    tqdm-multiprocess==0.0.11 \
    zstandard==0.17.0 \
    tritonclient[all]~=2.33 \
    'nvidia-pytriton==0.4.1' \
    'nltk>=3.6.7' \
    'ipython>=7.31.1' \
    'torchmetrics==0.9.1'

RUN pip install pytorch_lightning==2.2.1
# Copy FasterTransformer
#COPY --from=ft_builder /workspace/FasterTransformer FasterTransformer
# HF cache
RUN python -c "from transformers import AutoTokenizer; tok_gpt=AutoTokenizer.from_pretrained('gpt2'); tok_bert=AutoTokenizer.from_pretrained('bert-base-cased'); tok_large_bert=AutoTokenizer.from_pretrained('bert-large-cased'); tok_large_uncased_bert=AutoTokenizer.from_pretrained('bert-large-uncased');"

# Setup SSH config to allow mpi-operator to communicate with containers in k8s
RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \

@@ -204,4 +151,4 @@ ARG NVIDIA_BUILD_ID
ENV NVIDIA_BUILD_ID ${NVIDIA_BUILD_ID:-<unknown>}
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
ARG NVIDIA_BUILD_REF
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
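The "HF cache" step pre-downloads tokenizers at image build time so that jobs can construct them later without network access: once the files are present, AutoTokenizer.from_pretrained resolves from the local Hugging Face cache. A minimal sketch of the same step as a standalone script, using only the transformers package and the four tokenizer names from the last HF cache line above (the script name is hypothetical; it is not a file in this commit):

# prefetch_tokenizers.py -- sketch of the Dockerfile's "HF cache" step; not part of this commit
from transformers import AutoTokenizer

# Tokenizer names taken from the RUN line above; downloading them here
# populates the local Hugging Face cache that gets baked into the image.
for name in ("gpt2", "bert-base-cased", "bert-large-cased", "bert-large-uncased"):
    AutoTokenizer.from_pretrained(name)

The one-line RUN python -c form in the Dockerfile does the same thing inline, which avoids shipping an extra helper script in the image.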
launcher_scripts/conf/fine_tuning/bert_embedding/sft.yaml (160 additions, 0 deletions):
@@ -0,0 +1,160 @@
run:
  name: sft_bert_embedding
  results_dir: ${base_results_dir}/${fine_tuning.run.name}
  time_limit: "00:30:00"
  dependency: "singleton"

trainer:
  devices: 8
  num_nodes: 1
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
  max_steps: 4 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 1
  val_check_interval: 2
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  benchmark: False

exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_bert
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}

restore_from_path: ???

model:
  # model parallelism
  mcore_bert: True
  micro_batch_size: 1
  global_batch_size: 8
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null

  # model architecture
  encoder_seq_length: 512
  max_position_embeddings: ${.encoder_seq_length}
  position_embedding_type: 'learned_absolute' # Position embedding type. Options: ['learned_absolute', 'rope', 'alibi', 'kerple', 'xpos', 'sandwich']; xpos and sandwich are experimental.
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size.
  num_attention_heads: 16
  transformer_block_type: post_ln
  add_pooler: True
  add_lm_head: False
  init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.1 # Dropout probability for the hidden-state transformer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null.
  apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number.
  normalization: layernorm
  layernorm_epsilon: 1e-12
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  bert_binary_head: True # BERT binary head
  megatron_legacy: False
  tokenizer:
    library: 'huggingface'
    type: 'intfloat/e5-large-unsupervised'
    model: null
    vocab_file: null
    merge_file: null

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the unreduced cross-entropy loss calculation for the LM head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: False

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with the Torch ONNX exporter.
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory).

  ## Activation Checkpointing
  # NeMo Megatron supports 'selective' activation checkpointing where only the memory-intensive part of attention is checkpointed.
  # These memory-intensive activations are also less compute-intensive, which makes activation checkpointing more efficient for LLMs (20B+).
  # See "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198) for more details.
  # 'full' will checkpoint the entire transformer layer.
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity.
  activations_checkpoint_num_layers: null
  # When using 'uniform', this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
  # When using 'block', this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
  num_micro_batches_with_partial_activation_checkpoints: null
  # This feature is valid only when used with pipeline model parallelism.
  # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
  # and recomputed within a window of micro-batches. The rest of the micro-batches in the window checkpoint all Transformer layers. The size of the window is
  # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
  # per micro-batch is set by 'activations_checkpoint_num_layers' with an 'activations_checkpoint_method' of 'block'.
  # This feature enables using activation checkpointing for a fraction of the micro-batches up to the point of full GPU memory usage.
  activations_checkpoint_layers_per_pipeline: null
  # This feature is valid only when used with pipeline model parallelism.
  # When an integer value is provided (rounded down when a float is given), it sets the number of Transformer layers to skip checkpointing at later
  # pipeline stages. For example, an 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 fewer layers than
  # stage 0, stage 2 checkpoint 6 fewer layers than stage 0, and so on. This is possible because later pipeline stages
  # use less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints',
  # this feature removes most of the activation checkpoints at the last pipeline stage, which is the critical execution path.
  sequence_parallel: False

  data:
    # Path to data must be specified by the user.
    data_train: ???
    data_validation: ???
    hard_negatives_to_train: 4
    index_mapping_dir: null # path to save index mapping .npy files; by default saved in the same location as data_prefix
    data_impl: mmap
    splits_string: 900,50,50
    seq_length: ${fine_tuning.model.encoder_seq_length}
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # cyclic, LDDL
    reset_position_ids: False # Reset position ids after end-of-document token
    reset_attention_mask: False # Reset attention mask after end-of-document token
    eod_mask_loss: False # Mask loss for the end-of-document tokens
    masked_lm_prob: 0.15 # Probability of replacing a token with mask.
    short_seq_prob: 0.1 # Probability of producing a short sequence.

  optim:
    name: fused_adam
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 100
      constant_steps: 50000
      min_lr: 5e-7
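Two derived values in this config are easy to misread: checkpoint_callback_params.model_parallel_size is the product of the tensor- and pipeline-parallel sizes (the ${multiply:...} resolver), and the comment on trainer.max_steps defines consumed_samples as global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches. Under the usual Megatron convention that data_parallel_size = devices * num_nodes / model_parallel_size (an assumption stated here, not spelled out in the file), the numbers above work out as in this small sketch (plain Python for illustration; not part of the launcher):

# Sketch only: derived batch/parallelism quantities for the config above.
devices, num_nodes = 8, 1
tensor_model_parallel_size, pipeline_model_parallel_size = 1, 1
micro_batch_size, accumulate_grad_batches = 1, 1
global_step = 4  # trainer.max_steps in this config

# checkpoint_callback_params.model_parallel_size = TP size * PP size
model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size

# Assumed convention: data-parallel replicas are the GPUs left over after model parallelism.
data_parallel_size = devices * num_nodes // model_parallel_size

# Formula from the trainer.max_steps comment in the config.
consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches

print(model_parallel_size)  # 1
print(data_parallel_size)   # 8
print(consumed_samples)     # 32, i.e. 4 steps of one global batch each

Note that micro_batch_size * data_parallel_size * accumulate_grad_batches equals the configured global_batch_size of 8, so each step consumes exactly one global batch.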
File renamed without changes.