# Commit 42674bd: [example] updated hybrid model parallel GPT pretraining examples

1 parent: ef4b99e. Showing 9 changed files with 2,851 additions and 0 deletions.
## examples/language/gpt/experiments/hybrid_parallel/Dockerfile (20 additions)
```dockerfile
FROM nvcr.io/nvidia/pytorch:22.12-py3

WORKDIR /workspace

RUN pip install -U --no-cache-dir torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 torchtext torchdata --index-url https://download.pytorch.org/whl/cu118

RUN pip install -U --no-cache-dir transformers datasets

RUN pip uninstall -y apex && git clone https://github.com/NVIDIA/apex.git && cd apex && \
    python setup.py install --cpp_ext --cuda_ext --fast_layer_norm --fmha --xentropy --fast_multihead_attn

RUN pip install -U --no-cache-dir ninja && \
    pip install -v -U --no-cache-dir git+https://github.com/facebookresearch/xformers.git@main#egg=xformers

RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && python setup.py install && \
    cd csrc/rotary && python setup.py install

RUN git clone https://github.com/kurisusnowdeng/ColossalAI.git && cd ColossalAI && \
    CUDA_EXT=1 pip install -U -v --no-cache-dir -e .
```
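A minimal build-and-run sketch for this image; the tag `gpt-hybrid` and the `/data` mount path are illustrative choices, not part of the commit. Note that the build clones sources into `/workspace`, so the host directory is mounted elsewhere to avoid shadowing the editable ColossalAI install:

```shell
# Build from the directory containing the Dockerfile.
docker build -t gpt-hybrid .

# Enter the container with all GPUs visible; --ipc=host is the usual
# recommendation for PyTorch DataLoader workers in NGC-based images.
docker run --rm -it --gpus all --ipc=host \
    -v "$PWD":/data \
    gpt-hybrid bash
```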
## examples/language/gpt/experiments/hybrid_parallel/README.md (90 additions)
# GPT2 benchmark

## Preparation

### Dependencies

Install apex:

```shell
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v -U --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" .
```
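A quick post-install check that the CUDA extensions actually built; `FusedLayerNorm` lives in `apex.normalization`, so a failing import here usually means the build fell back to a Python-only install:

```shell
python -c "from apex.normalization import FusedLayerNorm; print('apex OK')"
```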
Install xformers:

```shell
pip install ninja
git clone https://github.com/facebookresearch/xformers.git
cd xformers
git submodule update --init --recursive
pip install -v -U .
```
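xformers ships a diagnostic entry point that lists which attention kernels were compiled; it is a convenient way to confirm the source build picked up your CUDA toolchain:

```shell
python -m xformers.info
```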
Install bitsandbytes (e.g. for CUDA 11.8):

```shell
git clone https://github.com/timdettmers/bitsandbytes.git
cd bitsandbytes
CUDA_VERSION=118 make cuda11x
python setup.py install
```
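Running the bitsandbytes module directly prints its CUDA setup diagnostics (which binary was loaded, which CUDA version it detected); the exact output varies across versions:

```shell
python -m bitsandbytes
```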
Install ColossalAI:

```shell
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
CUDA_EXT=1 pip install -v -U .
```
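The install also provides the `colossalai` CLI; `colossalai check -i` reports the installed version and whether the CUDA extensions compiled (flag as of the ColossalAI CLI of this era; newer releases may differ):

```shell
colossalai check -i
```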
### Dataset

Download and preprocess the OpenWebText corpus:

```shell
pip install -U transformers datasets
python process_data.py --output-path /PATH/TO/PROCESSED/OPENWEBTEXT
```
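Once the script finishes, the processed corpus sits under the output path; a quick size check that makes no assumptions about the on-disk format:

```shell
du -sh /PATH/TO/PROCESSED/OPENWEBTEXT
```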
## Usage

### PyTorch FSDP

```shell
OMP_NUM_THREADS=128 torchrun --nproc_per_node 8 --master_port 23333 train_torch.py \
    --data-path /PATH/TO/PROCESSED/OPENWEBTEXT \
    --model gpt2-10b \
    --max-iters 10 --eval-iters 1 --warmup-iters 0 \
    --batch-size 4 --global-batch-size 128 \
    --optim AdamW \
    --dtype float16 \
    --recompute \
    --zero-stage 3
```
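For reference, assuming `train_torch.py` derives gradient accumulation from these flags (an assumption; check the script), the per-step arithmetic is: 4 samples per GPU × 8 GPUs = 32 samples per micro-step, so a global batch of 128 implies 4 accumulation steps.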
### ColossalAI Gemini

```shell
OMP_NUM_THREADS=128 torchrun --nproc_per_node 8 --master_port 23333 train_gemini.py \
    --data-path /PATH/TO/PROCESSED/OPENWEBTEXT \
    --model gpt2-10b \
    --max-iters 10 --eval-iters 1 --warmup-iters 0 \
    --batch-size 4 \
    --optim AdamW \
    --dtype float16 \
    --recompute \
    --flash \
    --zero-stage 3
```
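The `--flash` flag presumably routes attention through the FlashAttention kernels built in the Dockerfile above; that reading is an assumption to confirm against `train_gemini.py`.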
### ColossalAI Tensor Parallelism

```shell
OMP_NUM_THREADS=128 torchrun --nproc_per_node 8 --master_port 23333 train_col.py \
    --data-path /PATH/TO/PROCESSED/OPENWEBTEXT \
    --model gpt2-10b \
    --max-iters 10 --eval-iters 1 --warmup-iters 0 \
    --batch-size 4 --global-batch-size 128 \
    --optim AdamW \
    --dtype float16 --amp-level 2 \
    --recompute \
    --flash \
    --tp 1d --tp-size 4 \
    --zero-stage 3
```
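With 8 processes and `--tp-size 4`, 1D tensor parallelism splits each weight matrix across 4 ranks, leaving 8 / 4 = 2 data-parallel groups; exactly how `train_col.py` composes these process groups with ZeRO stage 3 is an assumption to verify in the script.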