diff --git a/.gitignore b/.gitignore index cd06a347d..747699a86 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,8 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +config/ +version.py +output/ \ No newline at end of file diff --git a/README.md b/README.md index 2f631b982..ea20ee6c9 100644 --- a/README.md +++ b/README.md @@ -1,117 +1,13 @@ - +Please refer to the [official repository](https://github.com/Oneflow-Inc/libai) and the [official documentation page](https://libai.readthedocs.io/en/latest/) for guidance on installation and other related topics. -

LiBai

-

- - docs - - - GitHub - - - GitHub release - - - PRs Welcome - - - Python Checks - - - Docs Release Status - -

- - -## Introduction - -**English** | [简体中文](/README_zh-CN.md) - -LiBai is a large-scale open-source model training toolbox based on OneFlow. The main branch works with OneFlow 0.7.0. - -
- Highlights - -- **Support a collection of parallel training components** - - LiBai provides multiple parallelisms such as Data Parallelism, Tensor Parallelism, and Pipeline Parallelism. It's also extensible for other new parallelisms. - -- **Varied training techniques** - - LiBai provides many out-of-the-box training techniques such as Distributed Training, Mixed Precision Training, Activation Checkpointing, Recomputation, Gradient Accumulation, and Zero Redundancy Optimizer(ZeRO). - -- **Support for both CV and NLP tasks** - - LiBai has predifined data process for both CV and NLP datasets such as CIFAR, ImageNet, and BERT Dataset. - -- **Easy to use** - - LiBai's components are designed to be modular for easier usage as follows: - - LazyConfig system for more flexible syntax and no predefined structures - - Friendly trainer and engine - - Used as a library to support building research projects on it. See [projects/](/projects) for some projects that are built based on LiBai - -- **High Efficiency** - -
- -## Installation - -See [Installation instructions](https://libai.readthedocs.io/en/latest/tutorials/get_started/Installation.html). - -## Getting Started - -See [Quick Run](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html) for the basic usage of LiBai. - -## Documentation - -See LiBai's [documentation](https://libai.readthedocs.io/en/latest/index.html) for full API documentation and tutorials. - -## ChangeLog - -**Beta 0.2.0** was released in 07/07/2022, the general changes in **0.2.0** version are as follows: - -**Features:** -- Support evaluation enabled and set `eval_iter` -- Support customized sampler in `config.py` -- Support rdma for pipeline-model-parallel -- Support multi fused kernel - - fused_scale_mask_softmax_dropout - - fused_scale_tril_softmax_mask_scale - - fused_self_attention in branch `libai_bench` -- User Experience Optimization -- Optimization for training throughput, see [benchmark](https://libai.readthedocs.io/en/latest/tutorials/get_started/Benchmark.html) for more details - -**Supported Models:** -- Support 3D parallel [Roberta](https://arxiv.org/abs/1907.11692) model -- Support 2D parallel (data parallel + tensor model parallel) [SimCSE](https://arxiv.org/abs/2104.08821) model -- Support Data parallel [MAE](https://arxiv.org/abs/2111.06377) model -- Support Data parallel [MOCOV3](https://arxiv.org/abs/2104.02057) model - -See [changelog](./changelog.md) for details and release history. - -## Contributing - -We appreciate all contributions to improve LiBai. See [CONTRIBUTING](./CONTRIBUTING.md) for the contributing guideline. - -## License - -This project is released under the [Apache 2.0 license](LICENSE). 
- -## Citation - -If you find this project useful for your research, consider cite: - -```BibTeX -@misc{of2021libai, - author = {Xingyu Liao and Peng Cheng and Tianhe Ren and Depeng Liang and - Kai Dang and Yi Wang and Xiaoyu Xu}, - title = {LiBai}, - howpublished = {\url{https://github.com/Oneflow-Inc/libai}}, - year = {2021} -} +## Running experiments in the OCCL paper +```shell +bash tools/train.sh tools/train_net.py configs/vit_imagenet.py ``` -## Join the WeChat group - -![LiBai_Wechat_QRcode](./docs/source/tutorials/assets/LiBai_Wechat.png) \ No newline at end of file +Notes: +- Prepare the ImageNet dataset in advance. +- Edit the [configs/vit_imagenet.py](configs/vit_imagenet.py#L84-L86) to switch among different distributed DNN training methods, following the guidelines in the [official doc](https://libai.readthedocs.io/en/latest/tutorials/basics/Distributed_Configuration.html). +- For training across multiple machines, edit the `NODE`, `NODE_RANK`, `ADDR`, and `PORT` variables in [tools/train.sh](tools/train.sh#L8-L11). +- Edit [configs/vit_imagenet.py](configs/vit_imagenet.py#L2) to choose between the base ViT configuration or the large ViT configuration. +- If the environment variable `ONEFLOW_ENABLE_OFCCL` in [train.sh](tools/train.sh#L28) is set to `1`, OCCL will be used during training; otherwise, NCCL will be employed. 
diff --git a/configs/common/models/vit/vit_base_patch16_224.py b/configs/common/models/vit/vit_base_patch16_224.py index dd102b656..d66cb962c 100644 --- a/configs/common/models/vit/vit_base_patch16_224.py +++ b/configs/common/models/vit/vit_base_patch16_224.py @@ -6,6 +6,6 @@ cfg.patch_size = 16 cfg.embed_dim = 768 -cfg.num_heads = 12 +cfg.num_heads = 16 model = LazyCall(VisionTransformer)(cfg=cfg) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 91bdc914b..33728b5b0 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,5 +1,5 @@ from libai.config import LazyCall -from .common.models.vit.vit_base_patch16_224 import model +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model from .common.models.graph import graph from .common.train import train from .common.optim import optim @@ -12,6 +12,31 @@ dataloader.train.dataset[0].root = "/path/to/imagenet" dataloader.test[0].dataset.root = "/path/to/imagenet" +import os +host = os.getenv("HOST") + +if (host == "oneflow-28"): + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" +elif (host == "oneflow-15"): + dataloader.train.dataset[0].root = "/minio/sdd/dataset/imagenet/extract" + dataloader.test[0].dataset.root = "/minio/sdd/dataset/imagenet/extract" +elif (host == "oneflow-16"): + dataloader.train.dataset[0].root = "/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/dataset/ImageNet/extract" +elif (host == "oneflow-25"): + dataloader.train.dataset[0].root = "/data/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/data/dataset/ImageNet/extract" +elif (host == "oneflow-26"): + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" +elif (host == "oneflow-27"): + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + 
dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" +else: + print("NO LEGAL HOST, exit.") + exit(1) + # Refine model cfg for vit training on imagenet model.cfg.num_classes = 1000 model.cfg.loss_func = SoftTargetCrossEntropy() @@ -37,9 +62,12 @@ # Refine train cfg for vit model train.train_micro_batch_size = 128 train.test_micro_batch_size = 128 -train.train_epoch = 300 +# train.train_epoch = 300 +train.train_epoch = 0 +train.train_iter = int(os.getenv("NUM_ITER_ENV")) train.warmup_ratio = 5 / 300 -train.evaluation.eval_period = 1000 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 train.log_period = 1 # Scheduler @@ -50,8 +78,12 @@ # Set fp16 ON train.amp.enabled = True +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth -train.dist.data_parallel_size = 1 -train.dist.tensor_parallel_size = 1 -train.dist.pipeline_parallel_size = 1 +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 diff --git a/configs/vit_imagenet_a100.py b/configs/vit_imagenet_a100.py new file mode 100644 index 000000000..35e931d21 --- /dev/null +++ b/configs/vit_imagenet_a100.py @@ -0,0 +1,65 @@ +from libai.config import LazyCall +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model +from .common.models.graph import graph +from .common.train import train +from .common.optim import optim +from .common.data.imagenet import dataloader + +from flowvision.data import Mixup +from flowvision.loss.cross_entropy import SoftTargetCrossEntropy + +# Refine data path to imagenet +dataloader.train.dataset[0].root = "/data/ImageNet/extract" +dataloader.test[0].dataset.root = "/data/ImageNet/extract" + +# Refine model cfg for vit training on imagenet +model.cfg.num_classes = 1000 +model.cfg.loss_func = SoftTargetCrossEntropy() + +# Add Mixup Func 
+dataloader.train.mixup_func = LazyCall(Mixup)( + mixup_alpha=0.8, + cutmix_alpha=1.0, + prob=1.0, + switch_prob=0.5, + mode="batch", + num_classes=model.cfg.num_classes, +) + +# Refine optimizer cfg for vit model +optim.lr = 1e-3 # 5e-4 * 1024 (batchsize) / 512 +optim.eps = 1e-8 +optim.weight_decay = 0.05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None +optim.params.overrides = {"pos_embed": {"weight_decay": 0.0}, "cls_token": {"weight_decay": 0.0}} + +# Refine train cfg for vit model +train.train_micro_batch_size = 128 +train.test_micro_batch_size = 128 +# train.train_epoch = 300 +train.train_epoch = 0 +import os +train.train_iter = int(os.getenv("NUM_ITER_ENV")) +train.warmup_ratio = 5 / 300 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 +train.log_period = 1 + +# Scheduler +train.scheduler.warmup_factor = 0.001 +train.scheduler.alpha = 0.01 +train.scheduler.warmup_method = "linear" + +# Set fp16 ON +train.amp.enabled = True + +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + +# Distributed Settings +train.dist.pipeline_num_layers = model.cfg.depth +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 diff --git a/configs/vit_imagenet_para_4090.py b/configs/vit_imagenet_para_4090.py new file mode 100644 index 000000000..6ddf5a1d8 --- /dev/null +++ b/configs/vit_imagenet_para_4090.py @@ -0,0 +1,68 @@ +from libai.config import LazyCall +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model +from .common.models.graph import graph +from .common.train import train +from .common.optim import optim +from .common.data.imagenet import dataloader + +from flowvision.data import Mixup +from flowvision.loss.cross_entropy import SoftTargetCrossEntropy + + +import os +host = os.getenv("HOST") + + +dataloader.train.dataset[0].root = "/HOME/scw6cab/run/OCCL/ImageNet" 
+dataloader.test[0].dataset.root = "/HOME/scw6cab/run/OCCL/ImageNet" + +# Refine model cfg for vit training on imagenet +model.cfg.num_classes = 1000 +model.cfg.loss_func = SoftTargetCrossEntropy() + +# Add Mixup Func +dataloader.train.mixup_func = LazyCall(Mixup)( + mixup_alpha=0.8, + cutmix_alpha=1.0, + prob=1.0, + switch_prob=0.5, + mode="batch", + num_classes=model.cfg.num_classes, +) + +# Refine optimizer cfg for vit model +optim.lr = 1e-3 # 5e-4 * 1024 (batchsize) / 512 +optim.eps = 1e-8 +optim.weight_decay = 0.05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None +optim.params.overrides = {"pos_embed": {"weight_decay": 0.0}, "cls_token": {"weight_decay": 0.0}} + +# Refine train cfg for vit model +train.train_micro_batch_size = 128 +train.test_micro_batch_size = 128 +# train.train_epoch = 300 +train.train_epoch = 0 +train.train_iter = int(os.getenv("NUM_ITER_ENV")) +train.warmup_ratio = 5 / 300 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 +train.log_period = 1 + +# Scheduler +train.scheduler.warmup_factor = 0.001 +train.scheduler.alpha = 0.01 +train.scheduler.warmup_method = "linear" + +# Set fp16 ON +train.amp.enabled = True + +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + +# Distributed Settings +train.dist.pipeline_num_layers = model.cfg.depth +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 diff --git a/libai/models/utils/graph_base.py b/libai/models/utils/graph_base.py index dc49a7a7a..4d532558e 100644 --- a/libai/models/utils/graph_base.py +++ b/libai/models/utils/graph_base.py @@ -73,8 +73,12 @@ def __init__( # Enable cuda stream for computation and communication as the same stream. # This will reduce memory when using model parallelism. 
dist_util = dist.get_dist_util() - if dist_util.is_tensor_model_parallel() or dist_util.is_pipeline_model_parallel(): - flow.boxing.nccl.enable_use_compute_stream(True) + import os + enable_occl = os.getenv("ONEFLOW_ENABLE_OFCCL") + disable_nccl_compute_stream = os.getenv("DISABLE_NCCL_COMPUTE_STREAM") + if enable_occl != "1" and disable_nccl_compute_stream != "1": + if dist_util.is_tensor_model_parallel() or dist_util.is_pipeline_model_parallel(): + flow.boxing.nccl.enable_use_compute_stream(True) # auto_parallel if auto_parallel_conf is not None and auto_parallel_conf.enabled: diff --git a/tools/train.sh b/tools/train.sh index 714ac9953..d526528b4 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +clear + FILE=$1 CONFIG=$2 GPUS=$3 @@ -8,9 +10,104 @@ NODE_RANK=${NODE_RANK:-0} ADDR=${ADDR:-127.0.0.1} PORT=${PORT:-12345} +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +export ONEFLOW_OFCCL_SKIP_NEGO=0 +export 
RECV_SUCCESS_FACTOR=5 +export RECV_SUCCESS_THRESHOLD=10000000 +export BASE_CTX_SWITCH_THRESHOLD=20000 +export TOLERANT_UNPROGRESSED_CNT=80000 +export NUM_TRY_TASKQ_HEAD=10 + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true -python3 -m oneflow.distributed.launch \ ---nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ -$FILE --config-file $CONFIG ${@:4} +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! 
-d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 diff --git a/tools/train_27.sh b/tools/train_27.sh new file mode 100755 index 000000000..44305b1fe --- /dev/null +++ b/tools/train_27.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export 
NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export 
BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=2000 + export BASE_CTX_SWITCH_THRESHOLD=200 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=5 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED 
+export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + diff --git a/tools/train_27_25.sh b/tools/train_27_25.sh new file mode 100755 index 000000000..6c0252005 --- /dev/null +++ b/tools/train_27_25.sh @@ -0,0 +1,215 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 + +# NODE=${NODE:-1} +# NODE_RANK=${NODE_RANK:-0} +# ADDR=${ADDR:-127.0.0.1} +# PORT=${PORT:-12345} + +NODE=2 + +if [[ $HOST = "oneflow-27" ]]; then + NODE_RANK=0 +elif [[ $HOST = "oneflow-25" ]]; then + NODE_RANK=1 +fi +echo $NODE_RANK + +ADDR=11.11.1.27 +PORT=12345 + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export 
NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=20 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; 
then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo 
NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! 
-d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + diff --git a/tools/train_a100.sh b/tools/train_a100.sh new file mode 100755 index 000000000..20cc9930e --- /dev/null +++ b/tools/train_a100.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export 
NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export 
BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf 
/home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + diff --git a/tools/train_para_4090.sh b/tools/train_para_4090.sh new file mode 100755 index 000000000..4128a7f2a --- /dev/null +++ b/tools/train_para_4090.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=0 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export 
GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/HOME/scw6cab/run/OCCL/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + # export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export 
RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo 
TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /HOME/scw6cab/run/OCCL/libai/log +mkdir -p /HOME/scw6cab/run/OCCL/libai/log + +rm -rf /HOME/scw6cab/run/OCCL/oneflow/log +mkdir -p /HOME/scw6cab/run/OCCL/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/HOME/scw6cab/run/OCCL/oneflow/log/nsys" ];then + mkdir -p /HOME/scw6cab/run/OCCL/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /HOME/scw6cab/run/OCCL/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /HOME/scw6cab/run/OCCL/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /HOME/scw6cab/run/OCCL/oneflow/log/oneflow.log 2>&1 +