From bdb125f83fb079e82963aeca6f50a23493a2ac6b Mon Sep 17 00:00:00 2001
From: Guangyao Zhang
Date: Sat, 14 Sep 2024 11:01:05 +0800
Subject: [PATCH] [doc] FP8 training and communication document (#6050)

* Add FP8 training and communication document

* add fp8 docstring for plugins

* fix typo

* fix typo
---
 colossalai/booster/plugin/gemini_plugin.py          |  2 ++
 .../booster/plugin/hybrid_parallel_plugin.py        |  3 ++-
 colossalai/booster/plugin/low_level_zero_plugin.py  |  2 ++
 .../booster/plugin/moe_hybrid_parallel_plugin.py    |  4 +++-
 colossalai/booster/plugin/torch_ddp_plugin.py       |  1 +
 .../mixed_precision_training_with_booster.md        | 12 ++++++++++--
 .../mixed_precision_training_with_booster.md        | 14 +++++++++++---
 7 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 6a5d0c1613df..ae49aa8b148d 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -323,7 +323,9 @@ class GeminiPlugin(DPPluginBase):
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
         enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
         verbose (bool, optional): verbose mode. Debug info including chunk search result will be printed. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """

     def __init__(
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 8e972d0146da..bb663f6a6d5e 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -981,7 +981,8 @@ class HybridParallelPlugin(PipelinePluginBase):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
         overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
         inner_ring_size (int, optional): The inner ring size of 2D Ring Attention when sp mode is "ring_attn".
             It's advisable to not tune this (especially in single-node settings) and let it be heuristically set based on topology by default.
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index cec15dd5dd34..b167b5c7a59e 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -327,6 +327,8 @@ class LowLevelZeroPlugin(DPPluginBase):
         overlap_communication (bool, optional): whether to overlap communication and computation. Defaults to True.
         cpu_offload (bool, optional): whether to offload grad, master weight and optimizer state to cpu.
            Defaults to False.
         verbose (bool, optional): verbose mode. Debug info including grad overflow will be printed. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """

     def __init__(
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 2324a5239d79..0807b374901a 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -170,7 +170,9 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
+        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """

     def __init__(
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 61d785a4c9e7..ec7ce7f9aae4 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -169,6 +169,7 @@ class TorchDDPPlugin(DPPluginBase):
         check_reduction (bool, optional): Whether to check reduction. Defaults to False.
         gradient_as_bucket_view (bool, optional): Whether to use gradient as bucket view. Defaults to False.
         static_graph (bool, optional): Whether to use static graph. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """

     def __init__(
diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md
index baaaacdddf9e..65304b1f4e65 100644
--- a/docs/source/en/features/mixed_precision_training_with_booster.md
+++ b/docs/source/en/features/mixed_precision_training_with_booster.md
@@ -9,6 +9,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan)

 **Related Paper**

 - [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+- [FP8 Formats for Deep Learning](https://arxiv.org/pdf/2209.05433)

 ## Introduction
@@ -60,7 +61,11 @@ However, there are other operations, like reductions, which require the dynamic

 ## AMP in Colossal-AI

-We supported three AMP training methods and allowed the user to train with AMP with no code. If you want to train with amp, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Next we will support `bf16`, `fp8`.
+We support three AMP training methods and allow users to train with AMP without any code changes. If you want to train with AMP, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Next we will support `bf16`.
+
+Currently we only support `fp8` mixed precision training for the `Linear` layer. Please specify the `use_fp8` parameter when creating the plugin object.
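+
+For illustration, a minimal sketch of turning it on when the plugin is created (here `LowLevelZeroPlugin` is used as an example; any plugin above whose docstring lists `use_fp8` works the same way):
+
+```python
+from colossalai.booster import Booster
+from colossalai.booster.plugin import LowLevelZeroPlugin
+
+# use_fp8=True switches the plugin's Linear layers to fp8 mixed precision training
+plugin = LowLevelZeroPlugin(use_fp8=True)
+booster = Booster(plugin=plugin)
+```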
+
+To reduce the inter-node communication volume in low-bandwidth scenarios, we support FP8 communication compression. Please specify the `fp8_communication` parameter when creating the plugin object.

 ### Start with Booster

@@ -74,7 +79,6 @@ instantiate `Booster` with `mixed_precision="fp16"`, then you can train with tor
     'fp16': torch amp
     'fp16_apex': apex amp,
     'bf16': bf16,
-    'fp8': fp8,
     'fp16_naive': naive amp
 """
 from colossalai import Booster
@@ -128,6 +132,10 @@ The output model is converted to AMP model of smaller memory consumption.
 If your input model is already too large to fit in a GPU, please instantiate your model weights in `dtype=torch.float16`. Otherwise, try smaller models or checkout more parallelization training techniques!

+### FP8 Communication
+
+In low-bandwidth scenarios, to reduce the communication load across multiple nodes, we support FP8 communication compression. It can be enabled by passing `fp8_communication=True` when you create the plugin object (such as `GeminiPlugin`). Inter-node all-to-all, all-gather and P2P operations will then use the FP8 format for data transmission. FP8 communication for reduction operators such as all-reduce and reduce-scatter is not supported yet due to the lack of support in the NCCL library.
+
 ## Hands-on Practice

 Now we will introduce the use of AMP with Colossal-AI. In this practice, we will use Torch AMP as an example.
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index 53d9013db296..da377ceb294b 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -9,6 +9,7 @@

 **相关论文**

 - [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+- [FP8 Formats for Deep Learning](https://arxiv.org/pdf/2209.05433)

 ## 引言
@@ -56,9 +57,13 @@ AMP 代表自动混合精度训练。

 ## Colossal-AI 中的 AMP

-我们支持三种 AMP 训练方法,并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster 支持 amp 特性注入,如果您要使用混合精度训练,则在创建 booster 实例时指定`mixed_precision`参数;后续将会拓展`bf16`,`pf8`的混合精度训练.
+我们支持三种 AMP 训练方法,并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster 支持 amp 特性注入,如果您要使用混合精度训练,则在创建 booster 实例时指定`mixed_precision`参数;后续将会拓展`bf16`。

-#### booster 启动方式
+我们目前只支持`Linear`层的`fp8`混合精度训练,如果您需要使用,请在创建 plugin 实例时指定`use_fp8`参数。
+
+为了减少低带宽场景下多机之间的通讯负载,我们还支持了 FP8 通讯。如果您需要使用,请在创建 plugin 实例时指定`fp8_communication`参数。
+
+### booster 启动方式

 您可以在创建 booster 实例时,指定`mixed_precision="fp16"`即使用 torch amp。

@@ -70,7 +75,6 @@
     'fp16': torch amp
     'fp16_apex': apex amp,
     'bf16': bf16,
-    'fp8': fp8,
     'fp16_naive': naive amp
 """
 from colossalai import Booster
@@ -118,6 +122,10 @@ booster = Booster(mixed_precision=mixed_precision,...)

 当使用`colossalai.booster`时, 首先需要实例化一个模型、一个优化器和一个标准。将输出模型转换为内存消耗较小的 AMP 模型。如果您的输入模型已经太大,无法放置在 GPU 中,请使用`dtype=torch.float16`实例化你的模型。或者请尝试更小的模型,或尝试更多的并行化训练技术!

+### FP8 通讯
+
+在低带宽场景下,为了减少多机间的通讯负载,我们支持使用 FP8 的形式对通讯进行压缩,可以在初始化 plugin 实例(如`GeminiPlugin`)时设置`fp8_communication=True`来启用。此时多机之间的 all-to-all、all-gather 以及 P2P 操作将使用 FP8 格式进行数据传输。受限于 NCCL 库的支持,目前不支持缩减(Reduction)算子(如 all-reduce、reduce-scatter)的 FP8 通讯。
+
 ## 实例

 下面我们将展现如何在 Colossal-AI 使用 AMP。在该例程中,我们使用 Torch AMP.
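
As a reference for the FP8 options documented above, a minimal end-to-end sketch follows. The model, optimizer and data are placeholders; it assumes the script is launched with `torchrun` on hardware and a PyTorch build with FP8 support, and it uses standard Colossal-AI APIs (`launch_from_torch`, `HybridAdam`, `booster.boost`, `booster.backward`) that are not part of this patch:

```python
import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

# assumes the process group is set up by torchrun
colossalai.launch_from_torch()

# fp8_communication=True compresses all-to-all, all-gather and P2P traffic to fp8;
# use_fp8=True additionally runs Linear layers in fp8 mixed precision.
plugin = GeminiPlugin(use_fp8=True, fp8_communication=True)
booster = Booster(plugin=plugin)

model = torch.nn.Linear(1024, 1024)         # placeholder model
optimizer = HybridAdam(model.parameters())  # placeholder optimizer
criterion = torch.nn.MSELoss()

model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

# a single training step with the booster API
x = torch.randn(8, 1024, device=torch.cuda.current_device())
loss = criterion(model(x), x)
booster.backward(loss, optimizer)
optimizer.step()
optimizer.zero_grad()
```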