From 254cdbbce2361ba39a26186d174849b31e3fb4f2 Mon Sep 17 00:00:00 2001
From: Rick Ho
Date: Mon, 8 Nov 2021 15:06:01 +0800
Subject: [PATCH 1/2] update release note

---
 cuda/utils/cublas_wrapper.h | 20 ++++++++++----------
 cuda/utils/helper_cuda.h    | 21 +++++++++++----------
 doc/release-note.md         | 31 +++++++++++++++++++++++++++++--
 fmoe/layers.py              |  2 +-
 fmoe/megatron/patch.py      |  4 ++--
 5 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/cuda/utils/cublas_wrapper.h b/cuda/utils/cublas_wrapper.h
index 4e7bc1cb..c74c034f 100644
--- a/cuda/utils/cublas_wrapper.h
+++ b/cuda/utils/cublas_wrapper.h
@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                   const c10::Half *beta,
                                   c10::Half *C, int ldc) {
 #ifdef FMOE_USE_HIP
-    return rocblas_hgemm(handle, transa, transb, m, n, k, 
-            (const rocblas_half*)alpha, 
-            (const rocblas_half*)A, lda, 
-            (const rocblas_half*)B, ldb, 
-            (const rocblas_half*)beta, 
+    return rocblas_hgemm(handle, transa, transb, m, n, k,
+            (const rocblas_half*)alpha,
+            (const rocblas_half*)A, lda,
+            (const rocblas_half*)B, ldb,
+            (const rocblas_half*)beta,
             (rocblas_half*)C, ldc);
 #else
-    return cublasHgemm(handle, transa, transb, m, n, k, 
-            (const __half*)alpha, 
-            (const __half*)A, lda, 
-            (const __half*)B, ldb, 
-            (const __half*)beta, 
+    return cublasHgemm(handle, transa, transb, m, n, k,
+            (const __half*)alpha,
+            (const __half*)A, lda,
+            (const __half*)B, ldb,
+            (const __half*)beta,
             (__half*)C, ldc);
 #endif
 }
diff --git a/cuda/utils/helper_cuda.h b/cuda/utils/helper_cuda.h
index 1e4830eb..1bd4d2b1 100644
--- a/cuda/utils/helper_cuda.h
+++ b/cuda/utils/helper_cuda.h
@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
 #ifdef FMOE_USE_HIP
 static const char *_cudaGetErrorEnum(cublasStatus_t error) {
     switch (error) {
-    
+
     case rocblas_status_success:
         return "rocblas_status_success";
-    
+
     case rocblas_status_invalid_handle:
         return "rocblas_status_invalid_handle";
-    
+
     case rocblas_status_not_implemented:
         return "rocblas_status_not_implemented";
 
     case rocblas_status_invalid_pointer:
         return "rocblas_status_invalid_pointer:";
-    
+
     case rocblas_status_invalid_size:
         return "rocblas_status_invalid_size";
-    
+
     case rocblas_status_memory_error:
         return "rocblas_status_memory_error";
-    
+
     case rocblas_status_internal_error:
         return "rocblas_status_internal_error";
-    
+
     case rocblas_status_perf_degraded:
         return "rocblas_status_perf_degraded";
 
@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 
     case rocblas_status_size_increased:
         return "rocblas_status_size_increased";
-    
+
     case rocblas_status_size_unchanged:
         return "rocblas_status_size_unchanged";
-    
+
     case rocblas_status_invalid_value:
         return "rocblas_status_invalid_value";
-    
+
     case rocblas_status_continue:
         return "rocblas_status_continue";
     }
@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
 #endif // HELPER_CUDA_H
+
diff --git a/doc/release-note.md b/doc/release-note.md
index 178f5447..a8620833 100644
--- a/doc/release-note.md
+++ b/doc/release-note.md
@@ -1,10 +1,37 @@
+## v0.3.0
+
+### FMoE core
+
+* Previous `mp_group` is renamed to `slice_gruop`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_gruop` will be deprecated in our next release.
+* ROCm supported.
+* `FMoELinear` is moved to a stand-alone file.
+
+### Grouped data parallel
+
+* Support any group name by its relative tag name.
+
+### Load balancing
+
+* A brand-new balancing strategy, SWIPE, contributed by the authors of a (currently unpublished) paper.
+* A property `has_loss` is added to each gate to indicate whether the balance loss should be collected.
+
+### Megatron-LM support
+
+* Experts are partitioned by tensor model parallelism in `mp_group`, instead of by expert parallelism.
+* Support arbitrary customized gates in `MegatronMLP`.
+* Move the patches to a stand-alone file.
+
+### Tests
+
+* Move util functions into `test_ddp.py`.
+
 ## v0.2.1
 
 ## Load balancing
 
 * Fix gradient for balance loss.
 
-## Misc
+### Misc
 
 * Typos.
 * Update benchmark interface.
@@ -12,7 +39,7 @@
 * Enable `USE_NCCL` by default.
 * Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.
 
-## Megatron adaption
+### Megatron adaption
 
 * Patch for numerical correctness of gradient clipping.
 * Support to pipeline parallelism.
diff --git a/fmoe/layers.py b/fmoe/layers.py
index 8fb23158..6f124ae5 100644
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -72,7 +72,7 @@ class FMoE(nn.Module):
     group hold the same copy of input feature, and requires the same copy
     of the output. For each worker, FMoE only computes the output of a
     certain slice of the input batch, and will all-gather the outputs after
-    computation. 
+    computation.
     * `top_k` stands for the number of experts each token is going to.
     * `gate` is a gate class which can found in `fmoe.gates`.
     * `expert` can be specified as a module class, it is used to generate
diff --git a/fmoe/megatron/patch.py b/fmoe/megatron/patch.py
index f54af8d7..4a5cce21 100644
--- a/fmoe/megatron/patch.py
+++ b/fmoe/megatron/patch.py
@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
     def fmoefied_model_provider():
         from .layers import fmoefy
         args = get_args()
-        hhs = args.hidden_size * 4 
+        hhs = args.hidden_size * 4
         assert hhs % args.top_k == 0
-        hhs = hhs // args.top_k 
+        hhs = hhs // args.top_k
         assert hhs % args.tensor_model_parallel_size == 0
         hhs = hhs // args.tensor_model_parallel_size
         return fmoefy(

From a461be6c6a4d1385e11c7daae6d9dfbd182eef8c Mon Sep 17 00:00:00 2001
From: Rick Ho
Date: Mon, 8 Nov 2021 17:30:56 +0800
Subject: [PATCH 2/2] fix typo

---
 doc/release-note.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/release-note.md b/doc/release-note.md
index a8620833..ba016b3a 100644
--- a/doc/release-note.md
+++ b/doc/release-note.md
@@ -2,7 +2,7 @@
 
 ### FMoE core
 
-* Previous `mp_group` is renamed to `slice_gruop`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_gruop` will be deprecated in our next release.
+* Previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_group` will be deprecated in our next release.
 * ROCm supported.
 * `FMoELinear` is moved to a stand-alone file.
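
The `slice_group` semantics described in the release note (every worker in the group receives the same input batch, computes only a slice of it, and all-gathers the outputs) can be sketched in plain `torch.distributed` terms as below. This is a minimal illustrative sketch, not FastMoE's actual code path: the helper name `forward_slice_and_gather` is hypothetical, the batch is assumed to divide evenly across the group, and the real `FMoE` layer performs this internally with gradient-aware communication.

```python
import torch
import torch.distributed as dist


def forward_slice_and_gather(x, expert_fn, group=None):
    """Hypothetical sketch of slice_group behavior: every rank holds the
    same batch `x`, computes its own slice, then all-gathers the results
    so every rank ends with the full output batch."""
    world = dist.get_world_size(group)
    rank = dist.get_rank(group)
    assert x.size(0) % world == 0, "batch must split evenly across the group"
    # pick this rank's slice of the (replicated) input batch
    local_out = expert_fn(x.chunk(world, dim=0)[rank])
    # all-gather so every rank receives the same copy of the output
    outs = [torch.empty_like(local_out) for _ in range(world)]
    dist.all_gather(outs, local_out, group=group)
    return torch.cat(outs, dim=0)
```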
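Similarly, the `fmoe/megatron/patch.py` hunk above sizes each expert's hidden layer by shrinking Megatron's 4x FFN width, first by `top_k` and then by the tensor-model-parallel degree. A minimal sketch of that arithmetic follows; the helper name `per_expert_hidden_size` and the example numbers are chosen for illustration only.

```python
def per_expert_hidden_size(hidden_size, top_k, tensor_model_parallel_size):
    # Mirrors fmoefied_model_provider above: Megatron FFNs are 4x the model
    # hidden size, split across the top_k experts each token visits, then
    # across the tensor-model-parallel ranks that share each expert.
    hhs = hidden_size * 4
    assert hhs % top_k == 0
    hhs //= top_k
    assert hhs % tensor_model_parallel_size == 0
    return hhs // tensor_model_parallel_size


# e.g. hidden_size=1024, top_k=2, tensor_model_parallel_size=2 -> 1024
assert per_expert_hidden_size(1024, 2, 2) == 1024
```

Splitting in this order means each assertion in the patch fails fast, pointing at whichever divisor (`top_k` or the tensor-parallel size) makes the configuration indivisible.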