From 254cdbbce2361ba39a26186d174849b31e3fb4f2 Mon Sep 17 00:00:00 2001
From: Rick Ho
Date: Mon, 8 Nov 2021 15:06:01 +0800
Subject: [PATCH 1/2] update release note

---
 cuda/utils/cublas_wrapper.h | 20 ++++++++++----------
 cuda/utils/helper_cuda.h    | 21 +++++++++++----------
 doc/release-note.md         | 31 +++++++++++++++++++++++++++++--
 fmoe/layers.py              |  2 +-
 fmoe/megatron/patch.py      |  4 ++--
 5 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/cuda/utils/cublas_wrapper.h b/cuda/utils/cublas_wrapper.h
index 4e7bc1cb..c74c034f 100644
--- a/cuda/utils/cublas_wrapper.h
+++ b/cuda/utils/cublas_wrapper.h
@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                   const c10::Half *beta,
                                   c10::Half *C, int ldc) {
 #ifdef FMOE_USE_HIP
-    return rocblas_hgemm(handle, transa, transb, m, n, k, 
-            (const rocblas_half*)alpha, 
-            (const rocblas_half*)A, lda, 
-            (const rocblas_half*)B, ldb, 
-            (const rocblas_half*)beta, 
+    return rocblas_hgemm(handle, transa, transb, m, n, k,
+            (const rocblas_half*)alpha,
+            (const rocblas_half*)A, lda,
+            (const rocblas_half*)B, ldb,
+            (const rocblas_half*)beta,
             (rocblas_half*)C, ldc);
 #else
-    return cublasHgemm(handle, transa, transb, m, n, k, 
-            (const __half*)alpha, 
-            (const __half*)A, lda, 
-            (const __half*)B, ldb, 
-            (const __half*)beta, 
+    return cublasHgemm(handle, transa, transb, m, n, k,
+            (const __half*)alpha,
+            (const __half*)A, lda,
+            (const __half*)B, ldb,
+            (const __half*)beta,
             (__half*)C, ldc);
 #endif
 }
diff --git a/cuda/utils/helper_cuda.h b/cuda/utils/helper_cuda.h
index 1e4830eb..1bd4d2b1 100644
--- a/cuda/utils/helper_cuda.h
+++ b/cuda/utils/helper_cuda.h
@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
 #ifdef FMOE_USE_HIP
 static const char *_cudaGetErrorEnum(cublasStatus_t error) {
     switch (error) {
-    
+
     case rocblas_status_success:
         return "rocblas_status_success";
-    
+
     case rocblas_status_invalid_handle:
         return "rocblas_status_invalid_handle";
-    
+
     case rocblas_status_not_implemented:
         return "rocblas_status_not_implemented";
 
     case rocblas_status_invalid_pointer:
         return "rocblas_status_invalid_pointer:";
-    
+
     case rocblas_status_invalid_size:
         return "rocblas_status_invalid_size";
-    
+
     case rocblas_status_memory_error:
         return "rocblas_status_memory_error";
-    
+
     case rocblas_status_internal_error:
         return "rocblas_status_internal_error";
-    
+
     case rocblas_status_perf_degraded:
         return "rocblas_status_perf_degraded";
 
@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 
     case rocblas_status_size_increased:
         return "rocblas_status_size_increased";
-    
+
     case rocblas_status_size_unchanged:
         return "rocblas_status_size_unchanged";
-    
+
     case rocblas_status_invalid_value:
         return "rocblas_status_invalid_value";
-    
+
     case rocblas_status_continue:
         return "rocblas_status_continue";
     }
@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
 #endif // HELPER_CUDA_H
+
diff --git a/doc/release-note.md b/doc/release-note.md
index 178f5447..a8620833 100644
--- a/doc/release-note.md
+++ b/doc/release-note.md
@@ -1,10 +1,37 @@
+## v0.3.0
+
+### FMoE core
+
+* Previous `mp_group` is renamed to `slice_gruop`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_gruop` will be deprecated in our next release.
+* ROCm supported.
+* `FMoELinear` is moved to a stand-alone file.
+
+### Grouped data parallel
+
+* Support any group name by its relative tag name.
+
+### Load balancing
+
+* A brand-new balancing strategy, SWIPE, contributed by the authors of a (currently unpublished) paper.
+* A property `has_loss` is added to each gate to indicate whether the balance loss should be collected.
+
+### Megatron-LM support
+
+* Experts are partitioned by tensor model parallelism in `mp_group`, instead of by expert parallelism.
+* Support arbitrary customized gates in `MegatronMLP`.
+* Move the patches to a stand-alone file.
+
+### Tests
+
+* Move util functions into `test_ddp.py`.
+
 ## v0.2.1
 
 ## Load balancing
 
 * Fix gradient for balance loss.
 
-## Misc
+### Misc
 
 * Typos.
 * Update benchmark interface.
@@ -12,7 +39,7 @@
 * Enable `USE_NCCL` by default.
 * Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.
 
-## Megatron adaption
+### Megatron adaption
 
 * Patch for numerical correctness of gradient clipping.
 * Support to pipeline parallelism.
diff --git a/fmoe/layers.py b/fmoe/layers.py
index 8fb23158..6f124ae5 100644
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -72,7 +72,7 @@ class FMoE(nn.Module):
     group hold the same copy of input feature, and requires the same copy
     of the output. For each worker, FMoE only computes the output of a
     certain slice of the input batch, and will all-gather the outputs after
-    computation. 
+    computation.
     * `top_k` stands for the number of experts each token is going to.
     * `gate` is a gate class which can found in `fmoe.gates`.
     * `expert` can be specified as a module class, it is used to generate
diff --git a/fmoe/megatron/patch.py b/fmoe/megatron/patch.py
index f54af8d7..4a5cce21 100644
--- a/fmoe/megatron/patch.py
+++ b/fmoe/megatron/patch.py
@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
     def fmoefied_model_provider():
         from .layers import fmoefy
         args = get_args()
-        hhs = args.hidden_size * 4 
+        hhs = args.hidden_size * 4
         assert hhs % args.top_k == 0
-        hhs = hhs // args.top_k 
+        hhs = hhs // args.top_k
         assert hhs % args.tensor_model_parallel_size == 0
         hhs = hhs // args.tensor_model_parallel_size
         return fmoefy(

From a461be6c6a4d1385e11c7daae6d9dfbd182eef8c Mon Sep 17 00:00:00 2001
From: Rick Ho
Date: Mon, 8 Nov 2021 17:30:56 +0800
Subject: [PATCH 2/2] fix typo

---
 doc/release-note.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/release-note.md b/doc/release-note.md
index a8620833..ba016b3a 100644
--- a/doc/release-note.md
+++ b/doc/release-note.md
@@ -2,7 +2,7 @@
 
 ### FMoE core
 
-* Previous `mp_group` is renamed to `slice_gruop`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_gruop` will be deprecated in our next release.
+* Previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_group` will be deprecated in our next release.
 * ROCm supported.
 * `FMoELinear` is moved to a stand-alone file.
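
The `slice_group` semantics described in the release note (every worker in the group receives the same input batch, computes only a slice of it, and all-gathers the outputs) can be sketched in plain `torch.distributed` terms as below. This is a minimal illustrative sketch, not FastMoE's actual code path: the helper name `forward_slice_and_gather` is hypothetical, the batch is assumed to divide evenly across the group, and the real `FMoE` layer performs this internally with gradient-aware communication.

```python
import torch
import torch.distributed as dist


def forward_slice_and_gather(x, expert_fn, group=None):
    """Hypothetical sketch of slice_group behavior: every rank holds the
    same batch `x`, computes its own slice, then all-gathers the results
    so every rank ends with the full output batch."""
    world = dist.get_world_size(group)
    rank = dist.get_rank(group)
    assert x.size(0) % world == 0, "batch must split evenly across the group"
    # pick this rank's slice of the (replicated) input batch
    local_out = expert_fn(x.chunk(world, dim=0)[rank])
    # all-gather so every rank receives the same copy of the output
    outs = [torch.empty_like(local_out) for _ in range(world)]
    dist.all_gather(outs, local_out, group=group)
    return torch.cat(outs, dim=0)
```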
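Similarly, the `fmoe/megatron/patch.py` hunk above sizes each expert's hidden layer by shrinking Megatron's 4x FFN width, first by `top_k` and then by the tensor-model-parallel degree. A minimal sketch of that arithmetic follows; the helper name `per_expert_hidden_size` and the example numbers are chosen for illustration only.

```python
def per_expert_hidden_size(hidden_size, top_k, tensor_model_parallel_size):
    # Mirrors fmoefied_model_provider above: Megatron FFNs are 4x the model
    # hidden size, split across the top_k experts each token visits, then
    # across the tensor-model-parallel ranks that share each expert.
    hhs = hidden_size * 4
    assert hhs % top_k == 0
    hhs //= top_k
    assert hhs % tensor_model_parallel_size == 0
    return hhs // tensor_model_parallel_size


# e.g. hidden_size=1024, top_k=2, tensor_model_parallel_size=2 -> 1024
assert per_expert_hidden_size(1024, 2, 2) == 1024
```

Splitting in this order means each assertion in the patch fails fast, pointing at whichever divisor (`top_k` or the tensor-parallel size) makes the configuration indivisible.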