From 486dc65613088996a478f80e01feac89f92cc740 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Fri, 10 May 2024 18:04:37 +0800
Subject: [PATCH] feat(kernel): support elemwise bias in Arm64 Fp32 NCHW44
 Conv1x1

GitOrigin-RevId: 419e87f87bfc4ee60af99665bc022f0bffe6df55
---
 .../ConvKernel/Fp32/Fp32Conv1x1Mk4M8N12.cpp   |  41 ++++-
 .../Arm/Arm64/ConvKernel/Fp32/Fp32Im2col.cpp  |   4 +
 .../InternalKernel/Fp32M8N12K4Matmul.cpp      | 147 +++++++++++++++---
 .../Arm64/MatMulKernel/Fp32MatMulM8N12K4.cpp  |   2 +-
 compiler/lib/KernelGen/Common/ConvKernel.h    |  15 ++
 compiler/test/kernel/opr/arm/conv.cpp         |   5 +
 script/build_and_test_not_standard_os.sh      |   2 +-
 7 files changed, 182 insertions(+), 34 deletions(-)

diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Conv1x1Mk4M8N12.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Conv1x1Mk4M8N12.cpp
index ccae049c..be75e71e 100644
--- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Conv1x1Mk4M8N12.cpp
+++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Conv1x1Mk4M8N12.cpp
@@ -35,7 +35,8 @@ bool Conv1x1FloatMk4::IsAvailable(TContext* ctx) const {
             ctx->getAttrOprand("operand:2").dtype == "f32";
     bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 &&
                      ctx->getAttrOprand("operand:0").shape[4] == 4;
-    bool bias_ok = !is_bias(ctx) || is_channel_broadcast_bias(ctx);
+    bool bias_ok =
+            !is_bias(ctx) || is_channel_broadcast_bias(ctx) || is_elemwise_bias(ctx);
     return param_value_ok && param_mode_ok && type_ok && noline_ok && layout_ok &&
            bias_ok;
 }
@@ -43,7 +44,12 @@ bool Conv1x1FloatMk4::IsAvailable(TContext* ctx) const {
 std::string Conv1x1FloatMk4::GetKernelSymbol(TContext* ctx) const {
     std::stringstream extra_ss;
     if (is_bias(ctx)) {
-        extra_ss << "_bias";
+        if (is_channel_broadcast_bias(ctx))
+            extra_ss << "_channel_broadcast_bias";
+        else if (is_elemwise_bias(ctx))
+            extra_ss << "_elemwise_bias";
+        else
+            CC_ABORT << "only support channel broadcast and elemwise bias mode.";
     }
     if (ctx->haveAttr("nonlineMode") && ctx->getAttrStr("nonlineMode") != "IDENTITY") {
         extra_ss << "_" << ctx->getAttrStr("nonlineMode");
@@ -163,7 +169,13 @@ std::shared_ptr Conv1x1FloatMk4::GetInnerCtx(TContext* ctx) const {
     if (ctx->haveAttr("nonlineMode")) {
         inner_ctx->setAttr("nonlineMode", CCAttr(ctx->getAttrStr("nonlineMode")));
     }
-    inner_ctx->setAttr("with_bias", ConvImpl::is_bias(ctx));
+    std::string bias_mode = "NO_BIAS";
+    if (is_channel_broadcast_bias(ctx)) {
+        bias_mode = "CHANNEL_BROADCAST_BIAS";
+    } else if (is_elemwise_bias(ctx)) {
+        bias_mode = "ELEMWISE_BIAS";
+    }
+    inner_ctx->setAttr("bias_mode", bias_mode);
     inner_ctx->setAttr("transposeA", false);
     inner_ctx->setAttr("transposeB", false);
     inner_ctx->setAttr("format", "MK4");
@@ -177,9 +189,22 @@ std::string Conv1x1FloatMk4::GetKernelBody(TContext* ctx) const {
     writer << m_inner_gemm.GetNakedKernelSignature(inner_ctx.get()) << ";\n";
     writer << m_inner_gemm.GetPackBSignature(inner_ctx.get()) << ";\n";
     writer << GenCommonRet() << " " << GetKernelSignature(ctx);
-    std::string bias_ptr_str = is_bias(ctx) ? "inputs[2]->ptr;" : "0;";
+    std::string elemwise_bias_init, channel_broadcast_bias_init;
+    if (is_channel_broadcast_bias(ctx)) {
+        channel_broadcast_bias_init = "bias_data = inputs[2]->ptr;";
+    } else if (is_elemwise_bias(ctx)) {
+        elemwise_bias_init = "bias_data = inputs[2]->ptr;";
+    }
+    std::string bias_offset = "0";
+    if (is_channel_broadcast_bias(ctx)) {
+        bias_offset = "ocpg";
+    } else if (is_elemwise_bias(ctx)) {
+        bias_offset = "ocpg * out_h * out_w";
+    }
     writer << StringTemplate::StringTemplateArgs()
-                      .add("bias_ptr_str", bias_ptr_str)
+                      .add("elemwise_bias_init", elemwise_bias_init)
+                      .add("channel_broadcast_bias_init", channel_broadcast_bias_init)
+                      .add("bias_offset", bias_offset)
                       .add("packB", m_inner_gemm.GetPackBSymbol(inner_ctx.get()))
                       .add("kern", m_inner_gemm.GetNakedKernelSymbol(inner_ctx.get()))
                       .render(R"({
@@ -215,16 +240,18 @@ std::string Conv1x1FloatMk4::GetKernelBody(TContext* ctx) const {
     const int LDB = in_h * in_w * PACK_C_SIZE;
     void* workspace_ptr = workspace->ptr;
 
+    const float* bias_data = NULL;
+    ${elemwise_bias_init}
     for (int n_idx = 0; n_idx < in_n; ++n_idx) {
         float* weight_data = inputs[1]->ptr;
-        float* bias_data = ${bias_ptr_str};
+        ${channel_broadcast_bias_init}
         for(int group_idx = 0; group_idx < group; ++group_idx){
             ${packB}(workspace_ptr, input_data, LDB, 0, in_h * in_w, 0, icpg);
             ${kern}(weight_data, workspace_ptr, output_data, LDC, ocpg, N, icpg, bias_data);
             input_data += icpg * in_h * in_w;
             output_data += ocpg * out_h * out_w;
             weight_data += weight_layout.stride[0];
-            bias_data += ocpg;
+            bias_data += ${bias_offset};
         }
     }
     return TinyNN_SUCCESS;
diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Im2col.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Im2col.cpp
index 2bd0a055..f053db11 100644
--- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Im2col.cpp
+++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Fp32/Fp32Im2col.cpp
@@ -261,6 +261,10 @@ std::shared_ptr ConvIm2colFloat::GetInnerCtx(TContext* ctx) const {
         inner_ctx->setAttr("nonlineMode", CCAttr(ctx->getAttrStr("nonlineMode")));
     }
     inner_ctx->setAttr("with_bias", ConvImpl::is_bias(ctx));
+    inner_ctx->setAttr(
+            "bias_mode", ConvImpl::is_channel_broadcast_bias(ctx)
+                                 ? "CHANNEL_BROADCAST_BIAS"
+                                 : "NO_BIAS");
     inner_ctx->setAttr("transposeA", false);
     inner_ctx->setAttr("transposeB", false);
     inner_ctx->setAttr("dtype", "f32");
diff --git a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Fp32M8N12K4Matmul.cpp b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Fp32M8N12K4Matmul.cpp
index d9acc974..26f7b9c1 100644
--- a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Fp32M8N12K4Matmul.cpp
+++ b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Fp32M8N12K4Matmul.cpp
@@ -76,7 +76,6 @@ static std::string kern_8x12(TContext* ctx) {
     auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode")
                                                      : "IDENTITY";
     auto activation_gen = create_activation_gener(nonline_mode);
-    bool with_bias = ctx->getAttrBool("with_bias");
     std::stringstream writer;
 
     //! kern_8x12
@@ -115,15 +114,17 @@
     const float* b_ptr = packB;
     float* output0 = output;
     float* output1 = output0 + LDC;
+    const float* bias_ptr0 = bias_ptr;
+    const float* bias_ptr1 = bias_ptr0 + LDC;
 
     int oddk = (K & 1);
     K = ((K + 1) / 2) - 1;
 
    asm volatile()";
     //! if convolution with bias
-    if (with_bias) {
+    if (ctx->getAttrStr("bias_mode") == "CHANNEL_BROADCAST_BIAS") {
         writer << R"(
-            "ld1 {v6.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v6.4s}, [%[bias_ptr0]], #16\n"
             "mov v8.16b, v6.16b \n"
             "mov v9.16b, v6.16b \n"
             "mov v10.16b, v6.16b \n"
@@ -136,7 +137,7 @@ static std::string kern_8x12(TContext* ctx) {
             "mov v15.16b, v6.16b \n"
             "ld1 {v2.4s}, [%[b_ptr]], #16\n"
             "mov v16.16b, v6.16b \n"
-            "ld1 {v7.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v7.4s}, [%[bias_ptr0]], #16\n"
             "mov v17.16b, v6.16b \n"
             "mov v18.16b, v6.16b \n"
             "mov v19.16b, v6.16b \n"
@@ -156,7 +157,38 @@ static std::string kern_8x12(TContext* ctx) {
             "mov v30.16b, v7.16b \n"
             "mov v31.16b, v7.16b \n")";
 
-        //! if convolution without bias
+    } else if (ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS") {
+        writer << R"(
+            "ld1 {v8.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v9.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v10.4s}, [%[bias_ptr0]], #16\n"
+            "prfm pstl1keep, [%[output0]]\n"
+            "ld1 {v11.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v12.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v13.4s}, [%[bias_ptr0]], #16\n"
+            "prfm pstl1keep, [%[output1]]\n"
+            "ld1 {v14.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v15.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v2.4s}, [%[b_ptr]], #16\n"
+            "ld1 {v16.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v17.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v18.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v19.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v20.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v3.4s}, [%[b_ptr]], #16\n"
+            "ld1 {v21.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v22.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v23.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v4.4s}, [%[b_ptr]], #16\n"
+            "ld1 {v24.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v25.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v26.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v27.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v28.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v0.4s}, [%[a_ptr]], #16\n"
+            "ld1 {v29.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v30.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v31.4s}, [%[bias_ptr1]], #16\n")";
     } else {
         writer << R"(
             "eor v8.16b, v8.16b, v8.16b \n"
@@ -438,7 +470,7 @@ static std::string kern_8x12(TContext* ctx) {
             "6:\n"
 
             : [ a_ptr ] "+r"(a_ptr), [ b_ptr ] "+r"(b_ptr), [ K ] "+r"(K),
-              [ bias_ptr ] "+r"(bias_ptr), [ oddk ] "+r"(oddk),
+              [ bias_ptr0 ] "+r"(bias_ptr0), [ bias_ptr1 ] "+r"(bias_ptr1), [ oddk ] "+r"(oddk),
               [ output0 ] "+r"(output0), [ output1 ] "+r"(output1)
             :
             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
@@ -459,7 +491,6 @@ static std::string kern_4x12(TContext* ctx) {
     auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode")
                                                      : "IDENTITY";
     auto activation_gen = create_activation_gener(nonline_mode);
-    bool with_bias = ctx->getAttrBool("with_bias");
     std::stringstream writer;
 
     writer << R"(static inline void kern_4x12_bias_relu(const float* packA, const float* packB, int K, float* output, int LDC, const float* bias_ptr) {
@@ -473,7 +504,7 @@ static std::string kern_4x12(TContext* ctx) {
 
    asm volatile()";
     //! if convolution with bias
-    if (with_bias) {
+    if (ctx->getAttrStr("bias_mode") == "CHANNEL_BROADCAST_BIAS") {
         writer << R"(
             "ld1 {v6.4s}, [%[bias_ptr]], #16\n"
             "mov v8.16b, v6.16b \n"
@@ -492,6 +523,24 @@ static std::string kern_4x12(TContext* ctx) {
             "mov v18.16b, v6.16b \n"
             "mov v19.16b, v6.16b \n"
             )";
+    } else if (ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS") {
+        writer << R"(
+            "ld1 {v8.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v9.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v10.4s}, [%[bias_ptr]], #16\n"
+            "prfm pstl1keep, [%[output0]]\n"
+            "ld1 {v11.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v12.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v13.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48\n"
+            "ld1 {v14.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v15.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v0.4s}, [%[a_ptr]], #16\n"
+            "ld1 {v18.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v19.4s}, [%[bias_ptr]], #16\n"
+            )";
     } else {
         //! if convolution without bias
         writer << R"(
@@ -639,7 +688,6 @@ static std::string kern_8x4(TContext* ctx) {
     auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode")
                                                      : "IDENTITY";
     auto activation_gen = create_activation_gener(nonline_mode);
-    bool with_bias = ctx->getAttrBool("with_bias");
     std::stringstream writer;
 
     //! kern_8x4
@@ -677,6 +725,8 @@ static std::string kern_8x4(TContext* ctx) {
     const float* b_ptr = packB;
     float* output0 = output;
     float* output1 = output0 + LDC;
+    const float* bias_ptr0 = bias_ptr;
+    const float* bias_ptr1 = bias_ptr0 + LDC;
 
     int oddk = (K & 1);
     K = ((K + 1) / 2) - 1;
@@ -707,10 +757,10 @@ static std::string kern_8x4(TContext* ctx) {
 
     //clang-format on
    asm volatile()";
-    if (with_bias) {
+    if (ctx->getAttrStr("bias_mode") == "CHANNEL_BROADCAST_BIAS") {
         writer << R"(
-            "ld1 {v30.4s}, [%[bias_ptr]], #16\n"
-            "ld1 {v31.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v30.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v31.4s}, [%[bias_ptr0]], #16\n"
             "mov v8.16b, v30.16b \n"
             "mov v9.16b, v30.16b \n"
             "ld1 {v0.4s}, [%[a_ptr]], #16\n"
@@ -723,6 +773,20 @@ static std::string kern_8x4(TContext* ctx) {
             "mov v14.16b, v31.16b \n"
             "ld1 {v2.4s}, [%[b_ptr]], #16\n"
             "mov v15.16b, v31.16b \n")";
+    } else if (ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS") {
+        writer << R"(
+            "ld1 {v8.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v9.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v0.4s}, [%[a_ptr]], #16\n"
+            "ld1 {v10.4s}, [%[bias_ptr0]], #16\n"
+            "prfm pstl1keep, [%[output0]]\n"
+            "ld1 {v11.4s}, [%[bias_ptr0]], #16\n"
+            "ld1 {v12.4s}, [%[bias_ptr1]], #16\n"
+            "prfm pstl1keep, [%[output1]]\n"
+            "ld1 {v13.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v14.4s}, [%[bias_ptr1]], #16\n"
+            "ld1 {v2.4s}, [%[b_ptr]], #16\n"
+            "ld1 {v15.4s}, [%[bias_ptr1]], #16\n")";
     } else {
         writer << R"(
             "eor v8.16b, v8.16b, v8.16b \n"
@@ -818,7 +882,7 @@ static std::string kern_8x4(TContext* ctx) {
             STORE_C
 
             : [ a_ptr ] "+r"(a_ptr), [ b_ptr ] "+r"(b_ptr), [ K ] "+r"(K),
-              [ bias_ptr ] "+r"(bias_ptr), [ oddk ] "+r"(oddk),
+              [ bias_ptr0 ] "+r"(bias_ptr0), [ bias_ptr1 ] "+r"(bias_ptr1), [ oddk ] "+r"(oddk),
               [ output0 ] "+r"(output0), [ output1 ] "+r"(output1),
               [ n_remain ] "+r"(n_remain)
             :
@@ -839,7 +903,6 @@ static std::string kern_4x4(TContext* ctx) {
     auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode")
                                                      : "IDENTITY";
     auto activation_gen = create_activation_gener(nonline_mode);
-    bool with_bias = ctx->getAttrBool("with_bias");
     std::stringstream writer;
     //! kern_4x4
     writer << R"(
@@ -898,7 +961,7 @@ static std::string kern_4x4(TContext* ctx) {
     //clang-format on
    asm volatile(
    )";
-    if (with_bias) {
+    if (ctx->getAttrStr("bias_mode") == "CHANNEL_BROADCAST_BIAS") {
         writer << R"(
             // load accumulator C
             "ld1 {v30.4s}, [%[bias_ptr]], #16\n"
@@ -908,6 +971,15 @@ static std::string kern_4x4(TContext* ctx) {
             "ld1 {v0.4s}, [%[a_ptr]], #16\n"
             "mov v10.16b, v30.16b \n"
             "mov v11.16b, v30.16b \n")";
+    } else if (ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS") {
+        writer << R"(
+            // load accumulator C
+            "ld1 {v8.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v2.4s}, [%[b_ptr]], #16\n"
+            "ld1 {v9.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v0.4s}, [%[a_ptr]], #16\n"
+            "ld1 {v10.4s}, [%[bias_ptr]], #16\n"
+            "ld1 {v11.4s}, [%[bias_ptr]], #16\n")";
     } else {
         writer << R"(
             "eor v8.16b, v8.16b, v8.16b \n"
@@ -1107,6 +1179,21 @@ std::string gen_kernel(
         const std::string& sig, TContext* ctx, const std::string& postprocess_call,
         const std::string& preset_str = "") {
     auto post_process_strs = gen_postprocess_inline(ctx);
+    std::string channel_broadcast_bias_init =
+            ctx->getAttrStr("bias_mode") == "CHANNEL_BROADCAST_BIAS"
+                    ? "_bias_ptr = bias_ptr + m;"
+                    : "";
+    std::string elemwise_bias_init = ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS"
+            ? "_bias_ptr = bias_ptr + m * N;"
+            : "";
+    std::string elemwise_bias_update_n12 =
+            ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS"
+                    ? "_bias_ptr += n_block * pack_mk;"
+                    : "";
+    std::string elemwise_bias_update_n4 =
+            ctx->getAttrStr("bias_mode") == "ELEMWISE_BIAS"
+                    ? "_bias_ptr += 4 * pack_mk;"
+                    : "";
 
     std::string keren_body = R"(
     ${kernel_sig}{
@@ -1118,46 +1205,54 @@ std::string gen_kernel(
         const int K12 = K * 12;
         const int K8 = K * 8;
         const int K4 = K * 4;
-        size_t m = 0;
+        size_t m = 0;
+        const float* _bias_ptr = NULL;
 
         for (; m + m_block <= M; m += m_block) {
             float* output = C + (m / pack_mk * LDC);
+            ${channel_broadcast_bias_init}
+            ${elemwise_bias_init}
            size_t n = 0;
             const float* cur_pack_b = pack_b;
             for (; n + n_block <= N; n += n_block) {
                 kern_8x12_bias_relu(pack_a, cur_pack_b, K, output, LDC,
-                                    bias_ptr);
+                                    _bias_ptr);
                 output += n_block * pack_mk;
+                ${elemwise_bias_update_n12};
                 cur_pack_b += K12;
             }
             for (; n < N; n += 4) {
                 kern_8x4_bias_relu(pack_a, cur_pack_b, K, output, LDC,
-                                   bias_ptr, N - n > 4 ? 4 : N - n);
+                                   _bias_ptr, N - n > 4 ? 4 : N - n);
                 output += 4 * pack_mk;
+                ${elemwise_bias_update_n4};
                 cur_pack_b += K4;
             }
             pack_a += K8;
-            bias_ptr += m_block;
         }
 
         for (; m < M; m += m_block_4) {
             float* output = C + (m / pack_mk * LDC);
+            ${channel_broadcast_bias_init}
+            ${elemwise_bias_init}
+            size_t n = 0;
             const float* cur_pack_b = pack_b;
             for (; n + n_block - 1 < N; n += n_block) {
                 kern_4x12_bias_relu(pack_a, cur_pack_b, K, output, LDC,
-                                    bias_ptr);
+                                    _bias_ptr);
                 output += n_block * pack_mk;
+                ${elemwise_bias_update_n12};
                 cur_pack_b += K12;
             }
             for (; n < N; n += 4) {
                 kern_4x4_bias_relu(pack_a, cur_pack_b, K, output, LDC,
-                                   bias_ptr, N - n > 4 ? 4 : N - n);
+                                   _bias_ptr, N - n > 4 ? 4 : N - n);
                 output += 4 * pack_mk;
+                ${elemwise_bias_update_n4};
                 cur_pack_b += K4;
             }
             pack_a += K4;
-            bias_ptr += m_block_4;
         }
 
         ${postprocess_call}
     }
@@ -1166,6 +1261,10 @@ std::string gen_kernel(
                    .add("postprocess_call", postprocess_call)
                    .add("preset_str", preset_str)
                    .add("kernel_sig", sig)
+                   .add("channel_broadcast_bias_init", channel_broadcast_bias_init)
+                   .add("elemwise_bias_init", elemwise_bias_init)
+                   .add("elemwise_bias_update_n12", elemwise_bias_update_n12)
+                   .add("elemwise_bias_update_n4", elemwise_bias_update_n4)
                    .render(keren_body);
 }
 
@@ -1174,9 +1273,7 @@ std::string gen_kernel(
 std::string MatmulM8N12MK4Kernel::GetKernelSymbol(TContext* ctx) const {
     std::stringstream ss;
     ss << "Arm64_fp32_m8_n12_mk4_matmul";
-    if (ctx->getAttrBool("with_bias")) {
-        ss << "_bias";
-    }
+    ss << "_" << ctx->getAttrStr("bias_mode");
     if (ctx->haveAttr("nonlineMode") && ctx->getAttrStr("nonlineMode") != "IDENTITY") {
         ss << "_" << ctx->getAttrStr("nonlineMode");
     }
diff --git a/compiler/lib/KernelGen/Arm/Arm64/MatMulKernel/Fp32MatMulM8N12K4.cpp b/compiler/lib/KernelGen/Arm/Arm64/MatMulKernel/Fp32MatMulM8N12K4.cpp
index ae44eb4c..6f419ae1 100644
--- a/compiler/lib/KernelGen/Arm/Arm64/MatMulKernel/Fp32MatMulM8N12K4.cpp
+++ b/compiler/lib/KernelGen/Arm/Arm64/MatMulKernel/Fp32MatMulM8N12K4.cpp
@@ -10,7 +10,7 @@ using namespace Arm64;
 std::shared_ptr Fp32MatMulM8N12K4::GetInnerCtx(TContext* ctx) const {
     auto inner_ctx = std::make_shared();
     inner_ctx->setAttr("format", "MK4");
-    inner_ctx->setAttr("with_bias", false);
+    inner_ctx->setAttr("bias_mode", "NO_BIAS");
     inner_ctx->setAttr("transposeA", false);
     inner_ctx->setAttr("transposeB", false);
     inner_ctx->setAttr("dtype", "f32");
diff --git a/compiler/lib/KernelGen/Common/ConvKernel.h b/compiler/lib/KernelGen/Common/ConvKernel.h
index 43420c64..ce514b2e 100644
--- a/compiler/lib/KernelGen/Common/ConvKernel.h
+++ b/compiler/lib/KernelGen/Common/ConvKernel.h
@@ -19,6 +19,21 @@ class ConvImpl : public KernelFunc {
         }
         return false;
     }
+    static bool is_elemwise_bias(TContext* ctx) {
+        if (is_bias(ctx)) {
+            CCOperand bias = ctx->getAttrOprand("operand:2");
+            CCOperand dst = Utils::get_last_operand(ctx);
+            if (bias.shape.size() != dst.shape.size())
+                return false;
+            size_t len = bias.shape.size();
+            for (size_t i = 0; i < len; ++i) {
+                if (bias.shape[i] != dst.shape[i])
+                    return false;
+            }
+            return true;
+        }
+        return false;
+    }
     static bool is_no_pad(TContext* ctx) {
         auto pad_h = ctx->getAttrInt("pad_h");
         auto pad_w = ctx->getAttrInt("pad_w");
diff --git a/compiler/test/kernel/opr/arm/conv.cpp b/compiler/test/kernel/opr/arm/conv.cpp
index a1fb6214..42095854 100644
--- a/compiler/test/kernel/opr/arm/conv.cpp
+++ b/compiler/test/kernel/opr/arm/conv.cpp
@@ -333,7 +333,10 @@ TEST(AARCH64, ConvBias1x1NCHW44) {
              ConvBiasForward::Param::NonlineMode::H_SWISH}) {
         param.nonlineMode = noline;
         checker.set_param(param);
+        checker.execs({{1, 3, 1, 1, 4}, {5, 3, 1, 1, 4, 4}, {1, 5, 1, 1, 4}, {}, {}});
         checker.execs({{2, 3, 5, 11, 4}, {5, 3, 1, 1, 4, 4}, {1, 5, 1, 1, 4}, {}, {}});
+        checker.execs({{2, 3, 5, 11, 4}, {5, 3, 1, 1, 4, 4}, {2, 5, 5, 11, 4}, {}, {}});
+        checker.execs({{2, 3, 5, 11, 4}, {5, 3, 1, 1, 4, 4}, {}, {}, {}});
     }
 
     param.sparse = ConvolutionForward::Param::Sparse::GROUP;
@@ -345,6 +348,8 @@ TEST(AARCH64, ConvBias1x1NCHW44) {
         checker.set_param(param);
         checker.execs(
                 {{2, 6, 17, 19, 4}, {2, 4, 3, 1, 1, 4, 4}, {1, 8, 1, 1, 4}, {}, {}});
+        checker.execs(
+                {{2, 6, 17, 19, 4}, {2, 4, 3, 1, 1, 4, 4}, {2, 8, 17, 19, 4}, {}, {}});
     }
 }
 
diff --git a/script/build_and_test_not_standard_os.sh b/script/build_and_test_not_standard_os.sh
index b9f3506b..135a98d6 100755
--- a/script/build_and_test_not_standard_os.sh
+++ b/script/build_and_test_not_standard_os.sh
@@ -28,7 +28,7 @@ cmake --build "$MEGCC_BUILD_DIR" -j$(nproc) --target mgb-to-tinynn --target mgb-
 function check_key_words() {
     #elf self mangle words, we do not care!!
-    white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE MGE< 8
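
Note on the bias modes introduced by this patch (illustrative sketch, not part of the commit): the Conv1x1 NCHW44 kernel now accepts either a channel-broadcast bias of shape {1, OC/4, 1, 1, 4} or an elemwise bias whose shape equals the output, as exercised by the new checker.execs cases. The snippet below restates that shape rule in isolation; classify_bias and the plain std::vector shapes are hypothetical stand-ins for the TContext/CCOperand plumbing used by ConvImpl::is_channel_broadcast_bias and the new ConvImpl::is_elemwise_bias.

    // Illustrative sketch only (assumed helper names, not MegCC API).
    #include <cstddef>
    #include <vector>

    enum class BiasMode { NO_BIAS, CHANNEL_BROADCAST_BIAS, ELEMWISE_BIAS };

    // Classify a bias operand by comparing its NCHW44 shape with the output shape.
    BiasMode classify_bias(
            const std::vector<std::size_t>& bias, const std::vector<std::size_t>& dst) {
        if (bias.empty())
            return BiasMode::NO_BIAS;
        // Elemwise: every dimension equals the output's, one bias value per element,
        // e.g. dst = bias = {2, 5, 5, 11, 4} in the new test case above.
        if (bias == dst)
            return BiasMode::ELEMWISE_BIAS;
        // Channel broadcast in NCHW44: {1, OC/4, 1, 1, 4}, one value per output channel.
        if (bias.size() == 5 && dst.size() == 5 && bias[0] == 1 && bias[1] == dst[1] &&
            bias[2] == 1 && bias[3] == 1 && bias[4] == 4)
            return BiasMode::CHANNEL_BROADCAST_BIAS;
        // Any other bias layout is rejected by Conv1x1FloatMk4::IsAvailable.
        return BiasMode::NO_BIAS;
    }

The same distinction drives the generated code: the conv body steps bias_data by ocpg per group for the broadcast case but by ocpg * out_h * out_w for the elemwise case (the ${bias_offset} substitution), and the matmul micro-kernels gain a second row pointer (bias_ptr1 = bias_ptr0 + LDC) so that each output row of the 8x12 and 8x4 blocks loads its own row of per-element bias values instead of a single broadcast vector.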