Feature/ntt benchmark roofline 3 (#1241)
* update for pack M&N x86

* revert for some performance fallback

* add -funroll-loops for gcc

* shutdown for ci

* revise bug for unroll

* add softmax benchmark

* add ctest for softmax

* Fix ctest failure for softmax.

* opt for x86 softmax

* revise benchmark for softmax

* Add rvv optimization of tanh with max_ulp_error = 2.

* add ulp version of tanh for x86

* remove useless header file

* Apply code-format changes

* Optimize mul_add for rvv (performance boost 15% ~ 32%)

* Remove ntt softmax and fix reduce conflict of x86_64.

* change roofline for reduce x86

* Optimize matmul for rvv and update roofline.

* Update reduce roofline for rvv.

* Update Max_reduceMN_PackN roofline.

* Add ratio for roofline / actual.

* update tanh Roofline

* Specialize max/min for float and update roofline for reduce no_pack.

* problem about x86 roofline

* [NTT] Add ukernel for matmul

* Apply code-format changes

* Fix build

* change some reality for x86

* fallback roofline

* change sequence for test

* add warmup for unary

* add primitive size auto test

* revise bug in daily test

* revise bug in daily test

* avoid bug for temp

* total fallback

* add info for primitive size

* remove tile info

* change the way

* Add tensor.squeeze

* remove Primitive info

* Apply code-format changes

* temp change test

* add table name

* merge two tables

* remove typo

* typo test

* change back for daily test

* test for table

* change back for test over

* change for primitive size

* Support odd matmul

* Fix build

* Apply code-format changes

* Optimize erf for rvv.

* Fix build

* Add markdown for ntt matmul.

* Add u_matmul policy for rvv

* add erf ulp version

* fix typo

* Refactor the ntt benchmark Python script to support both ntt and ntt_matmul.

* Apply code-format changes

* Add ntt.store, optimize u_matmul for RVV

* Fix pack MKN for RVV

* Fix macos build and show gflops with floating point.

* revise typo

* Force the compiler not to unroll k loops

* Use pragma unroll 1 instead of volatile

* set performance for cpu0

* Apply code-format changes

* temp fallback for ci test

---------

Co-authored-by: guodongliang <[email protected]>
Co-authored-by: zhangyang2057 <[email protected]>
Co-authored-by: uranus0515 <[email protected]>
Co-authored-by: sunnycase <[email protected]>
Co-authored-by: sunnycase <[email protected]>
Co-authored-by: zhangyang2057 <[email protected]>
7 people authored Oct 8, 2024
1 parent e6d95d2 commit 9f289a3
Showing 22 changed files with 2,871 additions and 638 deletions.
278 changes: 248 additions & 30 deletions src/Native/include/nncase/ntt/arch/riscv64/primitive_ops.h
@@ -14,6 +14,8 @@
*/
#pragma once
#include "../../primitive_ops.h"
#include "nncase/ntt/arch/riscv64/arch_types.h"
#include "nncase/ntt/vector.h"
#include "rvv_mathfun.h"

#ifdef __riscv_vector
@@ -29,6 +31,15 @@ namespace nncase::ntt::ops {
kernel(1, 32) kernel(2, 16) kernel(4, 8) kernel(8, 4)
#endif
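// Note on the (lmul, mlen) pairs above (inferred from the visible expansion,
// not stated in this diff): lmul is the RVV register-group multiplier, and
// mlen appears to be SEW/LMUL (32/lmul for float32), i.e. the N of the
// matching vboolN_t mask type.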

template <>
struct store<ntt::vector<float, NTT_VLEN / 32>,
ntt::vector<float, NTT_VLEN / 32>> {
void operator()(ntt::vector<float, NTT_VLEN / 32> &dest,
const ntt::vector<float, NTT_VLEN / 32> &v) const noexcept {
__riscv_vse32_v_f32m1((float *)&dest, v, NTT_VLEN / 32);
}
};
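// With this specialization, storing one native-width float vector lowers to
// a single unit-stride vse32.v rather than an element-wise loop. A minimal
// usage sketch (assuming the generic ntt::store(dest, v) dispatcher forwards
// to this functor):
//
//   ntt::vector<float, NTT_VLEN / 32> src = ...; // one VLEN-wide register
//   ntt::vector<float, NTT_VLEN / 32> dst;
//   ntt::store(dst, src);                        // one vse32.v, vl = VLEN/32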

#define RVV_UNARY_OP(op, dtype, vl, kernel) \
template <> struct op<ntt::vector<dtype, vl>> { \
ntt::vector<dtype, vl> \
@@ -610,6 +621,16 @@ REGISTER_RVV_UNARY_OP(square, float, square_float32)
REGISTER_RVV_KERNEL(TANH_FLOAT32)
REGISTER_RVV_UNARY_OP(tanh, float, tanh_float32)

// erf
#define ERF_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t erf_float32(const vfloat32m##lmul##_t &v, \
const size_t vl) { \
return erf_ps(v, vl); \
}

REGISTER_RVV_KERNEL(ERF_FLOAT32)
REGISTER_RVV_UNARY_OP(erf, float, erf_float32)
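// The commit messages quote ULP bounds for these kernels (tanh with
// max_ulp_error = 2, and an erf "ulp version"). As background, a minimal
// sketch of how such a bound is typically measured; this is an illustrative
// helper, not the project's actual test harness:
//
//   #include <cstdint>
//   #include <cstring>
//
//   inline int32_t ulp_ordered(float f) {
//       int32_t i;
//       std::memcpy(&i, &f, sizeof(i)); // bit-cast without aliasing UB
//       // Map sign-magnitude float bits onto a monotonic integer line so
//       // adjacent floats differ by 1 (+0.0f and -0.0f both map to 0).
//       return i < 0 ? INT32_MIN - i : i;
//   }
//
//   inline uint32_t ulp_distance(float a, float b) {
//       int64_t d = int64_t(ulp_ordered(a)) - int64_t(ulp_ordered(b));
//       return uint32_t(d < 0 ? -d : d); // expect ulp_distance(y, ref) <= 2
//   }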

// binary
#define RVV_BINARY_OP(op, dtype, vl, kernel) \
template <> struct op<ntt::vector<dtype, vl>, ntt::vector<dtype, vl>> { \
@@ -761,6 +782,16 @@ REGISTER_RVV_KERNEL(MOD_FLOAT32)
REGISTER_RVV_BINARY_OP(mod, float, mod_float32)

// min
template <> struct min<float, float> {
auto operator()(const float &s1, const float &s2) const noexcept {
float ret;
__asm("fmin.s %[ret], %[s1], %[s2];"
: [ret] "=f"(ret)
: [s1] "f"(s1), [s2] "f"(s2));
return ret;
}
};

#define MIN_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t min_float32(const vfloat32m##lmul##_t &v1, \
const vfloat32m##lmul##_t &v2, \
@@ -782,6 +813,16 @@ REGISTER_RVV_KERNEL(MIN_FLOAT32)
REGISTER_RVV_BINARY_OP(min, float, min_float32)

// max
template <> struct max<float, float> {
auto operator()(const float &s1, const float &s2) const noexcept {
float ret;
__asm("fmax.s %[ret], %[s1], %[s2];"
: [ret] "=f"(ret)
: [s1] "f"(s1), [s2] "f"(s2));
return ret;
}
};
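// Background on the two scalar specializations above (not stated in the
// diff): fmin.s/fmax.s are single branchless instructions that follow IEEE
// 754 minimumNumber/maximumNumber semantics in current RISC-V specs, i.e. a
// quiet NaN operand is ignored when the other operand is a number. A plain
// ternary is not equivalent, which is presumably why inline asm is used:
//
//   inline float min_ternary(float a, float b) { return a < b ? a : b; }
//   // min_ternary(1.0f, NAN) yields NaN, while fmin.s returns 1.0f.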

#define MAX_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t max_float32(const vfloat32m##lmul##_t &v1, \
const vfloat32m##lmul##_t &v2, \
@@ -969,6 +1010,7 @@ REGISTER_RVV_KERNEL(INNER_PRODUCT_FLOAT32)
REGISTER_RVV_INNER_PRODUCT_OP(float, inner_product_float32)

// register mul_add kernel
#if 0
#define MUL_ADD_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const vfloat32m##lmul##_t &v2, \
@@ -987,6 +1029,26 @@ REGISTER_RVV_INNER_PRODUCT_OP(float, inner_product_float32)
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmadd_vf_f32m##lmul(v2, s1, v3, vl); \
}
#else
#define MUL_ADD_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const vfloat32m##lmul##_t &v2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vv_f32m##lmul(v3, v1, v2, vl); \
} \
\
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const float &s2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vf_f32m##lmul(v3, s2, v1, vl); \
} \
\
inline vfloat32m##lmul##_t mul_add_float32( \
const float &s1, const vfloat32m##lmul##_t &v2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vf_f32m##lmul(v3, s1, v2, vl); \
}
#endif
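// Semantics behind the switch above, per the RVV intrinsics:
//   __riscv_vfmadd_vv(vd, vs1, vs2, vl) : vd = (vd  * vs1) + vs2
//   __riscv_vfmacc_vv(vd, vs1, vs2, vl) : vd = (vs1 * vs2) + vd
// Passing the accumulator v3 as the vfmacc destination keeps the running sum
// in one register across a dot-product loop, with no extra vector moves;
// presumably this is the "mul_add ... 15% ~ 32%" gain quoted in the commit
// messages.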

REGISTER_RVV_KERNEL(MUL_ADD_FLOAT32)

@@ -1029,7 +1091,6 @@ REGISTER_RVV_KERNEL(MUL_ADD_FLOAT32)

REGISTER_RVV_MUL_ADD_OP(float, mul_add_float32)

#if 1
template <bool AccC>
struct mma<AccC, ntt::vector<float, 1, 4>, ntt::vector<float, 4, 4>,
ntt::vector<float, 1, 4>> {
@@ -1038,11 +1099,67 @@ struct mma<AccC, ntt::vector<float, 1, 4>, ntt::vector<float, 4, 4>,
const ntt::vector<float, 4, 4> &rhs,
const ntt::vector<float, 1, 4> &v3) const noexcept {
auto output = v3;
- for (size_t k = 0; k < 4; k++) {
-     output(0) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(0, k), rhs(k), output(0))
-                     : ntt::mul(lhs(0, k), rhs(k));
- }
+ auto t0 = AccC ? ntt::mul_add(lhs(0, 0), rhs(0), output(0))
+                : ntt::mul(lhs(0, 0), rhs(0));
+ auto t1 = ntt::mul(lhs(0, 1), rhs(1));
+ t0 = ntt::mul_add(lhs(0, 2), rhs(2), t0);
+ t1 = ntt::mul_add(lhs(0, 3), rhs(3), t1);
+ output(0) = ntt::add(t0, t1);
return output;
}
};
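// The t0/t1 pair introduced above is a two-accumulator reduction: the single
// chain c = fma(a0, b0, c); c = fma(a1, b1, c); ... serializes on FMA
// latency, while two independent chains merged by one final add roughly
// halve the critical path. The removed k-loop kept every update on one chain.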

template <bool AccC>
struct mma<AccC, ntt::vector<float, 1, 32>, ntt::vector<float, 32, 32>,
ntt::vector<float, 1, 32>> {
ntt::vector<float, 1, 32>
operator()(const ntt::vector<float, 1, 32> &lhs,
const ntt::vector<float, 32, 32> &rhs,
const ntt::vector<float, 1, 32> &v3) const noexcept {
auto output = v3;

auto t0 = AccC ? ntt::mul_add(lhs(0, 0), rhs(0), output(0))
: ntt::mul(lhs(0, 0), rhs(0));
auto t1 = ntt::mul(lhs(0, 1), rhs(1));
t0 = ntt::mul_add(lhs(0, 2), rhs(2), t0);
t1 = ntt::mul_add(lhs(0, 3), rhs(3), t1);

t0 = ntt::mul_add(lhs(0, 4), rhs(4), t0);
t1 = ntt::mul_add(lhs(0, 5), rhs(5), t1);
t0 = ntt::mul_add(lhs(0, 6), rhs(6), t0);
t1 = ntt::mul_add(lhs(0, 7), rhs(7), t1);

t0 = ntt::mul_add(lhs(0, 8), rhs(8), t0);
t1 = ntt::mul_add(lhs(0, 9), rhs(9), t1);
t0 = ntt::mul_add(lhs(0, 10), rhs(10), t0);
t1 = ntt::mul_add(lhs(0, 11), rhs(11), t1);

t0 = ntt::mul_add(lhs(0, 12), rhs(12), t0);
t1 = ntt::mul_add(lhs(0, 13), rhs(13), t1);
t0 = ntt::mul_add(lhs(0, 14), rhs(14), t0);
t1 = ntt::mul_add(lhs(0, 15), rhs(15), t1);

t0 = ntt::mul_add(lhs(0, 16), rhs(16), t0);
t1 = ntt::mul_add(lhs(0, 17), rhs(17), t1);
t0 = ntt::mul_add(lhs(0, 18), rhs(18), t0);
t1 = ntt::mul_add(lhs(0, 19), rhs(19), t1);

t0 = ntt::mul_add(lhs(0, 20), rhs(20), t0);
t1 = ntt::mul_add(lhs(0, 21), rhs(21), t1);
t0 = ntt::mul_add(lhs(0, 22), rhs(22), t0);
t1 = ntt::mul_add(lhs(0, 23), rhs(23), t1);

t0 = ntt::mul_add(lhs(0, 24), rhs(24), t0);
t1 = ntt::mul_add(lhs(0, 25), rhs(25), t1);
t0 = ntt::mul_add(lhs(0, 26), rhs(26), t0);
t1 = ntt::mul_add(lhs(0, 27), rhs(27), t1);

t0 = ntt::mul_add(lhs(0, 28), rhs(28), t0);
t1 = ntt::mul_add(lhs(0, 29), rhs(29), t1);
t0 = ntt::mul_add(lhs(0, 30), rhs(30), t0);
t1 = ntt::mul_add(lhs(0, 31), rhs(31), t1);

output(0) = ntt::add(t0, t1);
return output;
}
};
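// Same two-chain accumulation as the (1, 4) case above, unrolled across
// K = 32: the interleaved t0/t1 updates keep two independent FMA chains in
// flight, then a single ntt::add merges them.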
@@ -1055,33 +1172,134 @@ struct mma<AccC, ntt::vector<float, 4, 4>, ntt::vector<float, 4, 4>,
const ntt::vector<float, 4, 4> &rhs,
const ntt::vector<float, 4, 4> &v3) const noexcept {
auto output = v3;
- for (size_t k = 0; k < 4; k++) {
-     output(0) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(0, k), rhs(k), output(0))
-                     : ntt::mul(lhs(0, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(1) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(1, k), rhs(k), output(1))
-                     : ntt::mul(lhs(1, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(2) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(2, k), rhs(k), output(2))
-                     : ntt::mul(lhs(2, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(3) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(3, k), rhs(k), output(3))
-                     : ntt::mul(lhs(3, k), rhs(k));
- }
+ ntt::fixed_tensor_alike_t<ntt::vector<float, 4>, 1, 4> lhs_2d[4]{
+     {{lhs(0)}},
+     {{lhs(1)}},
+     {{lhs(2)}},
+     {{lhs(3)}},
+ };
+ ntt::fixed_tensor_alike_t<ntt::vector<float, 4>, 1, 4> output_2d[4]{
+     {{v3(0)}},
+     {{v3(1)}},
+     {{v3(2)}},
+     {{v3(3)}},
+ };
+
+ output_2d[0] = ntt::mma<AccC>(lhs_2d[0], rhs, output_2d[0]);
+ output_2d[1] = ntt::mma<AccC>(lhs_2d[1], rhs, output_2d[1]);
+ output_2d[2] = ntt::mma<AccC>(lhs_2d[2], rhs, output_2d[2]);
+ output_2d[3] = ntt::mma<AccC>(lhs_2d[3], rhs, output_2d[3]);
+
+ output(0) = output_2d[0](0);
+ output(1) = output_2d[1](0);
+ output(2) = output_2d[2](0);
+ output(3) = output_2d[3](0);
+
return output;
}
};
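// The rewritten 4x4 body reuses the specialized (1, 4) x (4, 4) kernel above
// by splitting lhs and the accumulator into four independent 1x4 rows,
// effectively C.row(i) = mma<AccC>(A.row(i), B, C.row(i)), so each row's
// dependency chain is scheduled separately. The 32x32 specialization below
// applies the same row-wise decomposition, fully unrolled.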

template <bool AccC>
struct mma<AccC, ntt::vector<float, 32, 32>, ntt::vector<float, 32, 32>,
ntt::vector<float, 32, 32>> {
ntt::vector<float, 32, 32>
operator()(const ntt::vector<float, 32, 32> &lhs,
const ntt::vector<float, 32, 32> &rhs,
const ntt::vector<float, 32, 32> &v3) const noexcept {
auto output = v3;
ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> lhs_2d[]{
{{lhs(0)}}, {{lhs(1)}}, {{lhs(2)}}, {{lhs(3)}}, {{lhs(4)}},
{{lhs(5)}}, {{lhs(6)}}, {{lhs(7)}}, {{lhs(8)}}, {{lhs(9)}},
{{lhs(10)}}, {{lhs(11)}}, {{lhs(12)}}, {{lhs(13)}}, {{lhs(14)}},
{{lhs(15)}}, {{lhs(16)}}, {{lhs(17)}}, {{lhs(18)}}, {{lhs(19)}},
{{lhs(20)}}, {{lhs(21)}}, {{lhs(22)}}, {{lhs(23)}}, {{lhs(24)}},
{{lhs(25)}}, {{lhs(26)}}, {{lhs(27)}}, {{lhs(28)}}, {{lhs(29)}},
{{lhs(30)}}, {{lhs(31)}}};

ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> output_2d[]{
{{v3(0)}}, {{v3(1)}}, {{v3(2)}}, {{v3(3)}}, {{v3(4)}},
{{v3(5)}}, {{v3(6)}}, {{v3(7)}}, {{v3(8)}}, {{v3(9)}},
{{v3(10)}}, {{v3(11)}}, {{v3(12)}}, {{v3(13)}}, {{v3(14)}},
{{v3(15)}}, {{v3(16)}}, {{v3(17)}}, {{v3(18)}}, {{v3(19)}},
{{v3(20)}}, {{v3(21)}}, {{v3(22)}}, {{v3(23)}}, {{v3(24)}},
{{v3(25)}}, {{v3(26)}}, {{v3(27)}}, {{v3(28)}}, {{v3(29)}},
{{v3(30)}}, {{v3(31)}}};

output_2d[0] = ntt::mma<AccC>(lhs_2d[0], rhs, output_2d[0]);
output_2d[1] = ntt::mma<AccC>(lhs_2d[1], rhs, output_2d[1]);
output_2d[2] = ntt::mma<AccC>(lhs_2d[2], rhs, output_2d[2]);
output_2d[3] = ntt::mma<AccC>(lhs_2d[3], rhs, output_2d[3]);
output_2d[4] = ntt::mma<AccC>(lhs_2d[4], rhs, output_2d[4]);
output_2d[5] = ntt::mma<AccC>(lhs_2d[5], rhs, output_2d[5]);
output_2d[6] = ntt::mma<AccC>(lhs_2d[6], rhs, output_2d[6]);
output_2d[7] = ntt::mma<AccC>(lhs_2d[7], rhs, output_2d[7]);

output_2d[8] = ntt::mma<AccC>(lhs_2d[8], rhs, output_2d[8]);
output_2d[9] = ntt::mma<AccC>(lhs_2d[9], rhs, output_2d[9]);
output_2d[10] = ntt::mma<AccC>(lhs_2d[10], rhs, output_2d[10]);
output_2d[11] = ntt::mma<AccC>(lhs_2d[11], rhs, output_2d[11]);
output_2d[12] = ntt::mma<AccC>(lhs_2d[12], rhs, output_2d[12]);
output_2d[13] = ntt::mma<AccC>(lhs_2d[13], rhs, output_2d[13]);
output_2d[14] = ntt::mma<AccC>(lhs_2d[14], rhs, output_2d[14]);
output_2d[15] = ntt::mma<AccC>(lhs_2d[15], rhs, output_2d[15]);

output_2d[16] = ntt::mma<AccC>(lhs_2d[16], rhs, output_2d[16]);
output_2d[17] = ntt::mma<AccC>(lhs_2d[17], rhs, output_2d[17]);
output_2d[18] = ntt::mma<AccC>(lhs_2d[18], rhs, output_2d[18]);
output_2d[19] = ntt::mma<AccC>(lhs_2d[19], rhs, output_2d[19]);
output_2d[20] = ntt::mma<AccC>(lhs_2d[20], rhs, output_2d[20]);
output_2d[21] = ntt::mma<AccC>(lhs_2d[21], rhs, output_2d[21]);
output_2d[22] = ntt::mma<AccC>(lhs_2d[22], rhs, output_2d[22]);
output_2d[23] = ntt::mma<AccC>(lhs_2d[23], rhs, output_2d[23]);

output_2d[24] = ntt::mma<AccC>(lhs_2d[24], rhs, output_2d[24]);
output_2d[25] = ntt::mma<AccC>(lhs_2d[25], rhs, output_2d[25]);
output_2d[26] = ntt::mma<AccC>(lhs_2d[26], rhs, output_2d[26]);
output_2d[27] = ntt::mma<AccC>(lhs_2d[27], rhs, output_2d[27]);
output_2d[28] = ntt::mma<AccC>(lhs_2d[28], rhs, output_2d[28]);
output_2d[29] = ntt::mma<AccC>(lhs_2d[29], rhs, output_2d[29]);
output_2d[30] = ntt::mma<AccC>(lhs_2d[30], rhs, output_2d[30]);
output_2d[31] = ntt::mma<AccC>(lhs_2d[31], rhs, output_2d[31]);

output(0) = output_2d[0](0);
output(1) = output_2d[1](0);
output(2) = output_2d[2](0);
output(3) = output_2d[3](0);
output(4) = output_2d[4](0);
output(5) = output_2d[5](0);
output(6) = output_2d[6](0);
output(7) = output_2d[7](0);

output(8) = output_2d[8](0);
output(9) = output_2d[9](0);
output(10) = output_2d[10](0);
output(11) = output_2d[11](0);
output(12) = output_2d[12](0);
output(13) = output_2d[13](0);
output(14) = output_2d[14](0);
output(15) = output_2d[15](0);

output(16) = output_2d[16](0);
output(17) = output_2d[17](0);
output(18) = output_2d[18](0);
output(19) = output_2d[19](0);
output(20) = output_2d[20](0);
output(21) = output_2d[21](0);
output(22) = output_2d[22](0);
output(23) = output_2d[23](0);

output(24) = output_2d[24](0);
output(25) = output_2d[25](0);
output(26) = output_2d[26](0);
output(27) = output_2d[27](0);
output(28) = output_2d[28](0);
output(29) = output_2d[29](0);
output(30) = output_2d[30](0);
output(31) = output_2d[31](0);

return output;
}
};
#endif
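// For reference, a loop form of the 32x32 decomposition (an illustrative
// sketch, not part of this commit; the commit messages force
// "#pragma unroll 1" on k loops elsewhere, so the hand-unrolling above is
// presumably a deliberate scheduling choice):
//
//   for (size_t m = 0; m < 32; m++) {
//       ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> row{{lhs(m)}};
//       ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> acc{{v3(m)}};
//       output(m) = ntt::mma<AccC>(row, rhs, acc)(0);
//   }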

// register reduce_sum kernel
#define REDUCE_ADD_FLOAT32(lmul, mlen) \