Feature/ntt benchmark roofline 3 (#1241)
* update for pack M&N x86

* revert for some performance fallback

* add -funroll-loops for gcc

* shutdown for ci

* revise bug for unroll

* add softmax benchmark

* add ctest for softmax

* Fix ctest failure for softmax.

* opt for x86 softmax

* revise benchmark for softmax

* Add rvv optimization of tanh with max_ulp_error = 2.

* add ulp version of tanh for x86

* remove useless header file

* Apply code-format changes

* Optimize mul_add for rvv (performance boost 15% ~ 32%)

* Remove ntt softmax and fix reduce conflict of x86_64.

* change roofline for reduce x86

* Optimize matmul for rvv and update roofline.

* Update reduce roofline for rvv.

* Update Max_reduceMN_PackN roofline.

* Add ratio for roofline / actual.

* update tanh Roofline

* Specialize max/min for float and update roofline for reduce no_pack.

* problem about x86 roofline

* [NTT] Add ukernel for matmul

* Apply code-format changes

* Fix build

* change some reality for x86

* fallback roofline

* change sequence for test

* add warmup for unary

* add primitive size auto test

* revise bug in daily test

* revise bug in daily test

* avoid bug for temp

* total fallback

* add info for primitive size

* remove tile info

* change the way

* Add tensor.squeeze

* remove Primitive info

* Apply code-format changes

* temp change test

* add table name

* merge two tables

* remove typo

* typo test

* change back for daily test

* test for table

* change back for test over

* change for primitive size

* Support odd matmul

* Fix build

* Apply code-format changes

* Optimize erf for rvv.

* Fix build

* Add markdown for ntt matmul.

* Add u_matmul policy for rvv

* add erf ulp version

* fix typo

* Refactor the ntt benchmark Python script to support both ntt and ntt_matmul.

* Apply code-format changes

* Add ntt.store, optimize u_matmul for RVV

* Fix pack MKN for RVV

* Fix macos build and show gflops with floating point.

* revise typo

* Force the compiler not to unroll k loops

* Use pragma unroll 1 instead of volatile

* set performance for cpu0

* Apply code-format changes

* temp fallback for ci test

---------

Co-authored-by: guodongliang <[email protected]>
Co-authored-by: zhangyang2057 <[email protected]>
Co-authored-by: uranus0515 <[email protected]>
Co-authored-by: sunnycase <[email protected]>
Co-authored-by: sunnycase <[email protected]>
Co-authored-by: zhangyang2057 <[email protected]>
7 people authored Oct 8, 2024
1 parent e6d95d2 commit 9f289a3
Showing 22 changed files with 2,871 additions and 638 deletions.
278 changes: 248 additions & 30 deletions src/Native/include/nncase/ntt/arch/riscv64/primitive_ops.h
@@ -14,6 +14,8 @@
*/
#pragma once
#include "../../primitive_ops.h"
#include "nncase/ntt/arch/riscv64/arch_types.h"
#include "nncase/ntt/vector.h"
#include "rvv_mathfun.h"

#ifdef __riscv_vector
@@ -29,6 +31,15 @@ namespace nncase::ntt::ops {
kernel(1, 32) kernel(2, 16) kernel(4, 8) kernel(8, 4)
#endif
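// Note on the (lmul, mlen) pairs above (inferred from the visible expansion,
// not stated in this diff): lmul is the RVV register-group multiplier, and
// mlen appears to be SEW/LMUL (32/lmul for float32), i.e. the N of the
// matching vboolN_t mask type.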

template <>
struct store<ntt::vector<float, NTT_VLEN / 32>,
ntt::vector<float, NTT_VLEN / 32>> {
void operator()(ntt::vector<float, NTT_VLEN / 32> &dest,
const ntt::vector<float, NTT_VLEN / 32> &v) const noexcept {
__riscv_vse32_v_f32m1((float *)&dest, v, NTT_VLEN / 32);
}
};
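// With this specialization, storing one native-width float vector lowers to
// a single unit-stride vse32.v rather than an element-wise loop. A minimal
// usage sketch (assuming the generic ntt::store(dest, v) dispatcher forwards
// to this functor):
//
//   ntt::vector<float, NTT_VLEN / 32> src = ...; // one VLEN-wide register
//   ntt::vector<float, NTT_VLEN / 32> dst;
//   ntt::store(dst, src);                        // one vse32.v, vl = VLEN/32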

#define RVV_UNARY_OP(op, dtype, vl, kernel) \
template <> struct op<ntt::vector<dtype, vl>> { \
ntt::vector<dtype, vl> \
@@ -610,6 +621,16 @@ REGISTER_RVV_UNARY_OP(square, float, square_float32)
REGISTER_RVV_KERNEL(TANH_FLOAT32)
REGISTER_RVV_UNARY_OP(tanh, float, tanh_float32)

// erf
#define ERF_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t erf_float32(const vfloat32m##lmul##_t &v, \
const size_t vl) { \
return erf_ps(v, vl); \
}

REGISTER_RVV_KERNEL(ERF_FLOAT32)
REGISTER_RVV_UNARY_OP(erf, float, erf_float32)
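// The commit messages quote ULP bounds for these kernels (tanh with
// max_ulp_error = 2, and an erf "ulp version"). As background, a minimal
// sketch of how such a bound is typically measured; this is an illustrative
// helper, not the project's actual test harness:
//
//   #include <cstdint>
//   #include <cstring>
//
//   inline int32_t ulp_ordered(float f) {
//       int32_t i;
//       std::memcpy(&i, &f, sizeof(i)); // bit-cast without aliasing UB
//       // Map sign-magnitude float bits onto a monotonic integer line so
//       // adjacent floats differ by 1 (+0.0f and -0.0f both map to 0).
//       return i < 0 ? INT32_MIN - i : i;
//   }
//
//   inline uint32_t ulp_distance(float a, float b) {
//       int64_t d = int64_t(ulp_ordered(a)) - int64_t(ulp_ordered(b));
//       return uint32_t(d < 0 ? -d : d); // expect ulp_distance(y, ref) <= 2
//   }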

// binary
#define RVV_BINARY_OP(op, dtype, vl, kernel) \
template <> struct op<ntt::vector<dtype, vl>, ntt::vector<dtype, vl>> { \
@@ -761,6 +782,16 @@ REGISTER_RVV_KERNEL(MOD_FLOAT32)
REGISTER_RVV_BINARY_OP(mod, float, mod_float32)

// min
template <> struct min<float, float> {
auto operator()(const float &s1, const float &s2) const noexcept {
float ret;
__asm("fmin.s %[ret], %[s1], %[s2];"
: [ret] "=f"(ret)
: [s1] "f"(s1), [s2] "f"(s2));
return ret;
}
};

#define MIN_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t min_float32(const vfloat32m##lmul##_t &v1, \
const vfloat32m##lmul##_t &v2, \
@@ -782,6 +813,16 @@ REGISTER_RVV_KERNEL(MIN_FLOAT32)
REGISTER_RVV_BINARY_OP(min, float, min_float32)

// max
template <> struct max<float, float> {
auto operator()(const float &s1, const float &s2) const noexcept {
float ret;
__asm("fmax.s %[ret], %[s1], %[s2];"
: [ret] "=f"(ret)
: [s1] "f"(s1), [s2] "f"(s2));
return ret;
}
};
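// Background on the two scalar specializations above (not stated in the
// diff): fmin.s/fmax.s are single branchless instructions that follow IEEE
// 754 minimumNumber/maximumNumber semantics in current RISC-V specs, i.e. a
// quiet NaN operand is ignored when the other operand is a number. A plain
// ternary is not equivalent, which is presumably why inline asm is used:
//
//   inline float min_ternary(float a, float b) { return a < b ? a : b; }
//   // min_ternary(1.0f, NAN) yields NaN, while fmin.s returns 1.0f.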

#define MAX_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t max_float32(const vfloat32m##lmul##_t &v1, \
const vfloat32m##lmul##_t &v2, \
@@ -969,6 +1010,7 @@ REGISTER_RVV_KERNEL(INNER_PRODUCT_FLOAT32)
REGISTER_RVV_INNER_PRODUCT_OP(float, inner_product_float32)

// register mul_add kernel
#if 0
#define MUL_ADD_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const vfloat32m##lmul##_t &v2, \
@@ -987,6 +1029,26 @@ REGISTER_RVV_INNER_PRODUCT_OP(float, inner_product_float32)
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmadd_vf_f32m##lmul(v2, s1, v3, vl); \
}
#else
#define MUL_ADD_FLOAT32(lmul, mlen) \
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const vfloat32m##lmul##_t &v2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vv_f32m##lmul(v3, v1, v2, vl); \
} \
\
inline vfloat32m##lmul##_t mul_add_float32( \
const vfloat32m##lmul##_t &v1, const float &s2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vf_f32m##lmul(v3, s2, v1, vl); \
} \
\
inline vfloat32m##lmul##_t mul_add_float32( \
const float &s1, const vfloat32m##lmul##_t &v2, \
const vfloat32m##lmul##_t &v3, const size_t vl) { \
return __riscv_vfmacc_vf_f32m##lmul(v3, s1, v2, vl); \
}
#endif
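// Semantics behind the switch above, per the RVV intrinsics:
//   __riscv_vfmadd_vv(vd, vs1, vs2, vl) : vd = (vd  * vs1) + vs2
//   __riscv_vfmacc_vv(vd, vs1, vs2, vl) : vd = (vs1 * vs2) + vd
// Passing the accumulator v3 as the vfmacc destination keeps the running sum
// in one register across a dot-product loop, with no extra vector moves;
// presumably this is the "mul_add ... 15% ~ 32%" gain quoted in the commit
// messages.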

REGISTER_RVV_KERNEL(MUL_ADD_FLOAT32)

@@ -1029,7 +1091,6 @@ REGISTER_RVV_KERNEL(MUL_ADD_FLOAT32)

REGISTER_RVV_MUL_ADD_OP(float, mul_add_float32)

#if 1
template <bool AccC>
struct mma<AccC, ntt::vector<float, 1, 4>, ntt::vector<float, 4, 4>,
ntt::vector<float, 1, 4>> {
@@ -1038,11 +1099,67 @@ struct mma<AccC, ntt::vector<float, 1, 4>, ntt::vector<float, 4, 4>,
const ntt::vector<float, 4, 4> &rhs,
const ntt::vector<float, 1, 4> &v3) const noexcept {
auto output = v3;
- for (size_t k = 0; k < 4; k++) {
-     output(0) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(0, k), rhs(k), output(0))
-                     : ntt::mul(lhs(0, k), rhs(k));
- }
+ auto t0 = AccC ? ntt::mul_add(lhs(0, 0), rhs(0), output(0))
+                : ntt::mul(lhs(0, 0), rhs(0));
+ auto t1 = ntt::mul(lhs(0, 1), rhs(1));
+ t0 = ntt::mul_add(lhs(0, 2), rhs(2), t0);
+ t1 = ntt::mul_add(lhs(0, 3), rhs(3), t1);
+ output(0) = ntt::add(t0, t1);
return output;
}
};
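// The t0/t1 pair introduced above is a two-accumulator reduction: the single
// chain c = fma(a0, b0, c); c = fma(a1, b1, c); ... serializes on FMA
// latency, while two independent chains merged by one final add roughly
// halve the critical path. The removed k-loop kept every update on one chain.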

template <bool AccC>
struct mma<AccC, ntt::vector<float, 1, 32>, ntt::vector<float, 32, 32>,
ntt::vector<float, 1, 32>> {
ntt::vector<float, 1, 32>
operator()(const ntt::vector<float, 1, 32> &lhs,
const ntt::vector<float, 32, 32> &rhs,
const ntt::vector<float, 1, 32> &v3) const noexcept {
auto output = v3;

auto t0 = AccC ? ntt::mul_add(lhs(0, 0), rhs(0), output(0))
: ntt::mul(lhs(0, 0), rhs(0));
auto t1 = ntt::mul(lhs(0, 1), rhs(1));
t0 = ntt::mul_add(lhs(0, 2), rhs(2), t0);
t1 = ntt::mul_add(lhs(0, 3), rhs(3), t1);

t0 = ntt::mul_add(lhs(0, 4), rhs(4), t0);
t1 = ntt::mul_add(lhs(0, 5), rhs(5), t1);
t0 = ntt::mul_add(lhs(0, 6), rhs(6), t0);
t1 = ntt::mul_add(lhs(0, 7), rhs(7), t1);

t0 = ntt::mul_add(lhs(0, 8), rhs(8), t0);
t1 = ntt::mul_add(lhs(0, 9), rhs(9), t1);
t0 = ntt::mul_add(lhs(0, 10), rhs(10), t0);
t1 = ntt::mul_add(lhs(0, 11), rhs(11), t1);

t0 = ntt::mul_add(lhs(0, 12), rhs(12), t0);
t1 = ntt::mul_add(lhs(0, 13), rhs(13), t1);
t0 = ntt::mul_add(lhs(0, 14), rhs(14), t0);
t1 = ntt::mul_add(lhs(0, 15), rhs(15), t1);

t0 = ntt::mul_add(lhs(0, 16), rhs(16), t0);
t1 = ntt::mul_add(lhs(0, 17), rhs(17), t1);
t0 = ntt::mul_add(lhs(0, 18), rhs(18), t0);
t1 = ntt::mul_add(lhs(0, 19), rhs(19), t1);

t0 = ntt::mul_add(lhs(0, 20), rhs(20), t0);
t1 = ntt::mul_add(lhs(0, 21), rhs(21), t1);
t0 = ntt::mul_add(lhs(0, 22), rhs(22), t0);
t1 = ntt::mul_add(lhs(0, 23), rhs(23), t1);

t0 = ntt::mul_add(lhs(0, 24), rhs(24), t0);
t1 = ntt::mul_add(lhs(0, 25), rhs(25), t1);
t0 = ntt::mul_add(lhs(0, 26), rhs(26), t0);
t1 = ntt::mul_add(lhs(0, 27), rhs(27), t1);

t0 = ntt::mul_add(lhs(0, 28), rhs(28), t0);
t1 = ntt::mul_add(lhs(0, 29), rhs(29), t1);
t0 = ntt::mul_add(lhs(0, 30), rhs(30), t0);
t1 = ntt::mul_add(lhs(0, 31), rhs(31), t1);

output(0) = ntt::add(t0, t1);
return output;
}
};
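// Same two-chain accumulation as the (1, 4) case above, unrolled across
// K = 32: the interleaved t0/t1 updates keep two independent FMA chains in
// flight, then a single ntt::add merges them.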
@@ -1055,33 +1172,134 @@ struct mma<AccC, ntt::vector<float, 4, 4>, ntt::vector<float, 4, 4>,
const ntt::vector<float, 4, 4> &rhs,
const ntt::vector<float, 4, 4> &v3) const noexcept {
auto output = v3;
- for (size_t k = 0; k < 4; k++) {
-     output(0) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(0, k), rhs(k), output(0))
-                     : ntt::mul(lhs(0, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(1) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(1, k), rhs(k), output(1))
-                     : ntt::mul(lhs(1, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(2) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(2, k), rhs(k), output(2))
-                     : ntt::mul(lhs(2, k), rhs(k));
- }
-
- for (size_t k = 0; k < 4; k++) {
-     output(3) = (k != 0 || AccC)
-                     ? ntt::mul_add(lhs(3, k), rhs(k), output(3))
-                     : ntt::mul(lhs(3, k), rhs(k));
- }
+ ntt::fixed_tensor_alike_t<ntt::vector<float, 4>, 1, 4> lhs_2d[4]{
+     {{lhs(0)}},
+     {{lhs(1)}},
+     {{lhs(2)}},
+     {{lhs(3)}},
+ };
+ ntt::fixed_tensor_alike_t<ntt::vector<float, 4>, 1, 4> output_2d[4]{
+     {{v3(0)}},
+     {{v3(1)}},
+     {{v3(2)}},
+     {{v3(3)}},
+ };
+
+ output_2d[0] = ntt::mma<AccC>(lhs_2d[0], rhs, output_2d[0]);
+ output_2d[1] = ntt::mma<AccC>(lhs_2d[1], rhs, output_2d[1]);
+ output_2d[2] = ntt::mma<AccC>(lhs_2d[2], rhs, output_2d[2]);
+ output_2d[3] = ntt::mma<AccC>(lhs_2d[3], rhs, output_2d[3]);
+
+ output(0) = output_2d[0](0);
+ output(1) = output_2d[1](0);
+ output(2) = output_2d[2](0);
+ output(3) = output_2d[3](0);
+
return output;
}
};
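// The rewritten 4x4 body reuses the specialized (1, 4) x (4, 4) kernel above
// by splitting lhs and the accumulator into four independent 1x4 rows,
// effectively C.row(i) = mma<AccC>(A.row(i), B, C.row(i)), so each row's
// dependency chain is scheduled separately. The 32x32 specialization below
// applies the same row-wise decomposition, fully unrolled.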

template <bool AccC>
struct mma<AccC, ntt::vector<float, 32, 32>, ntt::vector<float, 32, 32>,
ntt::vector<float, 32, 32>> {
ntt::vector<float, 32, 32>
operator()(const ntt::vector<float, 32, 32> &lhs,
const ntt::vector<float, 32, 32> &rhs,
const ntt::vector<float, 32, 32> &v3) const noexcept {
auto output = v3;
ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> lhs_2d[]{
{{lhs(0)}}, {{lhs(1)}}, {{lhs(2)}}, {{lhs(3)}}, {{lhs(4)}},
{{lhs(5)}}, {{lhs(6)}}, {{lhs(7)}}, {{lhs(8)}}, {{lhs(9)}},
{{lhs(10)}}, {{lhs(11)}}, {{lhs(12)}}, {{lhs(13)}}, {{lhs(14)}},
{{lhs(15)}}, {{lhs(16)}}, {{lhs(17)}}, {{lhs(18)}}, {{lhs(19)}},
{{lhs(20)}}, {{lhs(21)}}, {{lhs(22)}}, {{lhs(23)}}, {{lhs(24)}},
{{lhs(25)}}, {{lhs(26)}}, {{lhs(27)}}, {{lhs(28)}}, {{lhs(29)}},
{{lhs(30)}}, {{lhs(31)}}};

ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> output_2d[]{
{{v3(0)}}, {{v3(1)}}, {{v3(2)}}, {{v3(3)}}, {{v3(4)}},
{{v3(5)}}, {{v3(6)}}, {{v3(7)}}, {{v3(8)}}, {{v3(9)}},
{{v3(10)}}, {{v3(11)}}, {{v3(12)}}, {{v3(13)}}, {{v3(14)}},
{{v3(15)}}, {{v3(16)}}, {{v3(17)}}, {{v3(18)}}, {{v3(19)}},
{{v3(20)}}, {{v3(21)}}, {{v3(22)}}, {{v3(23)}}, {{v3(24)}},
{{v3(25)}}, {{v3(26)}}, {{v3(27)}}, {{v3(28)}}, {{v3(29)}},
{{v3(30)}}, {{v3(31)}}};

output_2d[0] = ntt::mma<AccC>(lhs_2d[0], rhs, output_2d[0]);
output_2d[1] = ntt::mma<AccC>(lhs_2d[1], rhs, output_2d[1]);
output_2d[2] = ntt::mma<AccC>(lhs_2d[2], rhs, output_2d[2]);
output_2d[3] = ntt::mma<AccC>(lhs_2d[3], rhs, output_2d[3]);
output_2d[4] = ntt::mma<AccC>(lhs_2d[4], rhs, output_2d[4]);
output_2d[5] = ntt::mma<AccC>(lhs_2d[5], rhs, output_2d[5]);
output_2d[6] = ntt::mma<AccC>(lhs_2d[6], rhs, output_2d[6]);
output_2d[7] = ntt::mma<AccC>(lhs_2d[7], rhs, output_2d[7]);

output_2d[8] = ntt::mma<AccC>(lhs_2d[8], rhs, output_2d[8]);
output_2d[9] = ntt::mma<AccC>(lhs_2d[9], rhs, output_2d[9]);
output_2d[10] = ntt::mma<AccC>(lhs_2d[10], rhs, output_2d[10]);
output_2d[11] = ntt::mma<AccC>(lhs_2d[11], rhs, output_2d[11]);
output_2d[12] = ntt::mma<AccC>(lhs_2d[12], rhs, output_2d[12]);
output_2d[13] = ntt::mma<AccC>(lhs_2d[13], rhs, output_2d[13]);
output_2d[14] = ntt::mma<AccC>(lhs_2d[14], rhs, output_2d[14]);
output_2d[15] = ntt::mma<AccC>(lhs_2d[15], rhs, output_2d[15]);

output_2d[16] = ntt::mma<AccC>(lhs_2d[16], rhs, output_2d[16]);
output_2d[17] = ntt::mma<AccC>(lhs_2d[17], rhs, output_2d[17]);
output_2d[18] = ntt::mma<AccC>(lhs_2d[18], rhs, output_2d[18]);
output_2d[19] = ntt::mma<AccC>(lhs_2d[19], rhs, output_2d[19]);
output_2d[20] = ntt::mma<AccC>(lhs_2d[20], rhs, output_2d[20]);
output_2d[21] = ntt::mma<AccC>(lhs_2d[21], rhs, output_2d[21]);
output_2d[22] = ntt::mma<AccC>(lhs_2d[22], rhs, output_2d[22]);
output_2d[23] = ntt::mma<AccC>(lhs_2d[23], rhs, output_2d[23]);

output_2d[24] = ntt::mma<AccC>(lhs_2d[24], rhs, output_2d[24]);
output_2d[25] = ntt::mma<AccC>(lhs_2d[25], rhs, output_2d[25]);
output_2d[26] = ntt::mma<AccC>(lhs_2d[26], rhs, output_2d[26]);
output_2d[27] = ntt::mma<AccC>(lhs_2d[27], rhs, output_2d[27]);
output_2d[28] = ntt::mma<AccC>(lhs_2d[28], rhs, output_2d[28]);
output_2d[29] = ntt::mma<AccC>(lhs_2d[29], rhs, output_2d[29]);
output_2d[30] = ntt::mma<AccC>(lhs_2d[30], rhs, output_2d[30]);
output_2d[31] = ntt::mma<AccC>(lhs_2d[31], rhs, output_2d[31]);

output(0) = output_2d[0](0);
output(1) = output_2d[1](0);
output(2) = output_2d[2](0);
output(3) = output_2d[3](0);
output(4) = output_2d[4](0);
output(5) = output_2d[5](0);
output(6) = output_2d[6](0);
output(7) = output_2d[7](0);

output(8) = output_2d[8](0);
output(9) = output_2d[9](0);
output(10) = output_2d[10](0);
output(11) = output_2d[11](0);
output(12) = output_2d[12](0);
output(13) = output_2d[13](0);
output(14) = output_2d[14](0);
output(15) = output_2d[15](0);

output(16) = output_2d[16](0);
output(17) = output_2d[17](0);
output(18) = output_2d[18](0);
output(19) = output_2d[19](0);
output(20) = output_2d[20](0);
output(21) = output_2d[21](0);
output(22) = output_2d[22](0);
output(23) = output_2d[23](0);

output(24) = output_2d[24](0);
output(25) = output_2d[25](0);
output(26) = output_2d[26](0);
output(27) = output_2d[27](0);
output(28) = output_2d[28](0);
output(29) = output_2d[29](0);
output(30) = output_2d[30](0);
output(31) = output_2d[31](0);

return output;
}
};
#endif
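// For reference, a loop form of the 32x32 decomposition (an illustrative
// sketch, not part of this commit; the commit messages force
// "#pragma unroll 1" on k loops elsewhere, so the hand-unrolling above is
// presumably a deliberate scheduling choice):
//
//   for (size_t m = 0; m < 32; m++) {
//       ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> row{{lhs(m)}};
//       ntt::fixed_tensor_alike_t<ntt::vector<float, 32>, 1, 32> acc{{v3(m)}};
//       output(m) = ntt::mma<AccC>(row, rhs, acc)(0);
//   }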

// register reduce_sum kernel
#define REDUCE_ADD_FLOAT32(lmul, mlen) \