Skip to content

Commit

Permalink
Merge branch 'feature/ntt_benchmark_roofline_5' of https://github.com…
Browse files Browse the repository at this point in the history
…/kendryte/nncase into feature/ntt_benchmark_roofline_5
  • Loading branch information
guodongliang committed Nov 21, 2024
2 parents 40b44b0 + 1bfe0f0 commit 8df9bf5
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 32 deletions.
38 changes: 17 additions & 21 deletions ntt/include/nncase/ntt/arch/riscv64/ukernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,15 @@ SPECIALIZE_U_BINARY(floor_mod, 8)
#undef SPECIALIZE_U_BINARY

// clamp
// Explicit specialization of the clamp micro-kernel policy for the `true`
// case; the only thing it configures is `unroll`, fixed at 8.
template <> struct u_clamp_policy<true> { static constexpr size_t unroll{8}; };
// Explicit specialization of the clamp micro-kernel policy for the `true`
// case; the only thing it configures is `unroll`, fixed at 8.
template <>
struct u_clamp_policy<true> {
    static constexpr size_t unroll{8};
};

// reduce
// Reduce micro-kernel policy, partially specialized on a trailing `true`
// argument for every reduce_op / element-type pair; sets `unroll` to 8.
template <reduce_op Op, class T>
struct u_reduce_policy<Op, T, true> {
    static constexpr size_t unroll{8};
};

// cast
// Explicit specialization of the cast micro-kernel policy for the `true`
// case; the only thing it configures is `unroll`, fixed at 8.
template <> struct u_cast_policy<true> { static constexpr size_t unroll{8}; };
// Explicit specialization of the cast micro-kernel policy for the `true`
// case; the only thing it configures is `unroll`, fixed at 8.
template <>
struct u_cast_policy<true> {
    static constexpr size_t unroll{8};
};

// matmul
template <>
Expand Down Expand Up @@ -670,10 +666,10 @@ struct u_unpack_1d_fixed<axis_stride, NTT_VLEN / 32, T1, float, true,
}
};

template <size_t low_axis_stride, size_t high_axis_stride, class T1,
size_t PackAxis1, size_t PackAxis2>
class u_unpack_2d_fixed<low_axis_stride, NTT_VLEN / 32, high_axis_stride,
NTT_VLEN / 32, T1, float, true> {
template <size_t low_stride, size_t high_stride, class T1, size_t PackAxis1,
size_t PackAxis2>
class u_unpack_2d_fixed<low_stride, NTT_VLEN / 32, high_stride, NTT_VLEN / 32,
T1, float, true, PackAxis1, PackAxis2> {
public:
void operator()(const T1 &input, size_t in_stride, float *output,
size_t count) noexcept {
Expand All @@ -686,19 +682,19 @@ class u_unpack_2d_fixed<low_axis_stride, NTT_VLEN / 32, high_axis_stride,
size_t in_offset = 0;
size_t low_idx = 0;
size_t high_idx = 0;
constexpr auto high_dim = low_axis_stride / high_axis_stride;
constexpr auto out_low_strides = low_axis_stride * vl;
constexpr auto low_extra = low_axis_stride * (vl * vl - 1);
constexpr auto high_extra = high_axis_stride * (vl - 1);
constexpr auto high_dim = low_stride / high_stride;
constexpr auto out_low_strides = low_stride * vl;
constexpr auto low_extra = low_stride * (vl * vl - 1);
constexpr auto high_extra = high_stride * (vl - 1);
asm("vsetvli zero, %[vl], e32, m1\n" ::[vl] "r"(vl));
auto in_strides = sizeof(vector<float, vl>);
in_stride = in_stride + 1;
auto out_strides = high_axis_stride * sizeof(float);
auto out_strides = high_stride * sizeof(float);

while (count / high_axis_stride) {
while (count / high_stride) {
auto out_ptr = output + in_offset + low_idx * low_extra +
high_idx * high_extra;
auto out_end = out_ptr + high_axis_stride;
auto out_end = out_ptr + high_stride;
while (out_ptr < out_end) {
auto tmp = vl;
size_t i_idx = 0;
Expand Down Expand Up @@ -748,14 +744,14 @@ class u_unpack_2d_fixed<low_axis_stride, NTT_VLEN / 32, high_axis_stride,

for (; i_idx < vl; i_idx++) {
for (size_t j = 0; j < vl; j++)
*(out_ptr + i_idx * out_low_strides +
j * high_axis_stride) = (*in_ptr)(i_idx)(j);
*(out_ptr + i_idx * out_low_strides + j * high_stride) =
(*in_ptr)(i_idx)(j);
}

out_ptr += 1;
}
in_offset += high_axis_stride;
count -= high_axis_stride;
in_offset += high_stride;
count -= high_stride;
high_idx++;
if (high_idx == high_dim) {
high_idx = 0;
Expand Down
9 changes: 5 additions & 4 deletions ntt/include/nncase/ntt/arch/x86_64/ukernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,11 @@ class u_pack<M, N, MStrides, true, float, vector<float, 8>> {
};

template <class TIn, class TOut, size_t... Axes>
requires(sizeof...(Axes) > 0 &&
(std::get<sizeof...(Axes) - 1>(std::array<size_t, sizeof...(Axes)>{
Axes...}) == (TIn::rank() - 1)))
class u_pack2d<true, TIn, TOut, float, vector<float, 8, 8>, Axes...> {
requires(sizeof...(Axes) > 0 &&
(std::get<sizeof...(Axes) - 1>(std::array<size_t, sizeof...(Axes)>{
Axes...}) ==
(TIn::rank() - 1))) class u_pack2d<true, TIn, TOut, float,
vector<float, 8, 8>, Axes...> {
public:
constexpr void operator()(const TIn &input, TOut &output) noexcept {
using TVec = vector<float, 8, 8>;
Expand Down
10 changes: 5 additions & 5 deletions ntt/include/nncase/ntt/kernels/unpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,15 @@ class unpack_impl<fixed_shape<InDims...>, fixed_shape<InElemDims...>, OutShape,
} else {
constexpr auto elem_rank = TVec::shape_type::rank();
constexpr fixed_shape<InDims..., InElemDims...> domain{};
constexpr auto axes = std::array<size_t, 2>{Axis1, Axis2};
apply(domain, [&](auto index) {
auto in_index = slice_index<rank>(index);
auto elem_index = slice_index<elem_rank>(index, rank);
auto out_index = slice_index<rank>(index);
out_index[low_axis] =
out_index[low_axis] * TVec::shape()[low_axis] + index[rank];
out_index[high_axis] =
out_index[high_axis] * TVec::shape()[high_axis] +
index[rank];
loop<axes.size()>([&](auto i) {
out_index[axes[i]] =
out_index[axes[i]] * TVec::shape()[i] + index[rank + i];
});
output(out_index) = input(in_index)(elem_index);
});
}
Expand Down
2 changes: 1 addition & 1 deletion ntt/test/benchmark_test/benchmark_ntt.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def __init__(self, target: str, bin_path: str):
'W': '4.3',
'NC': '6',
'CH': '6',
'HW': '6',
'HW': '4.3',
},
}

Expand Down
37 changes: 37 additions & 0 deletions ntt/test/ctest/test_ntt_unpack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,43 @@ TEST(UnpackTestFloat, fixed_shape_dim_H_W) {
EXPECT_TRUE(NttTest::compare_tensor(ntt_output1, ntt_output2));
}

// Unpacks a fixed-shape tensor of vector<float, P, P> elements along axes 0
// and 3 (N and W) and compares the result with a reference produced by an
// ORT transpose followed by a reshape to {N, C, H, W}.
TEST(UnpackTestFloat, fixed_shape_dim_N_W) {
    constexpr size_t P = NTT_VLEN / (sizeof(float) * 8);
    constexpr size_t N = P * 2;
    constexpr size_t C = P;
    constexpr size_t H = P;
    constexpr size_t W = P * 2;

    using packed_tensor_t = ntt::tensor<ntt::vector<float, P, P>,
                                        ntt::fixed_shape<N / P, C, H, W / P>>;
    using unpacked_tensor_t = ntt::tensor<float, ntt::fixed_shape<N, C, H, W>>;

    // Packed input, filled by init_tensor with the bounds -10 and 10.
    float range_lo = -10.0f;
    float range_hi = 10.0f;
    alignas(32) packed_tensor_t packed_input;
    NttTest::init_tensor(packed_input, range_lo, range_hi);

    // Result under test: NTT unpack over axes 0 and 3.
    alignas(32) unpacked_tensor_t actual;
    ntt::unpack<0, 3>(packed_input, actual);

    // Reference path. The six-entry permutation moves axis 4 to sit right
    // after axis 0 and leaves axis 5 last; presumably axes 4 and 5 are the
    // two vector-lane dimensions appended by ntt2ort - TODO confirm.
    auto ort_packed = NttTest::ntt2ort(packed_input);
    int64_t transpose_perms[] = {0, 4, 1, 2, 3, 5};
    auto ort_transposed =
        ortki_Transpose(ort_packed, transpose_perms, std::size(transpose_perms));

    // Build the 1-D int64 tensor holding the target shape for the reshape.
    int64_t target_dims[] = {N, C, H, W};
    int64_t target_dims_shape[] = {std::size(target_dims)};
    auto dims_elem_type = NttTest::primitive_type2ort_type<int64_t>();
    auto ort_target_shape =
        make_tensor(static_cast<void *>(target_dims), dims_elem_type,
                    target_dims_shape, std::size(target_dims_shape));
    auto ort_reference = ortki_Reshape(ort_transposed, ort_target_shape, 0);

    // Bring the reference back into an NTT tensor and compare.
    alignas(32) unpacked_tensor_t expected;
    NttTest::ort2ntt(ort_reference, expected);
    EXPECT_TRUE(NttTest::compare_tensor(actual, expected));
}

TEST(UnpackTestFloat, ranked_shape_dim_N) {
constexpr size_t P = NTT_VLEN / (sizeof(float) * 8);
constexpr size_t N = P * 2;
Expand Down
1 change: 0 additions & 1 deletion tools/clang-format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,5 @@ find "${ROOT_DIR}/tests" \
"${ROOT_DIR}/modules" \
"${ROOT_DIR}/python" \
"${ROOT_DIR}/targets" \
"${ROOT_DIR}/ntt" \
\( -name "*.h" -o -name "*.c" -o -name "*.cc" -o -name "*.cxx" -o -name "*.cpp" -o -name "*.hpp" -o -name "*.cppm" \) -and -not -wholename "*/.*" | \
xargs ${CLANG_FORMAT_LLVM_INSTALL_DIR}/bin/clang-format -i -style=file

0 comments on commit 8df9bf5

Please sign in to comment.