Add NTT expand #1272

Closed · wants to merge 2 commits
7 changes: 2 additions & 5 deletions ntt/include/nncase/ntt/kernels/expand.h
@@ -27,17 +27,14 @@ void expand_impl(const TIn &input, TOut &&output) noexcept {
constexpr auto in_rank = TIn::shape_type::rank();
constexpr auto in_shape = typename TIn::shape_type{};
constexpr auto out_shape = std::decay_t<TOut>::shape();
// constexpr auto input_strides = TIn::strides();
// constexpr auto output_strides = std::decay_t<TOut>::strides();

using TIElem = typename TIn::element_type;
using TOElem = typename std::decay_t<TOut>::element_type;

static_assert(IsScalar<TOElem> && IsScalar<TIElem>,
"Only support scalar type for now");
constexpr auto dims_ext = out_shape.rank() - in_rank;

apply(out_shape, [&](auto index) {
-const auto in_index = get_reduced_offset<in_rank>(index, in_shape);
+const auto in_index = get_reduced_offset<in_rank, decltype(index), decltype(in_shape), dims_ext>(index, in_shape);
output(index) = input(in_index);
});
}
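For context, a minimal usage sketch of the kernel above (not part of the diff; it assumes the fixed-shape tensor types and the ntt::apply helper already used elsewhere in this PR):

    #include <nncase/ntt/ntt.h>
    using namespace nncase;

    int main() {
        // broadcast a [4, 1] tensor to [4, 3]
        ntt::tensor<float, ntt::fixed_shape<4, 1>> input;
        ntt::tensor<float, ntt::fixed_shape<4, 3>> output;
        ntt::apply(input.shape(), [&](auto index) {
            input(index) = static_cast<float>(index[0]); // fill each row with its row id
        });
        ntt::expand(input, output);
        // each row's single value is now repeated across the 3 output columns
        return 0;
    }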
46 changes: 27 additions & 19 deletions ntt/include/nncase/ntt/shape.h
@@ -84,6 +84,10 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };

static constexpr size_t length() noexcept { return (Dims * ... * 1); }
static constexpr size_t size() noexcept { return sizeof...(Dims); }

constexpr auto begin() const { return &detail::fixed_dims_base<Dims...>::operator[](0); }
constexpr auto end() const { return &detail::fixed_dims_base<Dims...>::operator[](0) + sizeof...(Dims); }
};

template <size_t Rank> struct ranked_shape : detail::ranked_dims_base<Rank> {
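The new begin()/end() make a fixed_shape usable with standard iteration; note they take the address of operator[](0), so they rely on fixed_dims_base::operator[] returning a reference. A sketch, not part of the diff:

    #include <nncase/ntt/ntt.h>

    int main() {
        nncase::ntt::fixed_shape<2, 3, 4> s{};
        size_t product = 1;
        for (auto d : s) // walks the dims {2, 3, 4}
            product *= d;
        return product == s.length() ? 0 : 1; // both are 24
    }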
@@ -313,18 +317,6 @@ constexpr size_t linear_offset(const Index &index,
return offset;
}

template <class Strides>
ranked_shape<Strides::rank()> unravel_index(const size_t offset,
const Strides &strides) noexcept {
size_t remain = offset;
ranked_shape<Strides::rank()> index;
for (size_t i = 0; i < Strides::rank(); i++) {
index[i] = remain / strides[i];
remain = remain % strides[i];
}
return index;
}

template <class Shape, class Strides>
constexpr size_t linear_size(const Shape &shape,
const Strides &strides) noexcept {
@@ -340,13 +332,6 @@ constexpr size_t linear_size(const Shape &shape,
return size;
}

/**
* @brief calculate the number of contigous dimensions.
*
* @param shape fixed/ranked
* @param strides fixed/ranked
* @return constexpr size_t contigous dimension numbers.
*/
template <class Shape, class Strides>
constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
auto def_strides = default_strides(shape);
@@ -379,6 +364,15 @@ constexpr bool in_bound(const Index &index, const Shape &shape) {
return false;
}

template <size_t Rank, class Index, class Shape, size_t DimsExt>
constexpr ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
ranked_shape<Rank> off;
for (size_t i = 0; i < reduced_shape.rank(); i++) {
off.at(i) = (in_offset.at(i + DimsExt) >= reduced_shape.at(i)) ? 0 : in_offset.at(i + DimsExt);
}
return off;
}

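A worked example of the new overload (illustrative, not part of the diff): expanding a [1, 3] input to [2, 2, 3] gives DimsExt = 1, and output index {1, 0, 2} maps to input index {0, 2} because the size-1 input dim collapses to 0.

    #include <iostream>
    #include <nncase/ntt/ntt.h>
    using namespace nncase::ntt;

    int main() {
        ranked_shape<3> out_index;
        out_index.at(0) = 1; out_index.at(1) = 0; out_index.at(2) = 2;
        fixed_shape<1, 3> in_shape{};
        // Rank = 2 (input rank), DimsExt = 3 - 2 = 1 leading dim added by expand
        auto in_index = get_reduced_offset<2, decltype(out_index), decltype(in_shape), 1>(out_index, in_shape);
        std::cout << in_index.at(0) << ", " << in_index.at(1) << std::endl; // prints 0, 2
        return 0;
    }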
template <size_t Rank, class Index, class Shape>
ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
ranked_shape<Rank> off;
@@ -393,6 +387,20 @@ ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
return off;
}


template <size_t Axes, size_t Rank, class Index>
ranked_shape<Rank> get_reduced_offset(Index in_offset) {
ranked_shape<Rank> off;
154 changes: 154 additions & 0 deletions ntt/test/benchmark_test/benchmark_ntt_expand.cpp
@@ -0,0 +1,154 @@
#include "ntt_test.h"
#include <iomanip>
#include <memory>
#include <nncase/ntt/ntt.h>

using namespace nncase;

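// assumption: specializing vector_storage_traits so plain float is its own
// storage appears to let the scalar benchmarks reuse the vector-based helpers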
namespace nncase::ntt {
template <>
struct vector_storage_traits<float> {
using buffer_type = float;
};
} // namespace nncase::ntt

template <typename T, size_t M, size_t P>
void benchmark_ntt_expand_nopack(T init_low, T init_high) {
std::string pack_mode = "NoPack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M>>;
using out_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<out_tensor_type> ntt_output(new out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*ntt_input, *ntt_output);

// run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*ntt_input, *ntt_output);
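// empty asm taking the output as an operand keeps the optimizer from eliding the loop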
asm volatile("" ::"g"(ntt_output));
}
auto t2 = NttTest::get_cpu_cycle();
std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / M / run_size << " cycles"
<< std::endl;
}

template <typename T, size_t M, size_t N, size_t P>
void benchmark_ntt_expand_nopack1(T init_low, T init_high) {
std::string pack_mode = "NoPack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, N>>;
using out_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<out_tensor_type> ntt_output(new out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*ntt_input, *ntt_output);

// run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*ntt_input, *ntt_output);
asm volatile("" ::"g"(ntt_output));
}
auto t2 = NttTest::get_cpu_cycle();
std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / M / run_size << " cycles"
<< std::endl;
}

template <typename T, size_t M, size_t N, size_t P, size_t VLEN>
void benchmark_ntt_expand_pack(T init_low, T init_high) {
std::string pack_mode = "Pack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, N>>;
using packed_in_tensor_type = ntt::tensor<ntt::vector<T, VLEN>, ntt::fixed_shape<M / VLEN, N>>;
using packed_out_tensor_type = ntt::tensor<ntt::vector<T, VLEN>, ntt::fixed_shape<M / VLEN, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<packed_in_tensor_type> packed_input(new packed_in_tensor_type);
std::unique_ptr<packed_out_tensor_type> packed_output(new packed_out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// Pack the input tensor
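// pack<0> folds VLEN consecutive elements along dim 0 into one ntt::vector<T, VLEN>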
ntt::pack<0>(*ntt_input, *packed_input);

// Warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*packed_input, *packed_output);

// Run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*packed_input, *packed_output);
asm volatile("" ::"g"(packed_output));
}
auto t2 = NttTest::get_cpu_cycle();

std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / (M / VLEN) / run_size << " cycles"
<< std::endl;
}

int main(int argc, char *argv[]) {
(void)argc;
(void)argv;

constexpr size_t M1 = 1;
constexpr size_t P1 = 2;
benchmark_ntt_expand_nopack<float, M1, P1>(-10.f, 10.f);

constexpr size_t M2 = 1024;
constexpr size_t N2 = 1;
constexpr size_t P2 = 2048;
benchmark_ntt_expand_nopack1<float, M2, N2, P2>(-10.f, 10.f);

constexpr size_t M3 = 32;
constexpr size_t N3 = 1;
constexpr size_t P3 = 2;
constexpr size_t VLEN3 = 4;
benchmark_ntt_expand_pack<float, M3, N3, P3, VLEN3>(-10.f, 10.f);

return 0;
}
123 changes: 123 additions & 0 deletions ntt/test/ctest/test_ntt_expand.cpp
@@ -0,0 +1,123 @@
/* Copyright 2019-2024 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ntt_test.h"
#include "ortki_helper.h"
#include <gtest/gtest.h>
#include <iostream>
#include <nncase/ntt/ntt.h>
#include <ortki/operators.h>

using namespace nncase;
using namespace ortki;

TEST(ExpandTestFloat, NoPack) {
constexpr size_t M = 1024;
constexpr size_t N = 1;
constexpr size_t K = 2048;

float min_input = static_cast<float>(-10);
float max_input = static_cast<float>(10);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, N>>;
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, K>>;

std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, min_input, max_input);

// ntt
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
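// ortki_Expand takes the target shape as a 1-D int64 tensor (length = output rank)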
int64_t target_shape[] = {M, K};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);

// compare
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

TEST(ExpandTestFloat, NoPack1) {
constexpr size_t M = 1;
constexpr size_t K = 2;

float min_input = static_cast<float>(-10);
float max_input = static_cast<float>(10);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<M>>;
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, K>>;
std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, min_input, max_input);

// ntt
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
int64_t target_shape[] = {M, K};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);

// compare
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

TEST(ExpandTestFloat, Pack_M_K) {
constexpr size_t P = NTT_VLEN / (sizeof(float) * 8);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<32, 1>>;
std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, -10.f, 10.f);

// pack 32 floats along M into ceil(32 / P) vectors of P lanes each
constexpr size_t pack_dim = (31 + P) / P;
alignas(32) ntt::tensor<ntt::vector<float, P>, ntt::fixed_shape<pack_dim, 1>> p_ntt_lhs;
ntt::pack<0>(*ntt_input, p_ntt_lhs);

// ntt
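// expand is scalar-only for now (see the static_assert in kernels/expand.h),
// so the NTT result is computed from the unpacked input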
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<32, 2>>;
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
int64_t target_shape[] = {32, 2};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);

// compare
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}