Add NTT expand #1272

Closed · wants to merge 2 commits
7 changes: 2 additions & 5 deletions ntt/include/nncase/ntt/kernels/expand.h
@@ -27,17 +27,14 @@ void expand_impl(const TIn &input, TOut &&output) noexcept {
constexpr auto in_rank = TIn::shape_type::rank();
constexpr auto in_shape = typename TIn::shape_type{};
constexpr auto out_shape = std::decay_t<TOut>::shape();
// constexpr auto input_strides = TIn::strides();
// constexpr auto output_strides = std::decay_t<TOut>::strides();

using TIElem = typename TIn::element_type;
using TOElem = typename std::decay_t<TOut>::element_type;

static_assert(IsScalar<TOElem> && IsScalar<TIElem>,
"Only support scalar type for now");
constexpr auto dims_ext = out_shape.rank() - in_rank;

apply(out_shape, [&](auto index) {
-const auto in_index = get_reduced_offset<in_rank>(index, in_shape);
+const auto in_index = get_reduced_offset<in_rank, decltype(index), decltype(in_shape), dims_ext>(index, in_shape);
output(index) = input(in_index);
});
}
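For context, a minimal usage sketch of the kernel above (not part of the diff; it assumes the fixed-shape tensor types and the ntt::apply helper already used elsewhere in this PR):

    #include <nncase/ntt/ntt.h>
    using namespace nncase;

    int main() {
        // broadcast a [4, 1] tensor to [4, 3]
        ntt::tensor<float, ntt::fixed_shape<4, 1>> input;
        ntt::tensor<float, ntt::fixed_shape<4, 3>> output;
        ntt::apply(input.shape(), [&](auto index) {
            input(index) = static_cast<float>(index[0]); // fill each row with its row id
        });
        ntt::expand(input, output);
        // each row's single value is now repeated across the 3 output columns
        return 0;
    }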
46 changes: 27 additions & 19 deletions ntt/include/nncase/ntt/shape.h
@@ -84,6 +84,10 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };

static constexpr size_t length() noexcept { return (Dims * ... * 1); }
static constexpr size_t size() noexcept { return sizeof...(Dims); }

constexpr auto begin() const { return &detail::fixed_dims_base<Dims...>::operator[](0); }
constexpr auto end() const { return &detail::fixed_dims_base<Dims...>::operator[](0) + sizeof...(Dims); }
};

template <size_t Rank> struct ranked_shape : detail::ranked_dims_base<Rank> {
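The new begin()/end() make a fixed_shape usable with standard iteration; note they take the address of operator[](0), so they rely on fixed_dims_base::operator[] returning a reference. A sketch, not part of the diff:

    #include <nncase/ntt/ntt.h>

    int main() {
        nncase::ntt::fixed_shape<2, 3, 4> s{};
        size_t product = 1;
        for (auto d : s) // walks the dims {2, 3, 4}
            product *= d;
        return product == s.length() ? 0 : 1; // both are 24
    }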
@@ -313,18 +317,6 @@ constexpr size_t linear_offset(const Index &index,
return offset;
}

template <class Strides>
ranked_shape<Strides::rank()> unravel_index(const size_t offset,
const Strides &strides) noexcept {
size_t remain = offset;
ranked_shape<Strides::rank()> index;
for (size_t i = 0; i < Strides::rank(); i++) {
index[i] = remain / strides[i];
remain = remain % strides[i];
}
return index;
}

template <class Shape, class Strides>
constexpr size_t linear_size(const Shape &shape,
const Strides &strides) noexcept {
@@ -340,13 +332,6 @@ constexpr size_t linear_size(const Shape &shape,
return size;
}

/**
* @brief calculate the number of contigous dimensions.
*
* @param shape fixed/ranked
* @param strides fixed/ranked
* @return constexpr size_t contigous dimension numbers.
*/
template <class Shape, class Strides>
constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
auto def_strides = default_strides(shape);
@@ -379,6 +364,15 @@ constexpr bool in_bound(const Index &index, const Shape &shape) {
return false;
}

template <size_t Rank, class Index, class Shape, size_t DimsExt>
constexpr ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
ranked_shape<Rank> off;
for (size_t i = 0; i < reduced_shape.rank(); i++) {
off.at(i) = (in_offset.at(i + DimsExt) >= reduced_shape.at(i)) ? 0 : in_offset.at(i + DimsExt);
}
return off;
}

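A worked example of the new overload (illustrative, not part of the diff): expanding a [1, 3] input to [2, 2, 3] gives DimsExt = 1, and output index {1, 0, 2} maps to input index {0, 2} because the size-1 input dim collapses to 0.

    #include <iostream>
    #include <nncase/ntt/ntt.h>
    using namespace nncase::ntt;

    int main() {
        ranked_shape<3> out_index;
        out_index.at(0) = 1; out_index.at(1) = 0; out_index.at(2) = 2;
        fixed_shape<1, 3> in_shape{};
        // Rank = 2 (input rank), DimsExt = 3 - 2 = 1 leading dim added by expand
        auto in_index = get_reduced_offset<2, decltype(out_index), decltype(in_shape), 1>(out_index, in_shape);
        std::cout << in_index.at(0) << ", " << in_index.at(1) << std::endl; // prints 0, 2
        return 0;
    }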
template <size_t Rank, class Index, class Shape>
ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
ranked_shape<Rank> off;
@@ -393,6 +387,20 @@ ranked_shape<Rank> get_reduced_offset(Index in_offset, Shape reduced_shape) {
return off;
}


template <size_t Axes, size_t Rank, class Index>
ranked_shape<Rank> get_reduced_offset(Index in_offset) {
ranked_shape<Rank> off;
154 changes: 154 additions & 0 deletions ntt/test/benchmark_test/benchmark_ntt_expand.cpp
@@ -0,0 +1,154 @@
#include "ntt_test.h"
#include <iomanip>
#include <memory>
#include <nncase/ntt/ntt.h>

using namespace nncase;

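// assumption: specializing vector_storage_traits so plain float is its own
// storage appears to let the scalar benchmarks reuse the vector-based helpers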
namespace nncase::ntt {
template <>
struct vector_storage_traits<float> {
using buffer_type = float;
};
} // namespace nncase::ntt

template <typename T, size_t M, size_t P>
void benchmark_ntt_expand_nopack(T init_low, T init_high) {
std::string pack_mode = "NoPack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M>>;
using out_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<out_tensor_type> ntt_output(new out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*ntt_input, *ntt_output);

// run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*ntt_input, *ntt_output);
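// empty asm taking the output as an operand keeps the optimizer from eliding the loop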
asm volatile("" ::"g"(ntt_output));
}
auto t2 = NttTest::get_cpu_cycle();
std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / M / run_size << " cycles"
<< std::endl;
}

template <typename T, size_t M, size_t N, size_t P>
void benchmark_ntt_expand_nopack1(T init_low, T init_high) {
std::string pack_mode = "NoPack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, N>>;
using out_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<out_tensor_type> ntt_output(new out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*ntt_input, *ntt_output);

// run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*ntt_input, *ntt_output);
asm volatile("" ::"g"(ntt_output));
}
auto t2 = NttTest::get_cpu_cycle();
std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / M / run_size << " cycles"
<< std::endl;
}

template <typename T, size_t M, size_t N, size_t P, size_t VLEN>
void benchmark_ntt_expand_pack(T init_low, T init_high) {
std::string pack_mode = "Pack";
constexpr size_t warmup_size = 10;
#if __riscv
constexpr size_t run_size = 300;
#elif __x86_64__
constexpr size_t run_size = 20000;
#else
constexpr size_t run_size = 20000;
#endif

using in_tensor_type = ntt::tensor<T, ntt::fixed_shape<M, N>>;
using packed_in_tensor_type = ntt::tensor<ntt::vector<T, VLEN>, ntt::fixed_shape<M / VLEN, N>>;
using packed_out_tensor_type = ntt::tensor<ntt::vector<T, VLEN>, ntt::fixed_shape<M / VLEN, P>>;

std::unique_ptr<in_tensor_type> ntt_input(new in_tensor_type);
std::unique_ptr<packed_in_tensor_type> packed_input(new packed_in_tensor_type);
std::unique_ptr<packed_out_tensor_type> packed_output(new packed_out_tensor_type);

NttTest::init_tensor(*ntt_input, init_low, init_high);

// Pack the input tensor
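// pack<0> folds VLEN consecutive elements along dim 0 into one ntt::vector<T, VLEN>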
ntt::pack<0>(*ntt_input, *packed_input);

// Warm up
for (size_t i = 0; i < warmup_size; i++)
ntt::expand(*packed_input, *packed_output);

// Run
auto t1 = NttTest::get_cpu_cycle();
for (size_t i = 0; i < run_size; i++) {
ntt::expand(*packed_input, *packed_output);
asm volatile("" ::"g"(packed_output));
}
auto t2 = NttTest::get_cpu_cycle();

std::cout << __FUNCTION__ << "_" << pack_mode << " took "
<< std::setprecision(1) << std::fixed
<< static_cast<float>(t2 - t1) / (M / VLEN) / run_size << " cycles"
<< std::endl;
}

int main(int argc, char *argv[]) {
(void)argc;
(void)argv;

constexpr size_t M1 = 1;
constexpr size_t P1 = 2;
benchmark_ntt_expand_nopack<float, M1, P1>(-10.f, 10.f);

constexpr size_t M2 = 1024;
constexpr size_t N2 = 1;
constexpr size_t P2 = 2048;
benchmark_ntt_expand_nopack1<float, M2, N2, P2>(-10.f, 10.f);

constexpr size_t M3 = 32;
constexpr size_t N3 = 1;
constexpr size_t P3 = 2;
constexpr size_t VLEN3 = 4;
benchmark_ntt_expand_pack<float, M3, N3, P3, VLEN3>(-10.f, 10.f);

return 0;
}
123 changes: 123 additions & 0 deletions ntt/test/ctest/test_ntt_expand.cpp
@@ -0,0 +1,123 @@
/* Copyright 2019-2024 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ntt_test.h"
#include "ortki_helper.h"
#include <gtest/gtest.h>
#include <iostream>
#include <nncase/ntt/ntt.h>
#include <ortki/operators.h>

using namespace nncase;
using namespace ortki;

TEST(ExpandTestFloat, NoPack) {
constexpr size_t M = 1024;
constexpr size_t N = 1;
constexpr size_t K = 2048;

float min_input = static_cast<float>(-10);
float max_input = static_cast<float>(10);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, N>>;
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, K>>;

std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, min_input, max_input);

// ntt
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
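// ortki_Expand takes the target shape as a 1-D int64 tensor (length = output rank)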
int64_t target_shape[] = {M, K};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);

// compare
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

TEST(ExpandTestFloat, NoPack1) {
constexpr size_t M = 1;
constexpr size_t K = 2;

float min_input = static_cast<float>(-10);
float max_input = static_cast<float>(10);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<M>>;
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<M, K>>;
std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, min_input, max_input);

// ntt
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
int64_t target_shape[] = {M, K};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);

// compare
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

TEST(ExpandTestFloat, Pack_M_K) {
constexpr size_t P = NTT_VLEN / (sizeof(float) * 8);

// init
using input_tensor_type = ntt::tensor<float, ntt::fixed_shape<32, 1>>;
std::unique_ptr<input_tensor_type> ntt_input(new input_tensor_type);
NttTest::init_tensor(*ntt_input, -10.f, 10.f);

// pack 32 floats along M into ceil(32 / P) vectors of P lanes each
constexpr size_t pack_dim = (31 + P) / P;
alignas(32) ntt::tensor<ntt::vector<float, P>, ntt::fixed_shape<pack_dim, 1>> p_ntt_lhs;
ntt::pack<0>(*ntt_input, p_ntt_lhs);

// ntt
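// expand is scalar-only for now (see the static_assert in kernels/expand.h),
// so the NTT result is computed from the unpacked input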
using output_tensor_type = ntt::tensor<float, ntt::fixed_shape<32, 2>>;
std::unique_ptr<output_tensor_type> ntt_output1(new output_tensor_type);
ntt::expand(*ntt_input, *ntt_output1);

// ort
auto ort_input = NttTest::ntt2ort(*ntt_input);
int64_t target_shape[] = {32, 2};
int64_t shape_size = 2;
int64_t shape[] = {shape_size};
auto shape_tensor = make_tensor(reinterpret_cast<void*>(target_shape), DataType_INT64, shape, 1);
auto ort_output = ortki_Expand(ort_input, shape_tensor);
std::unique_ptr<output_tensor_type> ntt_output2(new output_tensor_type);
NttTest::ort2ntt(ort_output, *ntt_output2);

// compare
EXPECT_TRUE(NttTest::compare_tensor(*ntt_output1, *ntt_output2));
}

int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}