NTT’s 5th optimization #1264

Open · wants to merge 95 commits into base: dev/3.0
b061ed4
add dynamic for gather
Nov 5, 2024
7348fef
Update ctest and benchmark case for gather.
zhangyang2057 Nov 5, 2024
dfbfe20
add dynamic ctest
Nov 5, 2024
12a1111
revise function name to snake case
Nov 5, 2024
a92a1b6
change variable name to snake case
Nov 5, 2024
0796b1c
omitted variable names
Nov 6, 2024
1f2d803
Add strides support and more benchmark cases for slice.
zhangyang2057 Nov 6, 2024
f3b2715
Merge branch 'feature/ntt_benchmark_roofline_5' of https://github.com…
Nov 6, 2024
9834c5b
add just for slice benchmark
Nov 6, 2024
973b451
limit to l1 size
Nov 6, 2024
19826e3
Merge branch 'dev/3.0' into feature/ntt_benchmark_roofline_5
zhangyang2057 Nov 6, 2024
621dbb3
change shape for accuracy test
Nov 6, 2024
850d9c4
change for other cases
Nov 6, 2024
45eaaa5
expand dim1 dims
Nov 6, 2024
58bab9a
add ctest for pack and init opt
Nov 7, 2024
bed436b
fallback for some errors
Nov 7, 2024
c827ac9
Fix pack benchmark and add ctest/benchmark test for unpack.
zhangyang2057 Nov 7, 2024
82f6d58
Update ctest and benchmark test of unpack.
zhangyang2057 Nov 8, 2024
d5cdf25
some prepare code for 2d pack
Nov 8, 2024
7e2fa68
Merge branch 'feature/ntt_benchmark_roofline_5' of https://github.com…
Nov 8, 2024
9bfa3f7
opt for 2d pack x86
Nov 11, 2024
7c25189
remove useless code
Nov 11, 2024
d0f68a4
add benchmark info for pack
Nov 12, 2024
d64b093
opt for 2d pack
Nov 12, 2024
3ba0faf
a better way
Nov 12, 2024
43f449e
revise unalign problems
Nov 12, 2024
1f8ce94
change size for benchmark unary test
Nov 13, 2024
dfdb94b
change benchmark for pack
Nov 13, 2024
2da9a19
revise typo in ifdef
Nov 13, 2024
711ce0f
add ctest for pack
Nov 13, 2024
99554df
remove iostream include
Nov 13, 2024
bbdac69
adjust for benchmark shape
Nov 13, 2024
bb8d6a8
Optimize unpack rvv 1D and update roofline.
zhangyang2057 Nov 14, 2024
0c20459
Add missing u_unpack.h.
zhangyang2057 Nov 14, 2024
1386706
Optimize unpack 1D rvv (C/H: 8 -> 7 cycles).
zhangyang2057 Nov 15, 2024
08b4770
Optimize u_unpack 1D and rvv(N/C/H in 7 cycles)
zhangyang2057 Nov 15, 2024
94112e4
Add ranked shape support for unpack 1D.
zhangyang2057 Nov 15, 2024
fee0136
add some opt for 2d pack
Nov 18, 2024
7b519e9
limit the size to l1 size
Nov 18, 2024
ef6cb97
index opts for pack2d
Nov 18, 2024
ee86467
add roofline info
Nov 18, 2024
b2a91ce
remove some typo bugs
Nov 18, 2024
bb2e953
some small opts
Nov 18, 2024
ffb589b
fallback
Nov 18, 2024
ef19cb3
adjust size for l1 limit
Nov 18, 2024
4bf1fe5
add roofline info for pack hw
Nov 18, 2024
0a2d032
more ctest for pack
Nov 18, 2024
5e38186
add more ctest for pack 2d
Nov 18, 2024
b24e218
add roofline info for unpack
Nov 19, 2024
ddf9d59
change unpack benchmark test
Nov 19, 2024
eb7eb69
revise some special unpack
Nov 19, 2024
e2cf8da
update u_unpack_1d as u_unpack_1d_fixed.
zhangyang2057 Nov 19, 2024
07275bf
try to fix macos build.
zhangyang2057 Nov 19, 2024
6c2b712
revise build error for windows
Nov 19, 2024
73ff1c4
revise bug for not continuous
Nov 19, 2024
1657943
change test case for unpack test case
Nov 19, 2024
2dffe92
lower shape info for future opt
Nov 20, 2024
84af5d5
a better way
Nov 20, 2024
38209a5
revise some build problem for riscv
Nov 20, 2024
56a6278
revise typo
Nov 20, 2024
50b8a83
lower Axis info for some opt
Nov 20, 2024
47ce646
Add 2D support for unpack fixed_shape.
zhangyang2057 Nov 20, 2024
39cc7b7
some opt for 1d unpack
Nov 21, 2024
cde9768
update for l1 limit
Nov 21, 2024
fa177bc
lower some shape info for opts
Nov 21, 2024
1bfe0f0
Fix unpack 2d bug and update hw roofline.
zhangyang2057 Nov 21, 2024
40b44b0
revise for align problems
Nov 21, 2024
8df9bf5
Merge branch 'feature/ntt_benchmark_roofline_5' of https://github.com…
Nov 21, 2024
88aaa48
revise segmentfault problem
Nov 21, 2024
d399281
revise clamp align problem
Nov 21, 2024
0b886aa
Optimize unpack 2D.
zhangyang2057 Nov 22, 2024
bfe4e7a
Fix unpack<C, N> 2D bug and add ctest case.
zhangyang2057 Nov 22, 2024
0e718f4
Optimize unpack 2d fixed shape and add ranked shape version.
zhangyang2057 Nov 22, 2024
dd98ff4
Merge branch 'dev/3.0' into feature/ntt_benchmark_roofline_5
zhangyang2057 Nov 22, 2024
806a1d4
add 2d unpack opt
Nov 22, 2024
a9d4da4
change size for gather test
Nov 22, 2024
59bf01b
Add rvv optimization for pack 1D fixed shape.
zhangyang2057 Nov 26, 2024
1676266
uniform output optimization of benchmark test for x86_64.
zhangyang2057 Nov 26, 2024
a06d89c
some change for cast
Nov 27, 2024
9f3fe8c
some build problems for macos
Nov 27, 2024
ff381de
for more build error for clang
Nov 27, 2024
ddf2d95
build error for windows
Nov 27, 2024
d0d0dd9
change cycle error for cast
Nov 27, 2024
64af79d
add special version for bool2float
Nov 27, 2024
5f4cdfb
Add u_pack2d for rvv(NC/CH boost 40%+).
zhangyang2057 Nov 27, 2024
c938b69
opt code for more general
Nov 27, 2024
aabed15
Merge branch 'feature/ntt_benchmark_roofline_5' of https://github.com…
Nov 27, 2024
6fc4503
fix build for macos
Nov 27, 2024
a6360c3
try again for macos build
Nov 27, 2024
85eda40
fallback for no work
Nov 27, 2024
0e833b6
Optimize unpack 2d fixed shape and ranked shape for rvv(7.6 -> 7 cycles)
zhangyang2057 Nov 28, 2024
36f2da6
remove test temporary code
Nov 28, 2024
04b7274
add ctest and benchmark for expand
Nov 28, 2024
6d7a89d
add roofline info keywords
Nov 28, 2024
4f847af
adjust benchmark size and add roofline info
Nov 28, 2024
2 changes: 1 addition & 1 deletion ntt/include/nncase/bfloat16.h
@@ -184,7 +184,7 @@ DEFINE_BF16_BINARY_BOOLRET(>=)
DEFINE_BF16_BINARY_BOOLRET(>)

#define DEFINE_BF16_BINARY_SELF_MOD(x, op) \
-    inline bfloat16 &operator x(bfloat16 & a, bfloat16 b) noexcept {          \
+    inline bfloat16 &operator x(bfloat16 &a, bfloat16 b) noexcept {           \
a = a op b; \
return a; \
}
2 changes: 1 addition & 1 deletion ntt/include/nncase/half.h
@@ -187,7 +187,7 @@ DEFINE_FP16_BINARY_BOOLRET(>=)
DEFINE_FP16_BINARY_BOOLRET(>)

#define DEFINE_FP16_BINARY_SELF_MOD(x, op) \
-    inline half &operator x(half & a, half b) noexcept {                      \
+    inline half &operator x(half &a, half b) noexcept {                       \
a = a op b; \
return a; \
}
824 changes: 819 additions & 5 deletions ntt/include/nncase/ntt/arch/riscv64/ukernels.h

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions ntt/include/nncase/ntt/arch/x86_64/primitive_ops.h
@@ -189,6 +189,41 @@ template <> struct cast<ntt::vector<bool, 8>, ntt::vector<float, 8>> {
}
};

// cast
template <> struct cast<ntt::vector<bool, 32>, ntt::vector<float, 8>> {
void operator()(const ntt::vector<bool, 32> &v, ntt::vector<float, 8> &v0,
ntt::vector<float, 8> &v1, ntt::vector<float, 8> &v2,
ntt::vector<float, 8> &v3) const noexcept {
__m256i mask0 = _mm256_setr_epi32(
v(0) ? -1 : 0, v(1) ? -1 : 0, v(2) ? -1 : 0, v(3) ? -1 : 0,
v(4) ? -1 : 0, v(5) ? -1 : 0, v(6) ? -1 : 0, v(7) ? -1 : 0);

// Convert to float (1.0f for true, 0.0f for false)
v0 = _mm256_and_ps(_mm256_castsi256_ps(mask0), _mm256_set1_ps(1.0f));

__m256i mask1 = _mm256_setr_epi32(
v(8) ? -1 : 0, v(9) ? -1 : 0, v(10) ? -1 : 0, v(11) ? -1 : 0,
v(12) ? -1 : 0, v(13) ? -1 : 0, v(14) ? -1 : 0, v(15) ? -1 : 0);

// Convert to float (1.0f for true, 0.0f for false)
v1 = _mm256_and_ps(_mm256_castsi256_ps(mask1), _mm256_set1_ps(1.0f));

__m256i mask2 = _mm256_setr_epi32(
v(16) ? -1 : 0, v(17) ? -1 : 0, v(18) ? -1 : 0, v(19) ? -1 : 0,
v(20) ? -1 : 0, v(21) ? -1 : 0, v(22) ? -1 : 0, v(23) ? -1 : 0);

// Convert to float (1.0f for true, 0.0f for false)
v2 = _mm256_and_ps(_mm256_castsi256_ps(mask2), _mm256_set1_ps(1.0f));

__m256i mask3 = _mm256_setr_epi32(
v(24) ? -1 : 0, v(25) ? -1 : 0, v(26) ? -1 : 0, v(27) ? -1 : 0,
v(28) ? -1 : 0, v(29) ? -1 : 0, v(30) ? -1 : 0, v(31) ? -1 : 0);

// Convert to float (1.0f for true, 0.0f for false)
v3 = _mm256_and_ps(_mm256_castsi256_ps(mask3), _mm256_set1_ps(1.0f));
}
};

// cast
template <> struct cast<ntt::vector<float, 8>, ntt::vector<int, 8>> {
ntt::vector<int, 8>
476 changes: 464 additions & 12 deletions ntt/include/nncase/ntt/arch/x86_64/ukernels.h

Large diffs are not rendered by default.

99 changes: 65 additions & 34 deletions ntt/include/nncase/ntt/kernels/cast.h
@@ -21,64 +21,96 @@

namespace nncase::ntt {
namespace detail {
-template <class Shape, class InStrides, class OutStrides> class cast_impl;
+template <class InShape, class OutShape, class InStrides, class OutStrides>
+class cast_impl;

-template <size_t... Dims, size_t... InStrides, size_t... OutStrides>
-class cast_impl<fixed_shape<Dims...>, fixed_strides<InStrides...>,
-                fixed_strides<OutStrides...>> {
+template <size_t... InDims, size_t... OutDims, size_t... InStrides,
+          size_t... OutStrides>
+class cast_impl<fixed_shape<InDims...>, fixed_shape<OutDims...>,
+                fixed_strides<InStrides...>, fixed_strides<OutStrides...>> {
public:
template <class TIn, class TOut>
constexpr void operator()(const TIn &input, TOut &output) {
-        constexpr size_t rank = sizeof...(Dims);
+        constexpr float scale =
+            (float)TIn::shape().length() / TOut::shape().length();
+
+        if constexpr (scale != 1.0f) {
+            static_assert(TIn::rank() == 1,
+                          "Only support 1D tensor repack for now!");
+        }
+
+        constexpr auto in_offset_scale =
+            scale > 1.0f ? (size_t)scale : (size_t)1;
+        constexpr auto out_offset_scale =
+            scale > 1.0f ? (size_t)1 : (size_t)(1.0f / scale);
+
+        constexpr size_t rank = sizeof...(InDims);
ranked_shape<rank> index{};
constexpr auto conti_dims =
-            std::min(contiguous_dims(fixed_shape<Dims...>{},
+            std::min(contiguous_dims(fixed_shape<InDims...>{},
                      fixed_strides<InStrides...>{}),
-                     contiguous_dims(fixed_shape<Dims...>{},
+                     contiguous_dims(fixed_shape<InDims...>{},
fixed_strides<OutStrides...>{}));
-        apply<TIn, TOut, 0, rank, conti_dims, Dims...>(index, input, output);
+
+        if constexpr (scale >= 1.0f) {
+            apply<in_offset_scale, out_offset_scale, TIn, TOut, 0, rank,
+                  conti_dims, OutDims...>(index, input, output);
+        } else {
+            apply<in_offset_scale, out_offset_scale, TIn, TOut, 0, rank,
+                  conti_dims, InDims...>(index, input, output);
+        }
}

private:
-    template <class TIn, class TOut, size_t Axis, size_t Rank,
-              size_t ContiguousDims, size_t... RestDims>
+    template <size_t in_offset_scale, size_t out_offset_scale, class TIn,
+              class TOut, size_t Axis, size_t Rank, size_t ContiguousDims,
+              size_t... RestDims>
constexpr void apply(ranked_shape<Rank> &index, const TIn &input,
TOut &output) {
if constexpr (ContiguousDims == sizeof...(RestDims)) {
constexpr auto inner_size = fixed_shape<RestDims...>::length();
-            auto input_p =
-                input.elements().data() + linear_offset(index, input.strides());
-            auto output_p = output.elements().data() +
-                            linear_offset(index, output.strides());
-            cast_contiguous<inner_size>(input_p, output_p);
+
+            auto in_offset =
+                linear_offset(index, input.strides()) * in_offset_scale;
+            auto out_offset =
+                linear_offset(index, output.strides()) * out_offset_scale;
+            auto input_p = input.elements().data() + in_offset;
+            auto output_p = output.elements().data() + out_offset;
+            cast_contiguous<in_offset_scale, out_offset_scale, inner_size>(
+                input_p, output_p);
} else {
-            apply_next<TIn, TOut, Axis, Rank, ContiguousDims, RestDims...>(
-                index, input, output);
+            apply_next<in_offset_scale, out_offset_scale, TIn, TOut, Axis, Rank,
+                       ContiguousDims, RestDims...>(index, input, output);
}
}

-    template <class TIn, class TOut, size_t Axis, size_t Rank,
-              size_t ContiguousDims, size_t Dim, size_t... RestDims>
+    template <size_t in_offset_scale, size_t out_offset_scale, class TIn,
+              class TOut, size_t Axis, size_t Rank, size_t ContiguousDims,
+              size_t Dim, size_t... RestDims>
constexpr void apply_next(ranked_shape<Rank> &index, const TIn &input,
TOut &output) {
for (index[Axis] = 0; index[Axis] < Dim; index[Axis]++) {
-            apply<TIn, TOut, Axis + 1, Rank, ContiguousDims, RestDims...>(
-                index, input, output);
+            apply<in_offset_scale, out_offset_scale, TIn, TOut, Axis + 1, Rank,
+                  ContiguousDims, RestDims...>(index, input, output);
}
}

-    template <size_t Extent, class T1, class T2>
+    template <size_t in_offset_scale, size_t out_offset_scale, size_t Extent,
+              class T1, class T2>
     constexpr void cast_contiguous(const T1 *input, T2 *output) {
-        ntt::u_cast(input, 1, output, 1, Extent);
+        ntt::u_cast<T1, T2, in_offset_scale, out_offset_scale>(input, 1, output,
+                                                               1, Extent);
}
};

-template <size_t Rank, class InStrides, class OutStrides>
-class cast_impl<ranked_shape<Rank>, InStrides, OutStrides> {
+template <size_t InRank, size_t OutRank, class InStrides, class OutStrides>
+class cast_impl<ranked_shape<InRank>, ranked_shape<OutRank>, InStrides,
+                OutStrides> {
public:
template <class TIn, class TOut>
constexpr void operator()(const TIn &input, TOut &output) {
-        ranked_shape<Rank> index{};
+        ranked_shape<InRank> index{};
auto conti_dims =
std::min(contiguous_dims(input.shape(), input.strides()),
contiguous_dims(input.shape(), output.strides()));
@@ -87,9 +119,9 @@ class cast_impl<ranked_shape<Rank>, InStrides, OutStrides> {

private:
template <class TIn, class TOut, size_t Axis>
-    constexpr void apply(ranked_shape<Rank> &index, size_t conti_dims,
+    constexpr void apply(ranked_shape<InRank> &index, size_t conti_dims,
                          const TIn &input, TOut &output) {
-        const auto outer_dims = Rank - conti_dims;
+        const auto outer_dims = InRank - conti_dims;
if (Axis >= outer_dims) {
size_t inner_size = 1;
for (size_t i = outer_dims; i < input.shape().rank(); i++)
@@ -99,7 +131,7 @@ class cast_impl<ranked_shape<Rank>, InStrides, OutStrides> {
auto output_p =
output.buffer().data() + linear_offset(index, output.strides());
cast_contiguous(input_p, output_p, inner_size);
-        } else if constexpr (Axis < Rank - 1) {
+        } else if constexpr (Axis < InRank - 1) {
const auto dim = input.shape()[Axis];
for (index[Axis] = 0; index[Axis] < dim; index[Axis]++) {
apply<TIn, TOut, Axis + 1>(index, conti_dims, input, output);
Expand All @@ -109,17 +141,16 @@ class cast_impl<ranked_shape<Rank>, InStrides, OutStrides> {

template <class T1, class T2>
constexpr void cast_contiguous(const T1 *input, T2 *output, size_t extent) {
-        ntt::u_cast(input, 1, output, 1, extent);
+        ntt::u_cast<T1, T2, 1, 1>(input, 1, output, 1, extent);
}
};
} // namespace detail

template <typename TIn, typename TOut>
void cast(const TIn &input, TOut &&output) noexcept {
-    detail::cast_impl<common_shape_t<typename TIn::shape_type,
-                                     typename std::decay_t<TOut>::shape_type>,
-                      typename TIn::strides_type,
-                      typename std::decay_t<TOut>::strides_type>
+    detail::cast_impl<
+        typename TIn::shape_type, typename std::decay_t<TOut>::shape_type,
+        typename TIn::strides_type, typename std::decay_t<TOut>::strides_type>
impl;
impl(input, output);
}