diff --git a/.github/disable-workflows/compiler-python-build.yml b/.github/disable-workflows/compiler-python-build.yml index c70237c0bd..af22688caf 100644 --- a/.github/disable-workflows/compiler-python-build.yml +++ b/.github/disable-workflows/compiler-python-build.yml @@ -38,7 +38,7 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 if: runner.os == 'Windows' with: name: nncase-python-windows @@ -46,7 +46,7 @@ jobs: if-no-files-found: error - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 if: runner.os == 'Linux' with: name: nncase-python-linux @@ -54,7 +54,7 @@ jobs: if-no-files-found: error - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 if: runner.os == 'Macos' with: name: nncase-python-macos diff --git a/.github/disable-workflows/runtime-k210.yml b/.github/disable-workflows/runtime-k210.yml index cc98ee8c3b..d3b2cdb551 100644 --- a/.github/disable-workflows/runtime-k210.yml +++ b/.github/disable-workflows/runtime-k210.yml @@ -73,7 +73,7 @@ jobs: run: cmake --install . --prefix ../install - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-k210 path: ${{github.workspace}}/install diff --git a/.github/disable-workflows/runtime-linux-x64-gcc.yml b/.github/disable-workflows/runtime-linux-x64-gcc.yml index fe755cd81a..fc8ebf539e 100644 --- a/.github/disable-workflows/runtime-linux-x64-gcc.yml +++ b/.github/disable-workflows/runtime-linux-x64-gcc.yml @@ -58,14 +58,14 @@ jobs: run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-linux-x64-gcc path: ${{github.workspace}}/install if-no-files-found: error - name: Upload Benchmark Result - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncasebenchmark-linux-x64-gcc path: ${{github.workspace}}/benchnncase.log diff --git a/.github/disable-workflows/runtime-macos-x64-appleclang.yml b/.github/disable-workflows/runtime-macos-x64-appleclang.yml index 45b4a489dc..eabefadda9 100644 --- a/.github/disable-workflows/runtime-macos-x64-appleclang.yml +++ b/.github/disable-workflows/runtime-macos-x64-appleclang.yml @@ -55,14 +55,14 @@ jobs: run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-macos-x64-appleclang path: ${{github.workspace}}/install if-no-files-found: error - name: Upload Benchmark Result - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncasebenchmark-macos-x64-appleclang path: ${{github.workspace}}/benchnncase.log diff --git a/.github/disable-workflows/runtime-win-x64-msvc.yml b/.github/disable-workflows/runtime-win-x64-msvc.yml index a51d8be994..374cf15770 100644 --- a/.github/disable-workflows/runtime-win-x64-msvc.yml +++ b/.github/disable-workflows/runtime-win-x64-msvc.yml @@ -54,14 +54,14 @@ jobs: run: .\install\bin\benchnncase.exe > benchnncase.log - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-win-x64-msvc path: ${{github.workspace}}/install if-no-files-found: error - name: Upload 
Benchmark Result - uses: actions/upload-artifact@v2.2.2 + uses: actions/upload-artifact@v4 with: name: nncasebenchmark-win-x64-msvc path: ${{github.workspace}}/benchnncase.log diff --git a/.github/workflows/compiler-build.yml b/.github/workflows/compiler-build.yml index 2c7d703282..5348c36676 100644 --- a/.github/workflows/compiler-build.yml +++ b/.github/workflows/compiler-build.yml @@ -61,7 +61,7 @@ jobs: cmake --install build/${{matrix.config.buildType}} --prefix install - name: Upload nncase Native Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncase-native-${{matrix.config.name}} path: ${{github.workspace}}/install @@ -97,7 +97,7 @@ jobs: ${{ runner.os }}-nuget- - name: Install nncase native Artifact - uses: actions/download-artifact@v2.0.9 + uses: actions/download-artifact@v4 with: name: nncase-native-${{matrix.config.name}} path: ${{github.workspace}}/install @@ -139,7 +139,7 @@ jobs: dotnet-coverage merge -o coverage.unit.xml -f cobertura -r coverage/*.xml - name: Upload Coverage - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: matrix.config.name == 'x86_64-linux' with: name: nncase-coverage-unit @@ -147,7 +147,7 @@ jobs: if-no-files-found: error - name: Upload nncase Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncase-${{matrix.config.name}} path: ${{github.workspace}}/src/Nncase.Compiler/bin/${{matrix.config.buildType}}/net${{matrix.dotnet-version}}/${{matrix.config.rid}}/publish @@ -185,13 +185,13 @@ jobs: cache-dependency-path: '**/requirements.test.txt' - name: Install nncase native Artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: nncase-native-${{matrix.config.name}} path: ${{github.workspace}}/install - name: Install nncase - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: nncase-${{matrix.config.name}} path: ${{github.workspace}}/install @@ -259,7 +259,7 @@ jobs: dotnet-coverage merge -o coverage.integration.xml -f cobertura -r coverage/*.xml - name: Upload Coverage - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: matrix.config.name == 'x86_64-linux' with: name: nncase-coverage-integration @@ -283,13 +283,13 @@ jobs: dotnet-version: "8.0" - name: Download Unit Test Coverage - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: nncase-coverage-unit path: ${{github.workspace}}/coverage - name: Download Integration Test Coverage - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: nncase-coverage-integration path: ${{github.workspace}}/coverage @@ -314,7 +314,7 @@ jobs: reportgenerator -reports:coverage.xml -targetdir:"coveragereport" -reporttypes:Html - name: Upload Coverage Report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncase-coverage-report path: coveragereport diff --git a/.github/workflows/compiler-python-release.yml b/.github/workflows/compiler-python-release.yml index 7b36487bbc..dc4cff2939 100644 --- a/.github/workflows/compiler-python-release.yml +++ b/.github/workflows/compiler-python-release.yml @@ -39,7 +39,7 @@ jobs: dotnet publish src/Nncase.Compiler -c ${{matrix.config.buildType}} --no-restore --sc false -r ${{matrix.config.rid}} - name: Upload nncase Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncase-${{matrix.config.name}} path: 
${{github.workspace}}/src/Nncase.Compiler/bin/${{matrix.config.buildType}}/net${{matrix.dotnet-version}}/${{matrix.config.rid}}/publish @@ -69,7 +69,7 @@ jobs: dotnet-version: ${{matrix.dotnet-version}} - name: Install nncase - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: nncase-${{matrix.config.name}} path: ${{github.workspace}}/install @@ -97,7 +97,7 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse - name: Upload nncase-python Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncase-python-${{matrix.config.name}} path: ${{github.workspace}}/wheelhouse diff --git a/.github/workflows/runtime-build.yml b/.github/workflows/runtime-build.yml index 7a1e8746fa..ea95335d53 100644 --- a/.github/workflows/runtime-build.yml +++ b/.github/workflows/runtime-build.yml @@ -67,14 +67,14 @@ jobs: # cat benchnncase.log - name: Upload nncaseruntime Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-${{matrix.config.name}} path: ${{github.workspace}}/install if-no-files-found: error #- name: Upload nncaseruntime Benchmark - # uses: actions/upload-artifact@v3 + # uses: actions/upload-artifact@v4 # with: # name: nncaseruntime-benchmark-${{matrix.config.name}} # path: ${{github.workspace}}/benchnncase.log @@ -134,7 +134,7 @@ jobs: ctest -C ${{matrix.config.buildType}} --test-dir src/Native/test/ctest --output-on-failure -j4 - name: Upload nncaseruntime Build Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: nncaseruntime-${{matrix.config.name}} path: ${{github.workspace}}/install diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs index 3eb5c56457..f31e6216a7 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs @@ -41,9 +41,9 @@ public static string ToC(this PrimType primType) => { ReduceOp.Min => "min", ReduceOp.Max => "max", - ReduceOp.Sum => "add", + ReduceOp.Sum => "sum", ReduceOp.Mean => "mean", - ReduceOp.Prod => "mul", + ReduceOp.Prod => "prod", _ => throw new NotImplementedException(), }; diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs index e67399d2a4..9546629d5f 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs @@ -452,7 +452,7 @@ protected override CSymbol VisitCall(Call expr) IndentScope.Writer.Write($"pad<{string.Join(",", pad.Paddings)}>({Visit(args[0]).Name}, {Visit(args[1]).Name}, {args[0].CheckedDataType.ToC()} {{ {pad.PadValue} }} );\n"); break; case TIR.CPU.Reduce reduce: - IndentScope.Writer.Write($"reduce({Visit(args[0]).Name}, {Visit(args[1]).Name}, fixed_shape<{string.Join(",", reduce.Axis)}>{{}}, fixed_shape<{string.Join(",", reduce.PackedAxes)}>{{}}, fixed_shape<{string.Join(",", reduce.PadedNums)}>{{}});\n"); + IndentScope.Writer.Write($"reduce_{reduce.ReduceOp.ToC()}, fixed_shape<{string.Join(",", reduce.PackedAxes)}>, fixed_shape<{string.Join(",", reduce.PadedNums)}>>({Visit(args[0]).Name}, {Visit(args[1]).Name});\n"); break; case TIR.CPU.ReduceArg reduceArg: IndentScope.Writer.Write($"reduce_arg({Visit(args[0]).Name}, {Visit(args[1]).Name}, fixed_shape<>{{}}, fixed_shape<>{{}});\n"); diff --git 
a/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs b/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs index ec1342c708..711125efb5 100644 --- a/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs +++ b/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs @@ -35,52 +35,6 @@ public PackRule(int rank, int lane) public override Expr? GetReplace(IMatchResult result, RunPassContext options) => throw new NotImplementedException(); } -public class PackSoftmax : PackRule -{ - public PackSoftmax(int rank, int lane) - : base(rank, lane) - { - } - - public override Pattern Pattern { get; } = IsSoftmax( - "target", - IsWildcard("input") with { TypePattern = IsFloat() }, - IsWildcard("axis") with { TypePattern = IsIntegralScalar() }); - - public override List GetReplaceCandidates(IMatchResult result, RunPassContext context) - { - var rets = new List(); - var input = (Expr)result["input"]; - var axis = ((TensorConst)result["axis"]).Value.ToScalar(); - var inShape = input.CheckedShape.ToValueArray(); - - void AddCandidate(int[] packedAxes, int[] lanes) - { - var packed = IR.F.CPU.Pack(PackUtility.PadForPack(input, inShape, packedAxes, lanes, float.NegativeInfinity, out var pads), lanes, packedAxes); - var softmax = IR.F.CPU.PackedSoftmax(packed, axis, packedAxes); - if (softmax.CheckedType is not InvalidType) - { - var post = PackUtility.SliceForPack(IR.F.CPU.Unpack(softmax, lanes, packedAxes), inShape, pads); - rets.Add(post); - } - } - - for (int i = 0; i < input.CheckedShape.Count; i++) - { - AddCandidate(new[] { i }, new[] { Lane }); - for (int j = i + 1; j < input.CheckedShape.Count; j++) - { - if (Rank > 1) - { - AddCandidate(new[] { i, j }, new[] { Lane, Lane }); - } - } - } - - return rets; - } -} - public sealed class PackResizeImage : PackRule { public PackResizeImage(int rank, int lane) @@ -190,78 +144,6 @@ void AddCandidate(int[] packedAxes, int[] lanes) } } -public sealed class PackLayerNorm : PackRule -{ - public PackLayerNorm(int rank, int lane) - : base(rank, lane) - { - } - - public override Pattern Pattern { get; } = IsLayerNorm( - "target", - _ => true, - IsWildcard("input") with { TypePattern = IsFloat() }, - IsWildcard("scale") with { TypePattern = IsFloat() }, - IsWildcard("bias") with { TypePattern = IsFloat() }); - - public override List GetReplaceCandidates(IMatchResult result, RunPassContext context) - { - var rets = new List(); - var op = (IR.NN.LayerNorm)result["target"]; - var input = (Expr)result["input"]; - var scale = (Expr)result["scale"]; - var bias = (Expr)result["bias"]; - var inShape = input.CheckedShape.ToValueArray(); - var pshape = scale.CheckedShape.ToValueArray(); - - void AddCandidate(int[] packedAxes, int[] lanes) - { - var packedInput = IR.F.CPU.Pack(PackUtility.PadForPack(input, inShape, packedAxes, lanes, 0f, out var padsInput), lanes, packedAxes); - - // todo support padings. 
- if (padsInput.Any(x => x > 0)) - { - return; - } - - var pAxes = packedAxes.Where(i => i >= op.Axis).Select(i => i - op.Axis).ToArray(); - var packedScale = PackUtility.PadForPack(scale, pshape, pAxes, lanes, 0f, out var padsScale); - if (pAxes.Length > 0) - { - packedScale = IR.F.CPU.Pack(packedScale, Enumerable.Repeat(Lane, pAxes.Length).ToArray(), pAxes); - } - - var packedBias = PackUtility.PadForPack(bias, pshape, pAxes, lanes, 0f, out var padsBias); - if (pAxes.Length > 0) - { - packedBias = IR.F.CPU.Pack(packedBias, Enumerable.Repeat(Lane, pAxes.Length).ToArray(), pAxes); - } - - var layernorm = IR.F.CPU.PackedLayerNorm(packedInput, packedScale, packedBias, op.Axis, op.Epsilon, op.UseMean, packedAxes, padsInput); - - if (layernorm.CheckedType is not InvalidType) - { - var post = PackUtility.SliceForPack(IR.F.CPU.Unpack(layernorm, lanes, packedAxes), inShape, padsInput); - rets.Add(post); - } - } - - for (int i = 0; i < input.CheckedShape.Count; i++) - { - AddCandidate(new[] { i }, new[] { Lane }); - for (int j = i + 1; j < input.CheckedShape.Count; j++) - { - if (Rank > 1) - { - AddCandidate(new[] { i, j }, new[] { Lane, Lane }); - } - } - } - - return rets; - } -} - public sealed class PackMatMul : PackRule { public PackMatMul(int rank, int lane) diff --git a/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs b/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs index 99d894abfa..c391ff123c 100644 --- a/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs +++ b/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs @@ -94,9 +94,7 @@ public void RegisterTargetDependentAfterQuantPass(IPassManager passManager, Comp // todo config it in the target options. var rank = 1; var lane = System.Runtime.Intrinsics.Vector256.IsHardwareAccelerated ? 8 : 4; - p.Add(rank, lane); p.Add(rank, lane); - p.Add(rank, lane); p.Add(rank, lane); p.Add(rank, lane); p.Add(rank, lane); diff --git a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h index caa7a621f2..26c553e989 100644 --- a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h +++ b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h @@ -651,6 +651,60 @@ template <> struct max> { } }; +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return init_value + operator()(v); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit vector directly + __m128 sum = + _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return ntt::max(init_value, operator()(v)); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit vector directly + __m128 sum = + _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_max_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_max_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return ntt::min(init_value, operator()(v)); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit 
vector directly + __m128 sum = + _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_min_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_min_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + template struct mma, ntt::vector, ntt::vector> { diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h index fe06e06fd7..721f530dc9 100644 --- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h +++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h @@ -15,10 +15,12 @@ #pragma once #include "../../ukernels.h" #include "arch_types.h" +#include "nncase/ntt/vector.h" +#include namespace nncase::ntt::ukernels { template -class upack> { +class u_pack> { public: constexpr void operator()(const float *input, vector *output) noexcept { @@ -37,4 +39,8 @@ class upack> { } } }; + +template struct u_reduce_policy { + static constexpr size_t unroll = 8; +}; } // namespace nncase::ntt::ukernels diff --git a/src/Native/include/nncase/ntt/kernels/pack.h b/src/Native/include/nncase/ntt/kernels/pack.h index 982b25e845..3237f183e9 100644 --- a/src/Native/include/nncase/ntt/kernels/pack.h +++ b/src/Native/include/nncase/ntt/kernels/pack.h @@ -118,7 +118,7 @@ class pack_impl { slice_fixed_dims( TOut::shape()); constexpr auto N = rest_dims.length(); - ntt::upack(in_p, out_p); + ntt::u_pack(in_p, out_p); } else { for (size_t i = 0; i < TOut::shape()[Axis]; i++) { apply_transpose( diff --git a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h b/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h deleted file mode 100644 index 6432e7685c..0000000000 --- a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright 2019-2021 Canaan Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
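Reviewer note: the three reduce<ops::add/max/min, ntt::vector<float, 8>> specializations added in primitive_ops.h above all use the same narrowing pattern: fold the high 128-bit half of the register into the low half, fold the remaining 64 bits, then the last 32 bits. The sketch below is a standalone illustration of that horizontal add written directly against <immintrin.h>; the helper name hsum256 and the check in main are mine for this review, not part of the patch. Compile with AVX enabled (e.g. -mavx).

```cpp
// Standalone sketch (not from the tree): the same horizontal-add sequence
// used by the new reduce<ops::add, ntt::vector<float, 8>> specialization,
// expressed with plain AVX intrinsics so it can be compiled in isolation.
#include <immintrin.h>
#include <cassert>
#include <cmath>

static float hsum256(__m256 v) {
    // Fold the high 128-bit lane into the low lane:
    // [a0+a4, a1+a5, a2+a6, a3+a7]
    __m128 sum =
        _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
    // Fold the upper 64 bits: lane 0 holds (a0+a4)+(a2+a6),
    // lane 1 holds (a1+a5)+(a3+a7)
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    // Add lane 1 into lane 0 to obtain the sum of all eight elements
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
    return _mm_cvtss_f32(sum);
}

int main() {
    alignas(32) float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float expected = 0.f;
    for (float x : data)
        expected += x; // scalar reference: 36
    float got = hsum256(_mm256_load_ps(data));
    assert(std::fabs(got - expected) < 1e-6f);
    return 0;
}
```

The two-argument overloads in the patch simply combine this horizontal result with init_value via +, max, or min, so a caller can carry a running accumulator across vectors; the new ukernels::u_reduce_policy with unroll = 8 presumably tells the generic reduce loop how many such accumulators to keep in flight, though that loop is outside these hunks.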
- */ -#pragma once -#include "../apply.h" -#include "../tensor_ops.h" -#include "../utility.h" -#include "binary.h" -#include "unary.h" - -namespace nncase::ntt { - -namespace packed_layer_norm_detail { - -template -void within_axis_pack_impl(const TIn &input, const TScale &scale, - const TBias &bias, TOut &&output, const TEp &epsilon, - const bool &use_mean, PackedAxes, PadedNums) { - using TElem = typename TIn::element_type; - constexpr auto input_shape = typename TIn::shape_type{}; - constexpr auto input_strides = typename TIn::strides_type{}; - constexpr auto scale_shape = typename TScale::shape_type{}; - constexpr auto scale_strides = typename TScale::strides_type{}; - constexpr auto bias_shape = typename TBias::shape_type{}; - constexpr auto bias_strides = typename TBias::strides_type{}; - constexpr auto output_shape = typename std::decay_t::shape_type{}; - constexpr auto output_strides = typename std::decay_t::strides_type{}; - constexpr size_t in_contigous_dim = - contiguous_dims(input_shape, input_strides); - constexpr size_t scale_contiguous_dims = - contiguous_dims(scale_shape, scale_strides); - constexpr size_t bias_contiguous_dims = - contiguous_dims(bias_shape, bias_strides); - constexpr size_t output_contiguous_dims = - contiguous_dims(output_shape, output_strides); - static_assert(in_contigous_dim != 0 || scale_contiguous_dims != 0 || - bias_contiguous_dims != 0 || output_contiguous_dims != 0, - "currently not support no contiguous!"); - static_assert(is_same_seq(input_shape, output_shape), "shape not match"); - static_assert(is_same_seq(input_strides, output_strides), - "strides not match"); - constexpr auto domain = slice_fixed_dims(input_shape); - constexpr auto strides = slice_fixed_dims(input_strides); - - constexpr size_t inner_size = - slice_fixed_dims(input_shape).length(); - // constexpr size_t no_paded_rank = - // PackedAxes::rank() == 0 ? 0 - // : input_shape.rank() - PackedAxes::at(0) - 1; - // constexpr size_t paded_axis = - // PackedAxes::rank() == 0 ? 0 : PackedAxes::at(0) + 1; - // // clang-format off - // constexpr size_t paded_inner_size = (PadedNums::rank() == 0 || - // (PadedNums::rank() == 1 && PadedNums::at(0) == 0)) - // ? 
0 - // : PadedNums::at(0) * slice_fixed_dims(input_shape).length(); - // // clang-format on - constexpr bool UseVectorReduce = - PackedAxes::rank() == 1 && PackedAxes::at(0) >= Axis; - - TElem finner_size = (TElem)inner_size; - if constexpr (UseVectorReduce) { - finner_size = finner_size * (TElem)TElem::shape_type::length(); - } - // remove pad nums, NOTE after mul elem size - // finner_size = sub_op(finner_size, paded_inner_size); - - apply(domain, [&](auto index) { - const auto input_p = - input.elements().data() + linear_offset(index, strides); - const auto scale_p = scale.elements().data(); - const auto bias_p = bias.elements().data(); - auto output_p = - output.elements().data() + linear_offset(index, strides); - - // compute mean - TElem mean1 = (TElem)0; - if (use_mean) { - for (size_t i = 0; i < inner_size; i++) - mean1 = mean1 + (input_p[i] / finner_size); - if constexpr (UseVectorReduce) { - mean1 = (TElem)reduce_sum(mean1); - } - } - - std::array sub; - for (auto i = 0; i < inner_size; i++) - sub[i] = input_p[i] - mean1; - - std::array pow; - for (auto i = 0; i < inner_size; i++) - pow[i] = sub[i] * sub[i]; - - TElem mean2 = (TElem)0; - for (auto i = 0; i < inner_size; i++) - mean2 = mean2 + (pow[i] / finner_size); - if constexpr (UseVectorReduce) { - mean2 = (TElem)reduce_sum(mean2); - } - - TElem add = mean2 + epsilon; - TElem sqrt = ntt::sqrt(add); - - std::array norm; - for (auto i = 0; i < inner_size; i++) - norm[i] = sub[i] / sqrt; - - for (auto i = 0; i < inner_size; i++) - output_p[i] = (norm[i] * scale_p[i]) + (TElem)bias_p[i]; - }); -} -} // namespace packed_layer_norm_detail - -template -void packed_layer_norm(const TIn &input, const TScale &scale, const TBias &bias, - TOut &&output, const TEp &epsilon, const bool &use_mean, - PackedAxes packedAxes, PadedNums padedNums) { - static_assert(PackedAxes::rank() < 2, "currently not support 2d packing."); - if constexpr (PackedAxes::rank() <= 1) { - static_assert(PadedNums::rank() == 0 || - (PadedNums::rank() == 1 && PadedNums::at(0) == 0), - "not support padding"); - packed_layer_norm_detail::within_axis_pack_impl( - input, scale, bias, output, epsilon, use_mean, packedAxes, - padedNums); - } -} -} // namespace nncase::ntt diff --git a/src/Native/include/nncase/ntt/kernels/packed_softmax.h b/src/Native/include/nncase/ntt/kernels/packed_softmax.h deleted file mode 100644 index dbeebb40e6..0000000000 --- a/src/Native/include/nncase/ntt/kernels/packed_softmax.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2019-2021 Canaan Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
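Reviewer note: packed_layer_norm.h is removed outright rather than updated. For readers who want a reference of what that kernel computed per outer index, the scalar sketch below restates the same steps (optional mean, variance of the centered values, normalization by sqrt(var + eps), then scale and bias). The function name layer_norm_ref is hypothetical, and the packing/padding bookkeeping the deleted code handled is deliberately left out.

```cpp
// Scalar reference (written for this review) of the per-slice computation
// performed by the removed packed_layer_norm kernel:
//   y[i] = (x[i] - mean) / sqrt(var + eps) * scale[i] + bias[i]
#include <cmath>
#include <cstddef>

void layer_norm_ref(const float *x, const float *scale, const float *bias,
                    float *y, size_t inner_size, float eps, bool use_mean) {
    float mean = 0.f;
    if (use_mean) {
        for (size_t i = 0; i < inner_size; i++)
            mean += x[i] / inner_size; // mean over the normalized axis
    }
    float var = 0.f;
    for (size_t i = 0; i < inner_size; i++) {
        float d = x[i] - mean;
        var += d * d / inner_size; // mean of squared deviations
    }
    float inv_std = 1.f / std::sqrt(var + eps);
    for (size_t i = 0; i < inner_size; i++)
        y[i] = (x[i] - mean) * inv_std * scale[i] + bias[i];
}
```

The deleted packed_softmax.h that follows is in the same position; both presumably now go through the generic reduce/vectorized-op path added elsewhere in this change rather than dedicated packed kernels.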
- */ -#pragma once -#include "../apply.h" -#include "../shape_infer/reduce_axis.h" -#include "../tensor_ops.h" -#include "../utility.h" -#include "binary.h" -#include "unary.h" -#include - -namespace nncase::ntt { - -namespace softmax_detail { -template -void packed_on_axis_impl(const TIn &input, TOut &&output, - [[maybe_unused]] PackedAxes packedAxes) { - using TElem = typename TIn::element_type; - constexpr auto input_shape = typename TIn::shape_type{}; - constexpr auto output_shape = typename std::decay_t::shape_type{}; - static_assert(is_same_seq(input_shape, output_shape), - "the input output shape not equal!"); - - constexpr auto need_reduce = - PackedAxes::rank() != 0 && Axis == PackedAxes::at(0); - constexpr auto domain = - shape_infer::reduced_shape_by_axis(input_shape); - apply(domain, [&](auto index) { - // max - TElem max_value = input(index); - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - max_value = max(max_value, input(index)); - } - - // reduce_max - if constexpr (need_reduce) { - max_value = (TElem)reduce_max(max_value); - } - - // (x - reduce_max) * beta - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = input(index) - max_value; - } - - // exp((x - reduce_max) * beta) and sum - TElem sum = (TElem)0; - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = exp(output(index)); - sum += output(index); - } - - // reduce sum - if constexpr (need_reduce) { - sum = (TElem)reduce_sum(sum); - } - - // div - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = output(index) / sum; - } - }); -} - -template -void packed_softmax_1d(const TIn &input, TOut &&output, PackedAxes packedAxes) { - packed_on_axis_impl(input, output, packedAxes); -} - -} // namespace softmax_detail - -/** - * @brief packed softmax - * implement notice: - * 1. need support 2d pack. - * 2. need support paded nums. - * 3. need different implementation when the packed axis is equal or not - * equal axis. - * @tparam Axis softmax reduced axis - * @param input input tensor. - * @param output output output. - * @param packedAxes packed axes - */ -template -void packed_softmax(const TIn &input, TOut &&output, - [[maybe_unused]] PackedAxes packedAxes - /* , [[maybe_unused]] PadedNums padednums */) noexcept { - static_assert(PackedAxes::rank() < 2, "currently not support 2d pack"); - // static_assert(PadedNums::at(0) == 0, "currently not support pad"); - softmax_detail::packed_softmax_1d(input, output, packedAxes); -} -} // namespace nncase::ntt diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h index 70d8ab05f5..2bcfcc81ca 100644 --- a/src/Native/include/nncase/ntt/kernels/reduce.h +++ b/src/Native/include/nncase/ntt/kernels/reduce.h @@ -14,132 +14,171 @@ */ #pragma once #include "../apply.h" -#include "../loop.h" #include "../primitive_ops.h" +#include "../profiler.h" +#include "../shape_infer/reduce.h" #include "../tensor_ops.h" -#include "../unrool.h" +#include "../tensor_traits.h" +#include "../ukernels.h" #include "../utility.h" +#include "nncase/ntt/shape.h" +#include +#include +#include namespace nncase::ntt { +namespace detail { -namespace reduce_detail { - -template