GNNE-1904 fix op's datatype #1068

Merged
merged 60 commits from feature/add_new_dt, Sep 26, 2023
Commits (60)
7dad301
fix hardmax dt
Aug 25, 2023
4b61e92
fix celu's alpha
Aug 25, 2023
79d452a
Merge branch 'master' into feature/add_new_dt
Aug 28, 2023
657f063
fix print tensor fun
Aug 28, 2023
154c27e
add json file clear
Aug 28, 2023
52af856
add trilu test
Aug 28, 2023
f1cb055
Merge branch 'master' into feature/add_new_dt
Aug 28, 2023
8153889
add gather test
Aug 28, 2023
0dcce1a
Apply code-format changes
HeJunchao100813 Aug 28, 2023
8bae082
Merge branch 'master' into feature/add_new_dt
Aug 29, 2023
3b15330
Merge branch 'master' into feature/add_new_dt
Aug 30, 2023
d57fa6b
fix
Aug 30, 2023
e74fdbb
fix
Aug 30, 2023
7fac954
fix
Aug 31, 2023
531af9e
Merge branch 'master' into feature/add_new_dt
Aug 31, 2023
07f18db
Merge branch 'master' into feature/add_new_dt
Sep 6, 2023
6d6f501
Apply code-format changes
HeJunchao100813 Sep 6, 2023
3f65b0b
Merge branch 'master' into feature/add_new_dt
HeJunchao100813 Sep 7, 2023
9ca355d
fix hardmax
Sep 18, 2023
cc3633b
fix
Sep 18, 2023
2448f74
fix
Sep 18, 2023
452b40a
fix
Sep 18, 2023
6d760d1
Apply code-format changes
HeJunchao100813 Sep 18, 2023
e9b4486
fix matmul
Sep 18, 2023
f3ff273
Merge remote-tracking branch 'origin/feature/add_new_dt' into feature…
Sep 18, 2023
18d1b72
Merge branch 'master' into feature/add_new_dt
curioyang Sep 18, 2023
11229a6
fix reduce&&tile&&ln
Sep 18, 2023
d7718e1
Merge remote-tracking branch 'origin/feature/add_new_dt' into feature…
Sep 18, 2023
8589f8f
fix
Sep 18, 2023
bde39f5
fix
Sep 18, 2023
ed4a3f4
fix
Sep 18, 2023
604a40f
fix
Sep 19, 2023
8731784
Apply code-format changes
HeJunchao100813 Sep 19, 2023
431ea23
fix
Sep 19, 2023
91930c0
fix
Sep 19, 2023
d8e16bf
fix
Sep 19, 2023
4c46844
fix
Sep 19, 2023
5c50512
fix
Sep 20, 2023
418f07d
fix
Sep 20, 2023
f75785e
fix
Sep 20, 2023
b4d32d2
Apply code-format changes
HeJunchao100813 Sep 20, 2023
5c10735
fix
Sep 21, 2023
d8c8855
Merge branch 'master' into feature/add_new_dt
Sep 21, 2023
e90faf4
Apply code-format changes
HeJunchao100813 Sep 21, 2023
4266dac
Merge branch 'master' into feature/add_new_dt
HeJunchao100813 Sep 22, 2023
4ccbf0e
fix
Sep 22, 2023
13dbb80
Merge remote-tracking branch 'origin/feature/add_new_dt' into feature…
Sep 22, 2023
e947dc1
Merge branch 'master' into feature/add_new_dt
Sep 22, 2023
e3c72b5
Apply code-format changes
HeJunchao100813 Sep 22, 2023
e0efc40
fix
Sep 22, 2023
67a9ee0
Merge remote-tracking branch 'origin/feature/add_new_dt' into feature…
Sep 22, 2023
0361bf0
fix opt's conv
Sep 22, 2023
ca90ffd
Apply code-format changes
HeJunchao100813 Sep 22, 2023
2e80cc2
Merge branch 'master' into feature/add_new_dt
HeJunchao100813 Sep 22, 2023
85a7fde
fix
Sep 25, 2023
03c585f
Merge remote-tracking branch 'origin/feature/add_new_dt' into feature…
Sep 25, 2023
0429fe1
fix
Sep 25, 2023
158180d
Apply code-format changes
HeJunchao100813 Sep 25, 2023
23c3d27
fix
Sep 25, 2023
1516808
fix
Sep 26, 2023
Files changed
@@ -1,6 +1,6 @@
// Copyright (c) Canaan Inc. All rights reserved.
// Licensed under the Apache license. See LICENSE file in the project root for full license information.
/* This file is generated by tools/stackvm_gen/IsaGen at 2023/9/5 19:40:30 +08:00. */
/* This file is generated by tools/stackvm_gen/IsaGen at 2023/9/18 下午5:04:31 +08:00. */

using System;
using System.Collections.Generic;
163 changes: 96 additions & 67 deletions src/Native/src/kernels/stackvm/optimized/convolution.cpp
@@ -13,6 +13,7 @@
* limitations under the License.
*/
#include "../reference/ref_ops.h"
#include "nncase/runtime/util.h"
#include "opt_ops.h"
#include <nncase/kernels/kernel_utils.h>
#include <nncase/runtime/runtime_op_utility.h>
@@ -55,9 +56,10 @@ using namespace nncase::kernels;
using namespace nncase::kernels::stackvm;
using namespace nncase::kernels::stackvm::optimized;

template <typename T>
result<void>
conv2d_1x1_s1(const float *input, const float *weights, const float *bias,
float *output, gsl::span<const size_t> in_shape,
conv2d_1x1_s1(const T *input, const T *weights, const T *bias, T *output,
gsl::span<const size_t> in_shape,
NNCASE_UNUSED gsl::span<const size_t> in_strides,
NNCASE_UNUSED gsl::span<const size_t> w_shape,
NNCASE_UNUSED gsl::span<const size_t> w_strides,
@@ -67,8 +69,7 @@ conv2d_1x1_s1(const float *input, const float *weights, const float *bias,
NNCASE_UNUSED const padding &padding_w,
NNCASE_UNUSED int32_t groups, NNCASE_UNUSED int32_t stride_h,
NNCASE_UNUSED int32_t stride_w, NNCASE_UNUSED int32_t dilation_h,
NNCASE_UNUSED int32_t dilation_w,
value_range<float> fused_activation,
NNCASE_UNUSED int32_t dilation_w, value_range<T> fused_activation,
NNCASE_UNUSED kernels::kernel_context &context) noexcept {
const auto widths = in_shape[2] * in_shape[3];
// if oc's type is size_t, openmp will throw error in visual studio
@@ -82,8 +83,8 @@ conv2d_1x1_s1(const float *input, const float *weights, const float *bias,
#endif
for (int oc = 0; oc < out_channels; oc++) {
const auto out_c = oc;
const float *now_weights = weights + out_c * w_strides[0];
const float *now_img_start = input + batch * in_strides[0];
const T *now_weights = weights + out_c * w_strides[0];
const T *now_img_start = input + batch * in_strides[0];
size_t channel = 0;

auto *now_output_channel_start =
@@ -94,26 +95,26 @@ conv2d_1x1_s1(const float *input, const float *weights, const float *bias,
bias[oc]);
for (; channel + 4 <= in_shape[1]; channel += 4, now_weights += 4) {
auto *w_output = now_output_channel_start;
const float w0 = now_weights[0];
const float w1 = now_weights[1];
const float w2 = now_weights[2];
const float w3 = now_weights[3];
const T w0 = now_weights[0];
const T w1 = now_weights[1];
const T w2 = now_weights[2];
const T w3 = now_weights[3];

const float *i0 = now_img_start + (channel + 0) * in_strides[1];
const float *i1 = now_img_start + (channel + 1) * in_strides[1];
const float *i2 = now_img_start + (channel + 2) * in_strides[1];
const float *i3 = now_img_start + (channel + 3) * in_strides[1];
const T *i0 = now_img_start + (channel + 0) * in_strides[1];
const T *i1 = now_img_start + (channel + 1) * in_strides[1];
const T *i2 = now_img_start + (channel + 2) * in_strides[1];
const T *i3 = now_img_start + (channel + 3) * in_strides[1];

const float *v0 = i0;
const float *v1 = i1;
const float *v2 = i2;
const float *v3 = i3;
const T *v0 = i0;
const T *v1 = i1;
const T *v2 = i2;
const T *v3 = i3;

for (size_t index = 0; index < widths; ++index) {
float sum0 = *v0 * w0;
float sum1 = *v1 * w1;
float sum2 = *v2 * w2;
float sum3 = *v3 * w3;
T sum0 = *v0 * w0;
T sum1 = *v1 * w1;
T sum2 = *v2 * w2;
T sum3 = *v3 * w3;

*w_output += sum0 + sum1 + sum2 + sum3;

@@ -127,9 +128,9 @@ conv2d_1x1_s1(const float *input, const float *weights, const float *bias,

for (; channel < in_shape[1]; ++channel) {
auto *w_output = now_output_channel_start;
const float *v = now_img_start + channel * in_strides[1];
const T *v = now_img_start + channel * in_strides[1];
for (size_t index = 0; index < widths; ++index) {
*w_output += (*now_weights) * (*v);
*w_output += (T)(*now_weights) * (T)(*v);
++w_output;
++v;
}
@@ -146,9 +147,10 @@ conv2d_1x1_s1(const float *input, const float *weights, const float *bias,
return ok();
}

template <typename T>
result<void>
conv2d_1x1_s2(const float *input, const float *weights, const float *bias,
float *output, gsl::span<const size_t> in_shape,
conv2d_1x1_s2(const T *input, const T *weights, const T *bias, T *output,
gsl::span<const size_t> in_shape,
NNCASE_UNUSED gsl::span<const size_t> in_strides,
NNCASE_UNUSED gsl::span<const size_t> w_shape,
NNCASE_UNUSED gsl::span<const size_t> w_strides,
@@ -158,8 +160,7 @@ conv2d_1x1_s2(const float *input, const float *weights, const float *bias,
NNCASE_UNUSED const padding &padding_w,
NNCASE_UNUSED int32_t groups, NNCASE_UNUSED int32_t stride_h,
NNCASE_UNUSED int32_t stride_w, NNCASE_UNUSED int32_t dilation_h,
NNCASE_UNUSED int32_t dilation_w,
value_range<float> fused_activation,
NNCASE_UNUSED int32_t dilation_w, value_range<T> fused_activation,
NNCASE_UNUSED kernels::kernel_context &context) noexcept {
const auto batch = in_shape[0], in_channels = in_shape[1],
in_h = in_shape[2], in_w = in_shape[3],
@@ -178,31 +179,30 @@ conv2d_1x1_s2(const float *input, const float *weights, const float *bias,
#pragma omp parallel for num_threads(context.num_threads)
#endif
for (int oc = 0; oc < out_channels; oc++) {
float *out = output + (b * out_strides[0] + oc * out_strides[1]);
T *out = output + (b * out_strides[0] + oc * out_strides[1]);

std::fill(out, out + out_h * out_w, bias[oc]);
size_t ic = 0;
for (; ic + 3 < in_channels; ic += 4) {
float *outptr = out;
const float *img0 =
T *outptr = out;
const T *img0 =
input + (b * in_strides[0]) + (ic * in_strides[1]);
const float *img1 =
const T *img1 =
input + (b * in_strides[0]) + ((ic + 1) * in_strides[1]);
const float *img2 =
const T *img2 =
input + (b * in_strides[0]) + ((ic + 2) * in_strides[1]);
const float *img3 =
const T *img3 =
input + (b * in_strides[0]) + ((ic + 3) * in_strides[1]);

const float *r0 = img0;
const float *r1 = img1;
const float *r2 = img2;
const float *r3 = img3;
const T *r0 = img0;
const T *r1 = img1;
const T *r2 = img2;
const T *r3 = img3;

const float *k0 =
weights + oc * w_strides[0] + ic * w_strides[1];
const float *k1 = k0 + 1;
const float *k2 = k0 + 2;
const float *k3 = k0 + 3;
const T *k0 = weights + oc * w_strides[0] + ic * w_strides[1];
const T *k1 = k0 + 1;
const T *k2 = k0 + 2;
const T *k3 = k0 + 3;
for (size_t i = 0; i < out_h; i++) {
for (size_t remain = 0; remain < out_w; remain++) {
*outptr += r0[0] * k0[0];
@@ -223,13 +223,13 @@ conv2d_1x1_s2(const float *input, const float *weights, const float *bias,
}

for (; ic < in_channels; ic++) {
float *outptr = out;
const float *img0 =
T *outptr = out;
const T *img0 =
input + (b * in_strides[0]) + (ic * in_strides[1]);
const float *kernel0 =
const T *kernel0 =
weights + oc * w_strides[0] + ic * w_strides[1];
const float *r0 = img0;
const float *k0 = kernel0;
const T *r0 = img0;
const T *k0 = kernel0;
for (size_t i = 0; i < out_h; i++) {
for (size_t remain = 0; remain < out_w; remain++) {
*outptr += r0[0] * k0[0];
Expand All @@ -240,7 +240,7 @@ conv2d_1x1_s2(const float *input, const float *weights, const float *bias,
}
}
for (size_t h = 0; h < out_h; h++) {
float *r_out = out + h * out_strides[2];
T *r_out = out + h * out_strides[2];
for (size_t w = 0; w < out_w; w++) {
*(r_out + w) = kernels::detail::apply_activation(
*(r_out + w), fused_activation);
@@ -415,10 +415,10 @@ void conv2d_channel(size_t out_h, size_t out_w, std::array<T, Parallel> &sum,
}

template <size_t Parallel, size_t Filter_h, size_t Filter_w, size_t Stride_h,
size_t Stride_w>
size_t Stride_w, typename T>
result<void>
conv2d_nxm(const float *input, const float *weights, const float *bias,
float *output, gsl::span<const size_t> in_shape,
conv2d_nxm(const T *input, const T *weights, const T *bias, float *output,
gsl::span<const size_t> in_shape,
NNCASE_UNUSED gsl::span<const size_t> in_strides,
NNCASE_UNUSED gsl::span<const size_t> w_shape,
NNCASE_UNUSED gsl::span<const size_t> w_strides,
@@ -485,9 +485,9 @@ conv2d_nxm(const float *input, const float *weights, const float *bias,
}

template <size_t Parallel, size_t Filter_h, size_t Filter_w, size_t Stride_h,
size_t Stride_w>
size_t Stride_w, typename T>
result<void> conv2d_depthwise_nxm(
const float *input, const float *weights, const float *bias, float *output,
const T *input, const T *weights, const T *bias, T *output,
gsl::span<const size_t> in_shape,
NNCASE_UNUSED gsl::span<const size_t> in_strides,
NNCASE_UNUSED gsl::span<const size_t> w_shape,
@@ -498,7 +498,7 @@ result<void> conv2d_depthwise_nxm(
NNCASE_UNUSED const padding &padding_w, NNCASE_UNUSED int32_t groups,
NNCASE_UNUSED int32_t stride_h, NNCASE_UNUSED int32_t stride_w,
NNCASE_UNUSED int32_t dilation_h, NNCASE_UNUSED int32_t dilation_w,
value_range<float> fused_activation,
value_range<T> fused_activation,
NNCASE_UNUSED kernels::kernel_context &context) noexcept {
const auto batch = in_shape[0], channels = w_shape[0], in_h = in_shape[2],
in_w = in_shape[3];
@@ -516,14 +516,13 @@ result<void> conv2d_depthwise_nxm(
#endif
for (int c = 0; c < channels; c++) // channel
{
std::array<float *, Parallel> outptr;
std::array<const float *,
compute_rsize<Parallel, Stride_h, Filter_h>()>
std::array<T *, Parallel> outptr;
std::array<const T *, compute_rsize<Parallel, Stride_h, Filter_h>()>
r;
std::array<const float *, Filter_h> k;
std::array<float, Parallel> sum;
std::array<const T *, Filter_h> k;
std::array<T, Parallel> sum;

float *out = output + out_strides[0] * b + out_strides[1] * c;
T *out = output + out_strides[0] * b + out_strides[1] * c;
std::fill_n(out,
out_strides[2]
? out_h * out_strides[2]
@@ -539,7 +538,7 @@ result<void> conv2d_depthwise_nxm(
out_h, out_w, sum, r, k, outptr, in_strides[2], out_strides[2],
tail_step);
for (size_t h = 0; h < out_h; h++) {
float *r_out = out + h * out_strides[2];
T *r_out = out + h * out_strides[2];
for (size_t w = 0; w < out_w; w++) {
*(r_out + w) = kernels::detail::apply_activation(
*(r_out + w), fused_activation);
@@ -610,7 +609,8 @@ result<void> conv2d_depthwise_nxm(
#endif

result<void> optimized::conv2d(
const float *input, const float *weights, const float *bias, float *output,
[[maybe_unused]] typecode_t typecode, const gsl::byte *input1,
const gsl::byte *weights1, const gsl::byte *bias1, gsl::byte *output1,
gsl::span<const size_t> in_shape, gsl::span<const size_t> in_strides,
gsl::span<const size_t> w_shape,
NNCASE_UNUSED gsl::span<const size_t> w_strides,
@@ -620,6 +620,10 @@ result<void> optimized::conv2d(
int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
value_range<float> fused_activation,
NNCASE_UNUSED kernels::kernel_context &context) noexcept {
[[maybe_unused]] auto input = IN_CAST(float, input1);
[[maybe_unused]] auto weights = IN_CAST(float, weights1);
[[maybe_unused]] auto bias = IN_CAST(float, bias1);
[[maybe_unused]] auto output = OUT_CAST(float, output1);
const auto filter_h = w_shape[2];
const auto filter_w = w_shape[3];

@@ -675,8 +679,33 @@ result<void> optimized::conv2d(
}
#endif
try_(nncase::kernels::stackvm::reference::conv2d(
input, weights, bias, output, in_shape, in_strides, w_shape, w_strides,
bias_strides, out_strides, padding_h, padding_w, groups, stride_h,
stride_w, dilation_h, dilation_w, fused_activation));
typecode, input1, weights1, bias1, output1, in_shape, in_strides,
w_shape, w_strides, bias_strides, out_strides, padding_h, padding_w,
groups, stride_h, stride_w, dilation_h, dilation_w, fused_activation));
return ok();
}
}

// result<void> optimized::conv2d(
// [[maybe_unused]] typecode_t typecode, const gsl::byte *input,
// const gsl::byte *weights, const gsl::byte *bias, gsl::byte *output,
// gsl::span<const size_t> in_shape, gsl::span<const size_t> in_strides,
// gsl::span<const size_t> w_shape,
// NNCASE_UNUSED gsl::span<const size_t> w_strides,
// NNCASE_UNUSED gsl::span<const size_t> bias_strides,
// NNCASE_UNUSED gsl::span<const size_t> out_strides, const padding
// &padding_h, const padding &padding_w, int32_t groups, int32_t stride_h,
// int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
// value_range<float> fused_activation,
// NNCASE_UNUSED kernels::kernel_context &context) noexcept {
// auto a = conv2d_impl(
// IN_CAST(float, input), IN_CAST(float, weights), IN_CAST(float, bias),
// OUT_CAST(float, output), in_shape, in_strides, w_shape, w_strides,
// bias_strides, out_strides, padding_h, padding_w, groups, stride_h,
// stride_w, dilation_h, dilation_w, fused_activation, context);
// try_(nncase::kernels::stackvm::reference::conv2d(
// typecode, input, weights, bias, output, in_shape, in_strides,
// w_shape, w_strides, bias_strides, out_strides, padding_h, padding_w,
// groups, stride_h, stride_w, dilation_h, dilation_w,
// fused_activation));
// return ok();
// }
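
Taken together, the convolution changes follow one pattern: each float-only kernel becomes a template over the element type T, and the public optimized::conv2d entry point now accepts type-erased gsl::byte buffers plus a typecode_t, casting to concrete pointers before dispatch (the IN_CAST/OUT_CAST calls above, which appear to be thin reinterpret_cast-style helpers from nncase/runtime/util.h). Below is a minimal, self-contained sketch of that pattern using standard C++ stand-ins rather than the real nncase types; every name in it is illustrative, not the library's API.

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative stand-ins (assumptions, not nncase definitions): a runtime
// type tag and cast helpers in the spirit of IN_CAST/OUT_CAST.
enum class typecode { f32, f64 };

template <typename T> const T *in_cast(const std::byte *p) {
    return reinterpret_cast<const T *>(p);
}
template <typename T> T *out_cast(std::byte *p) {
    return reinterpret_cast<T *>(p);
}

// The kernel is templated over the element type, as conv2d_1x1_s1 is above.
template <typename T>
void scale_add(const T *x, const T *w, T *y, size_t n) {
    for (size_t i = 0; i < n; ++i)
        y[i] += x[i] * w[0];
}

// The public entry point is type-erased: byte pointers plus a typecode,
// dispatched to the right template instantiation.
void scale_add_erased(typecode tc, const std::byte *x, const std::byte *w,
                      std::byte *y, size_t n) {
    switch (tc) {
    case typecode::f32:
        return scale_add(in_cast<float>(x), in_cast<float>(w),
                         out_cast<float>(y), n);
    case typecode::f64:
        return scale_add(in_cast<double>(x), in_cast<double>(w),
                         out_cast<double>(y), n);
    }
}

int main() {
    std::vector<float> x{1.f, 2.f}, w{3.f}, y{0.f, 0.f};
    scale_add_erased(typecode::f32,
                     reinterpret_cast<const std::byte *>(x.data()),
                     reinterpret_cast<const std::byte *>(w.data()),
                     reinterpret_cast<std::byte *>(y.data()), x.size());
    std::cout << y[0] << ", " << y[1] << "\n"; // prints: 3, 6
}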
7 changes: 4 additions & 3 deletions src/Native/src/kernels/stackvm/optimized/layer_norm.cpp
@@ -25,8 +25,9 @@ using namespace nncase::kernels::stackvm;
using namespace nncase::kernels::stackvm::optimized;

result<void> nncase::kernels::stackvm::optimized::layer_norm(
const float *input, float *output, const float *scale, const float *bias,
typecode_t typecode, const gsl::byte *input, gsl::byte *output,
const gsl::byte *scale, const gsl::byte *bias,
gsl::span<const size_t> in_shape, int32_t axis, float epsilon) {
return reference::layer_norm(input, output, scale, bias, in_shape, axis,
epsilon);
return reference::layer_norm(typecode, input, output, scale, bias, in_shape,
axis, epsilon);
}
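
Here the optimized layer_norm adopts the reference kernel's new type-erased signature and simply forwards to it, so all dtype handling lives in reference::layer_norm until a fast path exists. A compact sketch of that delegation idiom, with illustrative names rather than the nncase API:

#include <cstddef>

namespace reference {
// Stand-in for the reference kernel: handles every dtype via the tag.
inline int layer_norm_like(int typecode, const std::byte *in, std::byte *out) {
    (void)typecode; (void)in; (void)out;
    return 0; // ok
}
} // namespace reference

namespace optimized {
// No dtype-specific fast path yet: keep the same signature and delegate.
inline int layer_norm_like(int typecode, const std::byte *in, std::byte *out) {
    return reference::layer_norm_like(typecode, in, out);
}
} // namespace optimized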
14 changes: 7 additions & 7 deletions src/Native/src/kernels/stackvm/optimized/log_softmax.cpp
@@ -26,17 +26,17 @@ using namespace nncase::kernels::stackvm::optimized;

#include <math.h>

template result<void> optimized::log_softmax<float>(
const float *input, float *output, gsl::span<const size_t> in_shape,
gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
int32_t axis) noexcept;
// template result<void> optimized::log_softmax<float>(
// typecode_t typecode, const gsl::byte *input, gsl::byte *output,
// gsl::span<const size_t> in_shape, gsl::span<const size_t> in_strides,
// gsl::span<const size_t> out_strides, int32_t axis) noexcept;

template <typename T>
result<void> optimized::log_softmax(const T *input, T *output,
result<void> optimized::log_softmax(typecode_t typecode, const gsl::byte *input,
gsl::byte *output,
gsl::span<const size_t> in_shape,
gsl::span<const size_t> in_strides,
gsl::span<const size_t> out_strides,
int32_t axis) noexcept {
return reference::log_softmax(input, output, in_shape, in_strides,
return reference::log_softmax(typecode, input, output, in_shape, in_strides,
out_strides, axis);
}
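
The block commented out at the top of this file was the explicit template instantiation of log_softmax for float; it becomes obsolete because log_softmax is no longer a template — the element type now arrives as a runtime typecode_t. A small before/after illustration of why the instantiation line can go (names are illustrative):

#include <cstddef>

// Before (sketch): a templated kernel defined in a .cpp needs an explicit
// instantiation so other translation units can link against it.
template <typename T> T twice(T v) { return v + v; }
template float twice<float>(float); // the line the PR removes, in spirit

// After (sketch): one ordinary type-erased function; nothing to instantiate.
// The element type travels as a runtime tag instead of a template parameter.
inline float twice_erased(int /*typecode*/, const std::byte *in) {
    return twice(*reinterpret_cast<const float *>(in));
}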