From 3723d150c5d8701aed62ed64947200e4630772bc Mon Sep 17 00:00:00 2001 From: "-T.K.-" Date: Wed, 17 Jul 2024 12:54:23 -0700 Subject: [PATCH] ADD: add more types to impls --- nn/impl/acc.h | 17 +++++++++++++++++ nn/impl/acc1.h | 24 +++++++++++++++++++++++- nn/impl/add.h | 12 ++++++++++++ nn/impl/add1.h | 25 +++++++++++++++++++++++++ nn/impl/cpu/acc.c | 18 ++++++++++++++++++ nn/impl/cpu/acc1.c | 30 +++++++++++++++++++++++++++--- nn/impl/cpu/add.c | 12 ++++++++++++ nn/impl/cpu/add1.c | 25 +++++++++++++++++++++++++ nn/impl/cpu/div.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/dot.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/fill.c | 1 + nn/impl/cpu/max.c | 36 ++++++++++++++++++++++++++++++++++++ nn/impl/cpu/maximum.c | 32 ++++++++++++++++++++++++++++++++ nn/impl/cpu/maximum1.c | 18 ++++++++++++++++++ nn/impl/cpu/min.c | 36 ++++++++++++++++++++++++++++++++++++ nn/impl/cpu/minimum.c | 32 ++++++++++++++++++++++++++++++++ nn/impl/cpu/minimum1.c | 18 ++++++++++++++++++ nn/impl/cpu/mul.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/mul1.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/neg.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/softmax.c | 13 +++++++++++++ nn/impl/cpu/sqr.c | 24 ++++++++++++++++++++++++ nn/impl/cpu/sqrt.c | 6 ++++++ nn/impl/cpu/sum.c | 16 ++++++++++++++++ nn/impl/cpu/transpose.c | 32 ++++++++++++++++++++++++++++++++ nn/impl/div.h | 26 ++++++++++++++++++++++++++ nn/impl/dot.h | 18 ++++++++++++++++++ nn/impl/gemmini/mm.c | 12 ++++++------ nn/impl/max.h | 22 ++++++++++++++++++++++ nn/impl/maximum.h | 26 ++++++++++++++++++++++++++ nn/impl/maximum1.h | 18 ++++++++++++++++++ nn/impl/min.h | 22 ++++++++++++++++++++++ nn/impl/minimum.h | 26 ++++++++++++++++++++++++++ nn/impl/minimum1.h | 24 ++++++++++++++++++++++++ nn/impl/mul.h | 26 ++++++++++++++++++++++++++ nn/impl/mul1.h | 26 ++++++++++++++++++++++++++ nn/impl/neg.h | 22 ++++++++++++++++++++++ nn/impl/rms_norm.h | 1 + nn/impl/softmax.h | 7 +++++++ nn/impl/sqr.h | 22 ++++++++++++++++++++++ nn/impl/sqrt.h | 7 +++++++ 
nn/impl/sum.h | 12 ++++++++++++ nn/impl/transpose.h | 23 +++++++++++++++++++++++ 43 files changed, 877 insertions(+), 10 deletions(-) diff --git a/nn/impl/acc.h b/nn/impl/acc.h index 4375a17..2f319f2 100644 --- a/nn/impl/acc.h +++ b/nn/impl/acc.h @@ -4,12 +4,29 @@ #include #include +#include "nn_float16.h" + void NN__acc_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx ); +void NN__acc_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx + ); + +void NN__acc_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx + ); + +void NN__acc_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx + ); + void NN__acc_f32(size_t n, float *y, size_t incy, const float *x, size_t incx diff --git a/nn/impl/acc1.h b/nn/impl/acc1.h index f3c52af..ba2fded 100644 --- a/nn/impl/acc1.h +++ b/nn/impl/acc1.h @@ -3,9 +3,31 @@ #include +#include "nn_float16.h" + + +void NN__acc1_i8(size_t n, + int8_t *result, size_t incr, + int8_t scalar + ); + +void NN__acc1_i16(size_t n, + int16_t *result, size_t incr, + int16_t scalar + ); + +void NN__acc1_i32(size_t n, + int32_t *result, size_t incr, + int32_t scalar + ); + +void NN__acc1_f16(size_t n, + float16_t *result, size_t incr, + float16_t scalar + ); void NN__acc1_f32(size_t n, - float *result, size_t incx, + float *result, size_t incr, float scalar ); diff --git a/nn/impl/add.h b/nn/impl/add.h index 337b346..65063a3 100644 --- a/nn/impl/add.h +++ b/nn/impl/add.h @@ -13,6 +13,18 @@ void NN__add_i8(size_t n, const int8_t *y, size_t incy ); +void NN__add_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__add_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + void NN__add_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, diff --git a/nn/impl/add1.h b/nn/impl/add1.h index 99b6cde..779fdba 100644 --- a/nn/impl/add1.h 
+++ b/nn/impl/add1.h @@ -3,6 +3,31 @@ #include +#include "nn_float16.h" + +void NN__add1_i8(size_t n, + int8_t *z, size_t incz, + const int8_t *x, size_t incx, + int8_t scalar + ); + +void NN__add1_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + int16_t scalar + ); + +void NN__add1_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + int32_t scalar + ); + +void NN__add1_f16(size_t n, + float16_t *z, size_t incz, + const float16_t *x, size_t incx, + float16_t scalar + ); void NN__add1_f32(size_t n, float *z, size_t incz, diff --git a/nn/impl/cpu/acc.c b/nn/impl/cpu/acc.c index 2e49d40..35e9675 100644 --- a/nn/impl/cpu/acc.c +++ b/nn/impl/cpu/acc.c @@ -7,6 +7,24 @@ __attribute__((weak)) void NN__acc_i8(size_t n, int8_t *y, size_t incy, const in } } +__attribute__((weak)) void NN__acc_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] += x[i * incx]; + } +} + +__attribute__((weak)) void NN__acc_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] += x[i * incx]; + } +} + +__attribute__((weak)) void NN__acc_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { /* y[i*incy] += x[i*incx] for i in [0, n), summed in float */ + for (size_t i = 0; i < n; i += 1) { /* step by 1: incx/incy are already applied in the subscripts */ + y[i * incy] = NN_float_to_half(NN_half_to_float(y[i * incy]) + NN_half_to_float(x[i * incx])); + } +} + __attribute__((weak)) void NN__acc_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { for (size_t i = 0; i < n; i += 1) { y[i * incy] += x[i * incx]; diff --git a/nn/impl/cpu/acc1.c b/nn/impl/cpu/acc1.c index 3b925a2..5eec8be 100644 --- a/nn/impl/cpu/acc1.c +++ b/nn/impl/cpu/acc1.c @@ -1,8 +1,32 @@ #include "acc1.h" -__attribute__((weak)) void NN__acc1_f32(size_t n, float *result, size_t incx, float scalar) { - for (size_t i = 0; i < n; i += incx) { - result[i] += scalar; +__attribute__((weak)) void NN__acc1_i8(size_t n, int8_t *result, 
size_t incr, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + result[i * incr] += scalar; /* element i is at result[i * incr] */ + } +} + +__attribute__((weak)) void NN__acc1_i16(size_t n, int16_t *result, size_t incr, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + result[i * incr] += scalar; /* element i is at result[i * incr] */ + } +} + +__attribute__((weak)) void NN__acc1_i32(size_t n, int32_t *result, size_t incr, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + result[i * incr] += scalar; /* element i is at result[i * incr] */ + } +} + +__attribute__((weak)) void NN__acc1_f16(size_t n, float16_t *result, size_t incr, float16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + result[i * incr] = NN_float_to_half(NN_half_to_float(result[i * incr]) + NN_half_to_float(scalar)); /* store to the same slot that was read; sum is formed in float */ + } +} + +__attribute__((weak)) void NN__acc1_f32(size_t n, float *result, size_t incr, float scalar) { + for (size_t i = 0; i < n; i += 1) { + result[i * incr] += scalar; /* element i is at result[i * incr] */ } } \ No newline at end of file diff --git a/nn/impl/cpu/add.c b/nn/impl/cpu/add.c index a5fed5a..3de3045 100644 --- a/nn/impl/cpu/add.c +++ b/nn/impl/cpu/add.c @@ -7,6 +7,18 @@ __attribute__((weak)) void NN__add_i8(size_t n, int8_t *z, size_t incz, const in } } +__attribute__((weak)) void NN__add_i16(size_t n, int16_t *z, size_t incz, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] + y[i * incy]; + } +} + +__attribute__((weak)) void NN__add_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] + y[i * incy]; + } +} + __attribute__((weak)) void NN__add_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { for (size_t i = 0; i < n; i += 1) { z[i * incz] = NN_float_to_half(NN_half_to_float(x[i * incx]) + NN_half_to_float(y[i * incy])); diff --git a/nn/impl/cpu/add1.c b/nn/impl/cpu/add1.c index 57ef2c1..31cbccc 100644 --- a/nn/impl/cpu/add1.c +++ 
b/nn/impl/cpu/add1.c @@ -1,6 +1,31 @@ #include "add1.h" + +__attribute__((weak)) void NN__add1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] + scalar; + } +} + +__attribute__((weak)) void NN__add1_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] + scalar; + } +} + +__attribute__((weak)) void NN__add1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] + scalar; + } +} + +__attribute__((weak)) void NN__add1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { /* y[i*incy] = x[i*incx] + scalar for i in [0, n), summed in float */ + for (size_t i = 0; i < n; i += 1) { /* step by 1: incx/incy are already applied in the subscripts */ + y[i * incy] = NN_float_to_half(NN_half_to_float(x[i * incx]) + NN_half_to_float(scalar)); + } +} + __attribute__((weak)) void NN__add1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { for (size_t i = 0; i < n; i += 1) { y[i * incy] = x[i * incx] + scalar; diff --git a/nn/impl/cpu/div.c b/nn/impl/cpu/div.c index c317236..804595c 100644 --- a/nn/impl/cpu/div.c +++ b/nn/impl/cpu/div.c @@ -1,6 +1,30 @@ #include "div.h" +__attribute__((weak)) void NN__div_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] / y[i * incy]; + } +} + +__attribute__((weak)) void NN__div_i16(size_t n, int16_t *z, size_t incz, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] / y[i * incy]; + } +} + +__attribute__((weak)) void NN__div_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] / y[i * incy]; + } +} + 
+__attribute__((weak)) void NN__div_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = NN_float_to_half(NN_half_to_float(x[i * incx]) / NN_half_to_float(y[i * incy])); + } +} + __attribute__((weak)) void NN__div_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { for (size_t i = 0; i < n; i += 1) { z[i * incz] = x[i * incx] / y[i * incy]; diff --git a/nn/impl/cpu/dot.c b/nn/impl/cpu/dot.c index ac3d86c..f96fcee 100644 --- a/nn/impl/cpu/dot.c +++ b/nn/impl/cpu/dot.c @@ -1,6 +1,30 @@ #include "dot.h" +__attribute__((weak)) void NN__dot_i8_to_i32(size_t n, int32_t *result, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx] * y[i * incy]; + } + *result = sum; +} + +__attribute__((weak)) void NN__dot_i16_to_i32(size_t n, int32_t *result, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx] * y[i * incy]; + } + *result = sum; +} + +__attribute__((weak)) void NN__dot_i32(size_t n, int32_t *result, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx] * y[i * incy]; + } + *result = sum; +} + __attribute__((weak)) void NN__dot_f16(size_t n, float16_t *result, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { float sum_f32 = 0; for (size_t i = 0; i < n; i += 1) { diff --git a/nn/impl/cpu/fill.c b/nn/impl/cpu/fill.c index b76dec1..f9530a7 100644 --- a/nn/impl/cpu/fill.c +++ b/nn/impl/cpu/fill.c @@ -1,5 +1,6 @@ #include "fill.h" + __attribute__((weak)) void NN__fill_u8(size_t n, uint8_t *x, size_t incx, uint8_t scalar) { for (size_t i = 0; i < n; i += 1) { x[i * incx] = scalar; diff --git a/nn/impl/cpu/max.c b/nn/impl/cpu/max.c index 
0f7330d..441d87a 100644 --- a/nn/impl/cpu/max.c +++ b/nn/impl/cpu/max.c @@ -1,6 +1,42 @@ #include "max.h" +__attribute__((weak)) void NN__max_i8(size_t n, int8_t *result, const int8_t *x, size_t incx) { + int8_t max = INT8_MIN; + for (size_t i = 0; i < n; i += 1) { + int8_t val = x[i * incx]; + max = val > max ? val : max; + } + *result = max; +} + +__attribute__((weak)) void NN__max_i16(size_t n, int16_t *result, const int16_t *x, size_t incx) { + int16_t max = INT16_MIN; + for (size_t i = 0; i < n; i += 1) { + int16_t val = x[i * incx]; + max = val > max ? val : max; + } + *result = max; +} + +__attribute__((weak)) void NN__max_i32(size_t n, int32_t *result, const int32_t *x, size_t incx) { + int32_t max = INT32_MIN; + for (size_t i = 0; i < n; i += 1) { + int32_t val = x[i * incx]; + max = val > max ? val : max; + } + *result = max; +} + +__attribute__((weak)) void NN__max_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { + float16_t max = NN_float_to_half(-FLT_MAX); + for (size_t i = 0; i < n; i += 1) { + float16_t val = x[i * incx]; + max = NN_half_to_float(val) > NN_half_to_float(max) ? val : max; + } + *result = max; +} + __attribute__((weak)) void NN__max_f32(size_t n, float *result, const float *x, size_t incx) { float max = -FLT_MAX; for (size_t i = 0; i < n; i += 1) { diff --git a/nn/impl/cpu/maximum.c b/nn/impl/cpu/maximum.c index 31a5286..68258b0 100644 --- a/nn/impl/cpu/maximum.c +++ b/nn/impl/cpu/maximum.c @@ -1,6 +1,38 @@ #include "maximum.h" +__attribute__((weak)) void NN__maximum_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int8_t x_val = x[i * incx]; + int8_t y_val = y[i * incy]; + z[i * incz] = x_val > y_val ? 
x_val : y_val; + } +} + +__attribute__((weak)) void NN__maximum_i16(size_t n, int16_t *z, size_t incz, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int16_t x_val = x[i * incx]; + int16_t y_val = y[i * incy]; + z[i * incz] = x_val > y_val ? x_val : y_val; + } +} + +__attribute__((weak)) void NN__maximum_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int32_t x_val = x[i * incx]; + int32_t y_val = y[i * incy]; + z[i * incz] = x_val > y_val ? x_val : y_val; + } +} + +__attribute__((weak)) void NN__maximum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + float16_t x_val = x[i * incx]; + float16_t y_val = y[i * incy]; + z[i * incz] = NN_half_to_float(x_val) > NN_half_to_float(y_val) ? x_val : y_val; + } +} + __attribute__((weak)) void NN__maximum_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { for (size_t i = 0; i < n; i += 1) { float x_val = x[i * incx]; diff --git a/nn/impl/cpu/maximum1.c b/nn/impl/cpu/maximum1.c index 9a152bf..01312ac 100644 --- a/nn/impl/cpu/maximum1.c +++ b/nn/impl/cpu/maximum1.c @@ -1,6 +1,24 @@ #include "maximum1.h" +__attribute__((weak)) void NN__maximum1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] > scalar ? x[i * incx] : scalar; + } +} + +__attribute__((weak)) void NN__maximum1_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] > scalar ? 
x[i * incx] : scalar; + } +} + +__attribute__((weak)) void NN__maximum1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] > scalar ? x[i * incx] : scalar; + } +} + __attribute__((weak)) void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { for (size_t i = 0; i < n; i += 1) { float16_t x_val = x[i * incx]; diff --git a/nn/impl/cpu/min.c b/nn/impl/cpu/min.c index 11ebf89..57198c8 100644 --- a/nn/impl/cpu/min.c +++ b/nn/impl/cpu/min.c @@ -1,6 +1,42 @@ #include "min.h" +__attribute__((weak)) void NN__min_i8(size_t n, int8_t *result, const int8_t *x, size_t incx) { + int8_t min = INT8_MAX; + for (size_t i = 0; i < n; i += 1) { + int8_t val = x[i * incx]; + min = val < min ? val : min; + } + *result = min; +} + +__attribute__((weak)) void NN__min_i16(size_t n, int16_t *result, const int16_t *x, size_t incx) { + int16_t min = INT16_MAX; + for (size_t i = 0; i < n; i += 1) { + int16_t val = x[i * incx]; + min = val < min ? val : min; + } + *result = min; +} + +__attribute__((weak)) void NN__min_i32(size_t n, int32_t *result, const int32_t *x, size_t incx) { + int32_t min = INT32_MAX; + for (size_t i = 0; i < n; i += 1) { + int32_t val = x[i * incx]; + min = val < min ? val : min; + } + *result = min; +} + +__attribute__((weak)) void NN__min_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { + float16_t min = NN_float_to_half(FLT_MAX); + for (size_t i = 0; i < n; i += 1) { + float16_t val = x[i * incx]; + min = NN_half_to_float(val) < NN_half_to_float(min) ? 
val : min; + } + *result = min; +} + __attribute__((weak)) void NN__min_f32(size_t n, float *result, const float *x, size_t incx) { float min = FLT_MAX; for (size_t i = 0; i < n; i += 1) { diff --git a/nn/impl/cpu/minimum.c b/nn/impl/cpu/minimum.c index 611db96..ede2df0 100644 --- a/nn/impl/cpu/minimum.c +++ b/nn/impl/cpu/minimum.c @@ -1,6 +1,38 @@ #include "minimum.h" +__attribute__((weak)) void NN__minimum_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int8_t x_val = x[i * incx]; + int8_t y_val = y[i * incy]; + z[i * incz] = x_val < y_val ? x_val : y_val; + } +} + +__attribute__((weak)) void NN__minimum_i16(size_t n, int16_t *z, size_t incz, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int16_t x_val = x[i * incx]; + int16_t y_val = y[i * incy]; + z[i * incz] = x_val < y_val ? x_val : y_val; + } +} + +__attribute__((weak)) void NN__minimum_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + int32_t x_val = x[i * incx]; + int32_t y_val = y[i * incy]; + z[i * incz] = x_val < y_val ? x_val : y_val; + } +} + +__attribute__((weak)) void NN__minimum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + float16_t x_val = x[i * incx]; + float16_t y_val = y[i * incy]; + z[i * incz] = NN_half_to_float(x_val) < NN_half_to_float(y_val) ? 
x_val : y_val; + } +} + __attribute__((weak)) void NN__minimum_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { for (size_t i = 0; i < n; i += 1) { float x_val = x[i * incx]; diff --git a/nn/impl/cpu/minimum1.c b/nn/impl/cpu/minimum1.c index ba1a950..08f110a 100644 --- a/nn/impl/cpu/minimum1.c +++ b/nn/impl/cpu/minimum1.c @@ -1,6 +1,24 @@ #include "minimum1.h" +__attribute__((weak)) void NN__minimum1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < scalar ? x[i * incx] : scalar; + } +} + +__attribute__((weak)) void NN__minimum1_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < scalar ? x[i * incx] : scalar; + } +} + +__attribute__((weak)) void NN__minimum1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < scalar ? 
x[i * incx] : scalar; + } +} + __attribute__((weak)) void NN__minimum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { for (size_t i = 0; i < n; i += 1) { float16_t x_val = x[i * incx]; diff --git a/nn/impl/cpu/mul.c b/nn/impl/cpu/mul.c index fd599ce..5fbf438 100644 --- a/nn/impl/cpu/mul.c +++ b/nn/impl/cpu/mul.c @@ -1,6 +1,30 @@ #include "mul.h" +__attribute__((weak)) void NN__mul_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] * y[i * incy]; + } +} + +__attribute__((weak)) void NN__mul_i16(size_t n, int16_t *z, size_t incz, const int16_t *x, size_t incx, const int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] * y[i * incy]; + } +} + +__attribute__((weak)) void NN__mul_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t incx, const int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] * y[i * incy]; + } +} + +__attribute__((weak)) void NN__mul_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = NN_float_to_half(NN_half_to_float(x[i * incx]) * NN_half_to_float(y[i * incy])); + } +} + __attribute__((weak)) void NN__mul_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { for (size_t i = 0; i < n; i += 1) { z[i * incz] = x[i * incx] * y[i * incy]; diff --git a/nn/impl/cpu/mul1.c b/nn/impl/cpu/mul1.c index ead5bfe..597179a 100644 --- a/nn/impl/cpu/mul1.c +++ b/nn/impl/cpu/mul1.c @@ -1,6 +1,30 @@ #include "mul1.h" +__attribute__((weak)) void NN__mul1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * scalar; + } +} + +__attribute__((weak)) void NN__mul1_i16(size_t 
n, int16_t *y, size_t incy, const int16_t *x, size_t incx, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * scalar; + } +} + +__attribute__((weak)) void NN__mul1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * scalar; + } +} + +__attribute__((weak)) void NN__mul1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(NN_half_to_float(x[i * incx]) * NN_half_to_float(scalar)); + } +} + __attribute__((weak)) void NN__mul1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { for (size_t i = 0; i < n; i += 1) { y[i * incy] = x[i * incx] * scalar; diff --git a/nn/impl/cpu/neg.c b/nn/impl/cpu/neg.c index 6be51e7..6c3d09b 100644 --- a/nn/impl/cpu/neg.c +++ b/nn/impl/cpu/neg.c @@ -1,6 +1,30 @@ #include "neg.h" +__attribute__((weak)) void NN__neg_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = -x[i * incx]; + } +} + +__attribute__((weak)) void NN__neg_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = -x[i * incx]; + } +} + +__attribute__((weak)) void NN__neg_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = -x[i * incx]; + } +} + +__attribute__((weak)) void NN__neg_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { /* y[i*incy] = -x[i*incx] for i in [0, n) */ + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(-NN_half_to_float(x[i * incx])); /* negate in float like the other f16 kernels; NOTE(review): float16_t may be a raw storage type where unary minus is not a float negate - confirm in nn_float16.h */ + } +} + __attribute__((weak)) void NN__neg_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { for (size_t i = 0; i < n; i += 1) { y[i * incy] = -x[i * incx]; diff --git a/nn/impl/cpu/softmax.c b/nn/impl/cpu/softmax.c index e3c5188..1af8422 100644 --- 
a/nn/impl/cpu/softmax.c +++ b/nn/impl/cpu/softmax.c @@ -1,6 +1,19 @@ #include "softmax.h" +__attribute__((weak)) void NN__softmax_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { + // exp and sum + float sum = 0.0f; + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(expf(NN_half_to_float(x[i * incx]))); + sum += NN_half_to_float(y[i * incy]); + } + // normalize + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(NN_half_to_float(y[i * incy]) / sum); + } +} + __attribute__((weak)) void NN__softmax_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { // exp and sum float sum = 0.0f; diff --git a/nn/impl/cpu/sqr.c b/nn/impl/cpu/sqr.c index be7c953..320ab83 100644 --- a/nn/impl/cpu/sqr.c +++ b/nn/impl/cpu/sqr.c @@ -1,6 +1,30 @@ #include "sqr.h" +__attribute__((weak)) void NN__sqr_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * x[i * incx]; + } +} + +__attribute__((weak)) void NN__sqr_i16(size_t n, int16_t *y, size_t incy, const int16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * x[i * incx]; + } +} + +__attribute__((weak)) void NN__sqr_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * x[i * incx]; + } +} + +__attribute__((weak)) void NN__sqr_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(NN_half_to_float(x[i * incx]) * NN_half_to_float(x[i * incx])); + } +} + __attribute__((weak)) void NN__sqr_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { for (size_t i = 0; i < n; i += 1) { y[i * incy] = x[i * incx] * x[i * incx]; diff --git a/nn/impl/cpu/sqrt.c b/nn/impl/cpu/sqrt.c index 65c6408..65ef517 100644 --- a/nn/impl/cpu/sqrt.c +++ b/nn/impl/cpu/sqrt.c @@ -1,6 
+1,12 @@ #include "sqrt.h" +__attribute__((weak)) void NN__sqrt_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = NN_float_to_half(sqrtf(NN_half_to_float(x[i * incx]))); + } +} + __attribute__((weak)) void NN__sqrt_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { for (size_t i = 0; i < n; i += 1) { y[i * incy] = sqrtf(x[i * incx]); diff --git a/nn/impl/cpu/sum.c b/nn/impl/cpu/sum.c index 0de20da..8d7fb52 100644 --- a/nn/impl/cpu/sum.c +++ b/nn/impl/cpu/sum.c @@ -9,6 +9,14 @@ __attribute__((weak)) void NN__sum_u8_to_i32(size_t n, int32_t *result, const ui *result = sum; } +__attribute__((weak)) void NN__sum_i8_to_i32(size_t n, int32_t *result, const int8_t *x, size_t incx) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += (int32_t)x[i * incx]; + } + *result = sum; +} + __attribute__((weak)) void NN__sum_i16_to_i32(size_t n, int32_t *result, const int16_t *x, size_t incx) { int32_t sum = 0; for (size_t i = 0; i < n; i += 1) { @@ -25,6 +33,14 @@ __attribute__((weak)) void NN__sum_i32(size_t n, int32_t *result, const int32_t *result = sum; } +__attribute__((weak)) void NN__sum_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { + float sum = 0.f; + for (size_t i = 0; i < n; i += 1) { + sum += NN_half_to_float(x[i * incx]); + } + *result = NN_float_to_half(sum); +} + __attribute__((weak)) void NN__sum_f32(size_t n, float *result, const float *x, size_t incx) { float sum = 0.f; for (size_t i = 0; i < n; i += 1) { diff --git a/nn/impl/cpu/transpose.c b/nn/impl/cpu/transpose.c index cbbc046..f3ba509 100644 --- a/nn/impl/cpu/transpose.c +++ b/nn/impl/cpu/transpose.c @@ -1,6 +1,38 @@ #include "transpose.h" +__attribute__((weak)) void NN__transpose_i8(size_t m, size_t n, int8_t *y, const int8_t *x) { + for (size_t i = 0; i < m; i += 1) { + for (size_t j = 0; j < n; j += 1) { + y[j * m + i] = x[i * n + j]; + } + } +}; + +__attribute__((weak)) 
void NN__transpose_i16(size_t m, size_t n, int16_t *y, const int16_t *x) { + for (size_t i = 0; i < m; i += 1) { + for (size_t j = 0; j < n; j += 1) { + y[j * m + i] = x[i * n + j]; + } + } +}; + +__attribute__((weak)) void NN__transpose_i32(size_t m, size_t n, int32_t *y, const int32_t *x) { + for (size_t i = 0; i < m; i += 1) { + for (size_t j = 0; j < n; j += 1) { + y[j * m + i] = x[i * n + j]; + } + } +}; + +__attribute__((weak)) void NN__transpose_f16(size_t m, size_t n, float16_t *y, const float16_t *x) { + for (size_t i = 0; i < m; i += 1) { + for (size_t j = 0; j < n; j += 1) { + y[j * m + i] = x[i * n + j]; + } + } +}; + __attribute__((weak)) void NN__transpose_f32(size_t m, size_t n, float *y, const float *x) { for (size_t i = 0; i < m; i += 1) { for (size_t j = 0; j < n; j += 1) { diff --git a/nn/impl/div.h b/nn/impl/div.h index 58984a3..bc74939 100644 --- a/nn/impl/div.h +++ b/nn/impl/div.h @@ -4,6 +4,32 @@ #include #include +#include "nn_float16.h" + + +void NN__div_i8(size_t n, + int8_t *z, size_t incz, + const int8_t *x, size_t incx, + const int8_t *y, size_t incy + ); + +void NN__div_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__div_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + +void NN__div_f16(size_t n, + float16_t *z, size_t incz, + const float16_t *x, size_t incx, + const float16_t *y, size_t incy + ); void NN__div_f32(size_t n, float *z, size_t incz, diff --git a/nn/impl/dot.h b/nn/impl/dot.h index 6eb953d..4b42f63 100644 --- a/nn/impl/dot.h +++ b/nn/impl/dot.h @@ -6,6 +6,24 @@ #include "nn_float16.h" +void NN__dot_i8_to_i32(size_t n, + int32_t *result, + const int8_t *x, size_t incx, + const int8_t *y, size_t incy + ); + +void NN__dot_i16_to_i32(size_t n, + int32_t *result, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__dot_i32(size_t n, + int32_t *result, + const 
int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + void NN__dot_f16(size_t n, float16_t *result, const float16_t *x, size_t incx, diff --git a/nn/impl/gemmini/mm.c b/nn/impl/gemmini/mm.c index 4b7d113..e2c5027 100644 --- a/nn/impl/gemmini/mm.c +++ b/nn/impl/gemmini/mm.c @@ -12,12 +12,12 @@ void NN__mm_f32(size_t m, size_t n, float16_t *z, float16_t *x, float16_t *y) { size_t stride_C = dim_J; tiled_matmul_auto(dim_I, dim_J, dim_K, - x, y, - NULL, z, - stride_A, stride_B, stride_D, stride_C, - MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, - NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, - 0, 0, 0, 0, 0, 0, WS); + x, y, + NULL, z, + stride_A, stride_B, stride_D, stride_C, + MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, + 0, 0, 0, 0, 0, 0, WS); return; }; diff --git a/nn/impl/max.h b/nn/impl/max.h index d1a1547..1198ed6 100644 --- a/nn/impl/max.h +++ b/nn/impl/max.h @@ -5,6 +5,28 @@ #include #include +#include "nn_float16.h" + + +void NN__max_i8(size_t n, + int8_t *result, + const int8_t *x, size_t incx + ); + +void NN__max_i16(size_t n, + int16_t *result, + const int16_t *x, size_t incx + ); + +void NN__max_i32(size_t n, + int32_t *result, + const int32_t *x, size_t incx + ); + +void NN__max_f16(size_t n, + float16_t *result, + const float16_t *x, size_t incx + ); void NN__max_f32(size_t n, float *result, diff --git a/nn/impl/maximum.h b/nn/impl/maximum.h index cdc425b..2325e43 100644 --- a/nn/impl/maximum.h +++ b/nn/impl/maximum.h @@ -4,6 +4,32 @@ #include #include +#include "nn_float16.h" + + +void NN__maximum_i8(size_t n, + int8_t *z, size_t incz, + const int8_t *x, size_t incx, + const int8_t *y, size_t incy + ); + +void NN__maximum_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__maximum_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + +void 
NN__maximum_f16(size_t n, + float16_t *z, size_t incz, + const float16_t *x, size_t incx, + const float16_t *y, size_t incy + ); void NN__maximum_f32(size_t n, float *z, size_t incz, diff --git a/nn/impl/maximum1.h b/nn/impl/maximum1.h index 92cd42a..15821a6 100644 --- a/nn/impl/maximum1.h +++ b/nn/impl/maximum1.h @@ -7,6 +7,24 @@ #include "nn_float16.h" +void NN__maximum1_i8(size_t n, + int8_t *y, size_t incy, + const int8_t *x, size_t incx, + int8_t scalar + ); + +void NN__maximum1_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx, + int16_t scalar + ); + +void NN__maximum1_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx, + int32_t scalar + ); + void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, diff --git a/nn/impl/min.h b/nn/impl/min.h index 1e75654..164539a 100644 --- a/nn/impl/min.h +++ b/nn/impl/min.h @@ -5,6 +5,28 @@ #include #include +#include "nn_float16.h" + + +void NN__min_i8(size_t n, + int8_t *result, + const int8_t *x, size_t incx + ); + +void NN__min_i16(size_t n, + int16_t *result, + const int16_t *x, size_t incx + ); + +void NN__min_i32(size_t n, + int32_t *result, + const int32_t *x, size_t incx + ); + +void NN__min_f16(size_t n, + float16_t *result, + const float16_t *x, size_t incx + ); void NN__min_f32(size_t n, float *result, diff --git a/nn/impl/minimum.h b/nn/impl/minimum.h index ae61945..ec301d9 100644 --- a/nn/impl/minimum.h +++ b/nn/impl/minimum.h @@ -4,6 +4,32 @@ #include #include +#include "nn_float16.h" + + +void NN__minimum_i8(size_t n, + int8_t *z, size_t incz, + const int8_t *x, size_t incx, + const int8_t *y, size_t incy + ); + +void NN__minimum_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__minimum_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + +void NN__minimum_f16(size_t n, + float16_t *z, size_t 
incz, + const float16_t *x, size_t incx, + const float16_t *y, size_t incy + ); void NN__minimum_f32(size_t n, float *z, size_t incz, diff --git a/nn/impl/minimum1.h b/nn/impl/minimum1.h index d81e38f..02ecd35 100644 --- a/nn/impl/minimum1.h +++ b/nn/impl/minimum1.h @@ -7,6 +7,30 @@ #include "nn_float16.h" +void NN__minimum1_i8(size_t n, + int8_t *y, size_t incy, + const int8_t *x, size_t incx, + int8_t scalar + ); + +void NN__minimum1_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx, + int16_t scalar + ); + +void NN__minimum1_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx, + int32_t scalar + ); + +void NN__minimum1_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx, + float16_t scalar + ); + void NN__minimum1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, diff --git a/nn/impl/mul.h b/nn/impl/mul.h index ad2352c..23241cf 100644 --- a/nn/impl/mul.h +++ b/nn/impl/mul.h @@ -4,6 +4,32 @@ #include #include +#include "nn_float16.h" + + +void NN__mul_i8(size_t n, + int8_t *z, size_t incz, + const int8_t *x, size_t incx, + const int8_t *y, size_t incy + ); + +void NN__mul_i16(size_t n, + int16_t *z, size_t incz, + const int16_t *x, size_t incx, + const int16_t *y, size_t incy + ); + +void NN__mul_i32(size_t n, + int32_t *z, size_t incz, + const int32_t *x, size_t incx, + const int32_t *y, size_t incy + ); + +void NN__mul_f16(size_t n, + float16_t *z, size_t incz, + const float16_t *x, size_t incx, + const float16_t *y, size_t incy + ); void NN__mul_f32(size_t n, float *z, size_t incz, diff --git a/nn/impl/mul1.h b/nn/impl/mul1.h index 757e690..8f042f7 100644 --- a/nn/impl/mul1.h +++ b/nn/impl/mul1.h @@ -4,6 +4,32 @@ #include #include +#include "nn_float16.h" + + +void NN__mul1_i8(size_t n, + int8_t *y, size_t incy, + const int8_t *x, size_t incx, + int8_t scalar + ); + +void NN__mul1_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx, + int16_t scalar + ); + 
+void NN__mul1_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx, + int32_t scalar + ); + +void NN__mul1_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx, + float16_t scalar + ); void NN__mul1_f32(size_t n, float *y, size_t incy, diff --git a/nn/impl/neg.h b/nn/impl/neg.h index 7c19cb3..488cad6 100644 --- a/nn/impl/neg.h +++ b/nn/impl/neg.h @@ -4,6 +4,28 @@ #include #include +#include "nn_float16.h" + + +void NN__neg_i8(size_t n, + int8_t *y, size_t incy, + const int8_t *x, size_t incx + ); + +void NN__neg_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx + ); + +void NN__neg_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx + ); + +void NN__neg_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx + ); void NN__neg_f32(size_t n, float *y, size_t incy, diff --git a/nn/impl/rms_norm.h b/nn/impl/rms_norm.h index b877f11..05ea2a6 100644 --- a/nn/impl/rms_norm.h +++ b/nn/impl/rms_norm.h @@ -9,6 +9,7 @@ #include "mul1.h" #include "mul.h" + void NN__rms_norm_f32(size_t n, float* y, size_t incy, const float* x, size_t incx, diff --git a/nn/impl/softmax.h b/nn/impl/softmax.h index 274f084..2ecaebe 100644 --- a/nn/impl/softmax.h +++ b/nn/impl/softmax.h @@ -4,6 +4,13 @@ #include #include +#include "nn_float16.h" + + +void NN__softmax_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx + ); void NN__softmax_f32(size_t n, float *y, size_t incy, diff --git a/nn/impl/sqr.h b/nn/impl/sqr.h index 3e12733..338ba3d 100644 --- a/nn/impl/sqr.h +++ b/nn/impl/sqr.h @@ -5,6 +5,28 @@ #include #include +#include "nn_float16.h" + + +void NN__sqr_i8(size_t n, + int8_t *y, size_t incy, + const int8_t *x, size_t incx + ); + +void NN__sqr_i16(size_t n, + int16_t *y, size_t incy, + const int16_t *x, size_t incx + ); + +void NN__sqr_i32(size_t n, + int32_t *y, size_t incy, + const int32_t *x, size_t incx + ); + +void NN__sqr_f16(size_t n, + float16_t *y, 
size_t incy, + const float16_t *x, size_t incx + ); void NN__sqr_f32(size_t n, float *y, size_t incy, diff --git a/nn/impl/sqrt.h b/nn/impl/sqrt.h index 2085a10..b835229 100644 --- a/nn/impl/sqrt.h +++ b/nn/impl/sqrt.h @@ -5,6 +5,13 @@ #include #include +#include "nn_float16.h" + + +void NN__sqrt_f16(size_t n, + float16_t *y, size_t incy, + const float16_t *x, size_t incx + ); void NN__sqrt_f32(size_t n, float *y, size_t incy, diff --git a/nn/impl/sum.h b/nn/impl/sum.h index 4884e8e..8305ced 100644 --- a/nn/impl/sum.h +++ b/nn/impl/sum.h @@ -5,12 +5,19 @@ #include #include +#include "nn_float16.h" + void NN__sum_u8_to_i32(size_t n, int32_t *result, const uint8_t *x, size_t incx ); +void NN__sum_i8_to_i32(size_t n, + int32_t *result, + const int8_t *x, size_t incx + ); + void NN__sum_i16_to_i32(size_t n, int32_t *result, const int16_t *x, size_t incx @@ -21,6 +28,11 @@ void NN__sum_i32(size_t n, const int32_t *x, size_t incx ); +void NN__sum_f16(size_t n, + float16_t *result, + const float16_t *x, size_t incx + ); + void NN__sum_f32(size_t n, float *result, const float *x, size_t incx); diff --git a/nn/impl/transpose.h b/nn/impl/transpose.h index 9e972e8..be542e5 100644 --- a/nn/impl/transpose.h +++ b/nn/impl/transpose.h @@ -2,7 +2,30 @@ #define __NN__TRANSPOSE_H #include +#include +#include "nn_float16.h" + + +void NN__transpose_i8(size_t m, size_t n, + int8_t *y, + const int8_t *x + ); + +void NN__transpose_i16(size_t m, size_t n, + int16_t *y, + const int16_t *x + ); + +void NN__transpose_i32(size_t m, size_t n, + int32_t *y, + const int32_t *x + ); + +void NN__transpose_f16(size_t m, size_t n, + float16_t *y, + const float16_t *x + ); void NN__transpose_f32(size_t m, size_t n, float *y,