From 7147712598a87fad031671fc88fd382534fd9419 Mon Sep 17 00:00:00 2001 From: "-T.K.-" Date: Sat, 20 Jul 2024 13:07:19 -0700 Subject: [PATCH 1/4] FIX: fix import --- CMakeLists.txt | 4 ++-- examples/fast-depth/CMakeLists.txt | 1 + examples/fast-depth/model.h | 1 - examples/llama2/CMakeLists.txt | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b39cf1..a6168a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,8 +44,8 @@ target_compile_features(target-riscv INTERFACE c_std_11) target_compile_definitions(target-riscv INTERFACE RISCV) set(WRAP_SPECS_FILE "htif_wrap.specs") -set(SPECS_FILE "nano.specs") -set(SPEC_FLAGS "-specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE}") +set(SPECS_FILE "htif_nano.specs") +set(SPEC_FLAGS -specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE}) set(MARCH "rv64gcv_zfh_zvfh_zvfhmin") set(MABI "lp64d") diff --git a/examples/fast-depth/CMakeLists.txt b/examples/fast-depth/CMakeLists.txt index af497b9..84047fa 100644 --- a/examples/fast-depth/CMakeLists.txt +++ b/examples/fast-depth/CMakeLists.txt @@ -21,3 +21,4 @@ endif () target_compile_options(fast-depth PRIVATE -O3 -Wall -Wextra) target_link_libraries(fast-depth PUBLIC nn) +target_link_libraries(fast-depth PUBLIC m) diff --git a/examples/fast-depth/model.h b/examples/fast-depth/model.h index 95c456c..23a083a 100644 --- a/examples/fast-depth/model.h +++ b/examples/fast-depth/model.h @@ -2,7 +2,6 @@ #define __MODEL_H #include "nn.h" -#include "nn_relu.h" // load the weight data block from the model.bin file diff --git a/examples/llama2/CMakeLists.txt b/examples/llama2/CMakeLists.txt index b3833b9..ae460fa 100644 --- a/examples/llama2/CMakeLists.txt +++ b/examples/llama2/CMakeLists.txt @@ -19,4 +19,5 @@ endif () target_compile_options(llama2 PRIVATE -O3 -Wall -Wextra) target_link_libraries(llama2 PUBLIC nn) +target_link_libraries(llama2 PUBLIC m) From b1b86068df8fa5eacbe020d58ed346537e822099 Mon Sep 17 00:00:00 2001 From: "-T.K.-" Date: Sat, 20 Jul 2024 13:07:50 -0700 Subject: [PATCH 2/4] FIX: change import directory structure --- nn/CMakeLists.txt | 5 +---- nn/functional/nn_abs.h | 2 +- nn/functional/nn_add.h | 8 ++++---- nn/functional/nn_clip.h | 4 ++-- nn/functional/nn_div.h | 2 +- nn/functional/nn_fill.h | 2 +- nn/functional/nn_layer_norm.h | 12 ++++++------ nn/functional/nn_max.h | 2 +- nn/functional/nn_maximum.h | 2 +- nn/functional/nn_min.h | 2 +- nn/functional/nn_minimum.h | 2 +- nn/functional/nn_mm.h | 2 +- nn/functional/nn_mul.h | 4 ++-- nn/functional/nn_mv.h | 2 +- nn/functional/nn_neg.h | 2 +- nn/functional/nn_relu.h | 2 +- nn/functional/nn_relu6.h | 4 ++-- nn/functional/nn_rms_norm.h | 2 +- nn/functional/nn_silu.h | 2 +- nn/functional/nn_softmax.h | 2 +- nn/functional/nn_sub.h | 2 +- nn/functional/nn_sum.h | 2 +- nn/functional/nn_transpose.h | 2 +- 23 files changed, 34 insertions(+), 37 deletions(-) diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt index b277074..8826e4d 100644 --- a/nn/CMakeLists.txt +++ b/nn/CMakeLists.txt @@ -1,4 +1,4 @@ - +cmake_minimum_required(VERSION 3.10) set(cpu_impl ./impl/cpu/abs.c @@ -128,6 +128,3 @@ endif () target_link_libraries(nn m) - - - diff --git a/nn/functional/nn_abs.h b/nn/functional/nn_abs.h index b3c4131..9ab5ea8 100644 --- a/nn/functional/nn_abs.h +++ b/nn/functional/nn_abs.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "abs.h" +#include "impl/abs.h" /** diff --git a/nn/functional/nn_add.h b/nn/functional/nn_add.h index f234eb3..7ad4670 100644 --- a/nn/functional/nn_add.h +++ b/nn/functional/nn_add.h @@ -6,10 +6,10 @@ #include "nn_tensor.h" #include "nn_print.h" -#include "acc.h" -#include "acc1.h" -#include "add.h" -#include "add1.h" +#include "impl/acc.h" +#include "impl/acc1.h" +#include "impl/add.h" +#include "impl/add1.h" /** diff --git a/nn/functional/nn_clip.h b/nn/functional/nn_clip.h index 7f88c7a..d08ef22 100644 --- a/nn/functional/nn_clip.h +++ b/nn/functional/nn_clip.h @@ -4,8 +4,8 @@ #include #include "nn_tensor.h" -#include "maximum1.h" -#include "minimum1.h" +#include "impl/maximum1.h" +#include "impl/minimum1.h" /** diff --git a/nn/functional/nn_div.h b/nn/functional/nn_div.h index 0c7945c..67f78d9 100644 --- a/nn/functional/nn_div.h +++ b/nn/functional/nn_div.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "div.h" +#include "impl/div.h" /** diff --git a/nn/functional/nn_fill.h b/nn/functional/nn_fill.h index 48638f4..183b8f5 100644 --- a/nn/functional/nn_fill.h +++ b/nn/functional/nn_fill.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "fill.h" +#include "impl/fill.h" /** * Fills the tensor with the specified value. diff --git a/nn/functional/nn_layer_norm.h b/nn/functional/nn_layer_norm.h index 188bafb..0732672 100644 --- a/nn/functional/nn_layer_norm.h +++ b/nn/functional/nn_layer_norm.h @@ -5,12 +5,12 @@ #include #include "nn_tensor.h" -#include "sum.h" -#include "add.h" -#include "add1.h" -#include "mul.h" -#include "mul1.h" -#include "sqr.h" +#include "impl/sum.h" +#include "impl/add.h" +#include "impl/add1.h" +#include "impl/mul.h" +#include "impl/mul1.h" +#include "impl/sqr.h" void NN_layer_norm( diff --git a/nn/functional/nn_max.h b/nn/functional/nn_max.h index 4cdf403..c08e570 100644 --- a/nn/functional/nn_max.h +++ b/nn/functional/nn_max.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "max.h" +#include "impl/max.h" /** diff --git a/nn/functional/nn_maximum.h b/nn/functional/nn_maximum.h index 3b5a0a3..96c1295 100644 --- a/nn/functional/nn_maximum.h +++ b/nn/functional/nn_maximum.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "maximum.h" +#include "impl/maximum.h" /** diff --git a/nn/functional/nn_min.h b/nn/functional/nn_min.h index 37abcd5..9a49c6a 100644 --- a/nn/functional/nn_min.h +++ b/nn/functional/nn_min.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "min.h" +#include "impl/min.h" /** diff --git a/nn/functional/nn_minimum.h b/nn/functional/nn_minimum.h index 07c4e81..10b21c4 100644 --- a/nn/functional/nn_minimum.h +++ b/nn/functional/nn_minimum.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "minimum.h" +#include "impl/minimum.h" /** diff --git a/nn/functional/nn_mm.h b/nn/functional/nn_mm.h index 9effea9..75ce785 100644 --- a/nn/functional/nn_mm.h +++ b/nn/functional/nn_mm.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "dot.h" +#include "impl/dot.h" /** diff --git a/nn/functional/nn_mul.h b/nn/functional/nn_mul.h index f833b1e..24db770 100644 --- a/nn/functional/nn_mul.h +++ b/nn/functional/nn_mul.h @@ -5,8 +5,8 @@ #include #include "nn_tensor.h" -#include "mul.h" -#include "mul1.h" +#include "impl/mul.h" +#include "impl/mul1.h" /** diff --git a/nn/functional/nn_mv.h b/nn/functional/nn_mv.h index 6a9f9a1..df1fe07 100644 --- a/nn/functional/nn_mv.h +++ b/nn/functional/nn_mv.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "dot.h" +#include "impl/dot.h" /** diff --git a/nn/functional/nn_neg.h b/nn/functional/nn_neg.h index 753d20b..3b3afa3 100644 --- a/nn/functional/nn_neg.h +++ b/nn/functional/nn_neg.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "neg.h" +#include "impl/neg.h" /** diff --git a/nn/functional/nn_relu.h b/nn/functional/nn_relu.h index 2bdaea7..cbcc907 100644 --- a/nn/functional/nn_relu.h +++ b/nn/functional/nn_relu.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "maximum1.h" +#include "impl/maximum1.h" /** diff --git a/nn/functional/nn_relu6.h b/nn/functional/nn_relu6.h index 4507d27..546cc8f 100644 --- a/nn/functional/nn_relu6.h +++ b/nn/functional/nn_relu6.h @@ -4,8 +4,8 @@ #include #include "nn_tensor.h" -#include "maximum1.h" -#include "minimum1.h" +#include "impl/maximum1.h" +#include "impl/minimum1.h" /** diff --git a/nn/functional/nn_rms_norm.h b/nn/functional/nn_rms_norm.h index 634d289..4d6f6dc 100644 --- a/nn/functional/nn_rms_norm.h +++ b/nn/functional/nn_rms_norm.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "rms_norm.h" +#include "impl/rms_norm.h" /** diff --git a/nn/functional/nn_silu.h b/nn/functional/nn_silu.h index df4a195..42a5b03 100644 --- a/nn/functional/nn_silu.h +++ b/nn/functional/nn_silu.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "maximum1.h" +#include "impl/maximum1.h" /** diff --git a/nn/functional/nn_softmax.h b/nn/functional/nn_softmax.h index ba377b4..b3a35e9 100644 --- a/nn/functional/nn_softmax.h +++ b/nn/functional/nn_softmax.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "softmax.h" +#include "impl/softmax.h" /** diff --git a/nn/functional/nn_sub.h b/nn/functional/nn_sub.h index 384043a..5019e9b 100644 --- a/nn/functional/nn_sub.h +++ b/nn/functional/nn_sub.h @@ -6,7 +6,7 @@ #include "nn_tensor.h" #include "nn_print.h" -#include "sub.h" +#include "impl/sub.h" /** diff --git a/nn/functional/nn_sum.h b/nn/functional/nn_sum.h index 2f6d402..cd96ae4 100644 --- a/nn/functional/nn_sum.h +++ b/nn/functional/nn_sum.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "sum.h" +#include "impl/sum.h" /** diff --git a/nn/functional/nn_transpose.h b/nn/functional/nn_transpose.h index 1b8c7de..5be3b4a 100644 --- a/nn/functional/nn_transpose.h +++ b/nn/functional/nn_transpose.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "transpose.h" +#include "impl/transpose.h" /** * Transpose a 2D tensor From f52e40a3b1044d81aac20918268dd736cc3d89c9 Mon Sep 17 00:00:00 2001 From: "-T.K.-" Date: Sat, 20 Jul 2024 13:14:27 -0700 Subject: [PATCH 3/4] FIX: fix import --- nn/CMakeLists.txt | 5 +--- nn/impl/cpu/abs.c | 2 +- nn/impl/cpu/acc.c | 2 +- nn/impl/cpu/acc1.c | 2 +- nn/impl/cpu/add.c | 2 +- nn/impl/cpu/add1.c | 2 +- nn/impl/cpu/div.c | 2 +- nn/impl/cpu/dot.c | 2 +- nn/impl/cpu/fill.c | 2 +- nn/impl/cpu/log.c | 2 +- nn/impl/cpu/max.c | 2 +- nn/impl/cpu/maximum.c | 2 +- nn/impl/cpu/maximum1.c | 2 +- nn/impl/cpu/min.c | 2 +- nn/impl/cpu/minimum.c | 2 +- nn/impl/cpu/minimum1.c | 2 +- nn/impl/cpu/mul.c | 2 +- nn/impl/cpu/mul1.c | 2 +- nn/impl/cpu/neg.c | 2 +- nn/impl/cpu/norm.c | 2 +- nn/impl/cpu/rms_norm.c | 2 +- nn/impl/cpu/sgn.c | 2 +- nn/impl/cpu/softmax.c | 2 +- nn/impl/cpu/sqr.c | 2 +- nn/impl/cpu/sqrt.c | 2 +- nn/impl/cpu/sub.c | 2 +- nn/impl/cpu/sum.c | 2 +- nn/impl/cpu/transpose.c | 2 +- nn/impl/rvv/abs.c | 24 ++++++++-------- nn/impl/rvv/acc.c | 26 +++++++++-------- nn/impl/rvv/acc1.c | 24 ++++++++-------- nn/impl/rvv/add.c | 28 ++++++++++--------- nn/impl/rvv/add1.c | 26 +++++++++-------- nn/impl/rvv/div.c | 28 ++++++++++--------- nn/impl/rvv/dot.c | 40 +++++++++++++------------- nn/impl/rvv/fill.c | 20 +++++++------ nn/impl/rvv/max.c | 24 ++++++++-------- nn/impl/rvv/maximum.c | 28 ++++++++++--------- nn/impl/rvv/maximum1.c | 26 +++++++++-------- nn/impl/rvv/min.c | 24 ++++++++-------- nn/impl/rvv/minimum.c | 28 ++++++++++--------- nn/impl/rvv/minimum1.c | 26 +++++++++-------- nn/impl/rvv/mul.c | 28 ++++++++++--------- nn/impl/rvv/mul1.c | 26 +++++++++-------- nn/impl/rvv/neg.c | 2 +- nn/impl/rvv/rms_norm.c | 2 +- nn/impl/rvv/sub.c | 28 ++++++++++--------- nn/impl/rvv/transpose.c | 2 +- nn/nn.h | 62 ++++++++++++++++++++--------------------- 49 files changed, 306 insertions(+), 275 deletions(-) diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt index 8826e4d..c55d8ab 100644 --- a/nn/CMakeLists.txt +++ b/nn/CMakeLists.txt @@ -112,10 +112,7 @@ add_library(nn ${cpu_impl} ) -target_include_directories(nn PUBLIC - ./ - ./functional - ./impl) +target_include_directories(nn PUBLIC ./) if (X86) message(STATUS "nn: Building for x86") diff --git a/nn/impl/cpu/abs.c b/nn/impl/cpu/abs.c index b1f986a..fde7af0 100644 --- a/nn/impl/cpu/abs.c +++ b/nn/impl/cpu/abs.c @@ -1,4 +1,4 @@ -#include "abs.h" +#include "impl/abs.h" __attribute__((weak)) void NN__abs_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/acc.c b/nn/impl/cpu/acc.c index 35e9675..86b2c84 100644 --- a/nn/impl/cpu/acc.c +++ b/nn/impl/cpu/acc.c @@ -1,4 +1,4 @@ -#include "acc.h" +#include "impl/acc.h" __attribute__((weak)) void NN__acc_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/acc1.c b/nn/impl/cpu/acc1.c index 5eec8be..df14c03 100644 --- a/nn/impl/cpu/acc1.c +++ b/nn/impl/cpu/acc1.c @@ -1,4 +1,4 @@ -#include "acc1.h" +#include "impl/acc1.h" __attribute__((weak)) void NN__acc1_i8(size_t n, int8_t *result, size_t incr, int8_t scalar) { diff --git a/nn/impl/cpu/add.c b/nn/impl/cpu/add.c index 3de3045..d321983 100644 --- a/nn/impl/cpu/add.c +++ b/nn/impl/cpu/add.c @@ -1,4 +1,4 @@ -#include "add.h" +#include "impl/add.h" __attribute__((weak)) void NN__add_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/add1.c b/nn/impl/cpu/add1.c index 31cbccc..0add413 100644 --- a/nn/impl/cpu/add1.c +++ b/nn/impl/cpu/add1.c @@ -1,4 +1,4 @@ -#include "add1.h" +#include "impl/add1.h" diff --git a/nn/impl/cpu/div.c b/nn/impl/cpu/div.c index 804595c..6dd320a 100644 --- a/nn/impl/cpu/div.c +++ b/nn/impl/cpu/div.c @@ -1,4 +1,4 @@ -#include "div.h" +#include "impl/div.h" __attribute__((weak)) void NN__div_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/dot.c b/nn/impl/cpu/dot.c index f96fcee..2e432f1 100644 --- a/nn/impl/cpu/dot.c +++ b/nn/impl/cpu/dot.c @@ -1,4 +1,4 @@ -#include "dot.h" +#include "impl/dot.h" __attribute__((weak)) void NN__dot_i8_to_i32(size_t n, int32_t *result, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/fill.c b/nn/impl/cpu/fill.c index f9530a7..ad159ec 100644 --- a/nn/impl/cpu/fill.c +++ b/nn/impl/cpu/fill.c @@ -1,4 +1,4 @@ -#include "fill.h" +#include "impl/fill.h" __attribute__((weak)) void NN__fill_u8(size_t n, uint8_t *x, size_t incx, uint8_t scalar) { diff --git a/nn/impl/cpu/log.c b/nn/impl/cpu/log.c index 6722270..d16c4bc 100644 --- a/nn/impl/cpu/log.c +++ b/nn/impl/cpu/log.c @@ -1,4 +1,4 @@ -#include "log.h" +#include "impl/log.h" __attribute__((weak)) void NN__log_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { diff --git a/nn/impl/cpu/max.c b/nn/impl/cpu/max.c index 441d87a..ea8d0be 100644 --- a/nn/impl/cpu/max.c +++ b/nn/impl/cpu/max.c @@ -1,4 +1,4 @@ -#include "max.h" +#include "impl/max.h" __attribute__((weak)) void NN__max_i8(size_t n, int8_t *result, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/maximum.c b/nn/impl/cpu/maximum.c index 68258b0..9321e43 100644 --- a/nn/impl/cpu/maximum.c +++ b/nn/impl/cpu/maximum.c @@ -1,4 +1,4 @@ -#include "maximum.h" +#include "impl/maximum.h" __attribute__((weak)) void NN__maximum_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/maximum1.c b/nn/impl/cpu/maximum1.c index 01312ac..cc4fd13 100644 --- a/nn/impl/cpu/maximum1.c +++ b/nn/impl/cpu/maximum1.c @@ -1,4 +1,4 @@ -#include "maximum1.h" +#include "impl/maximum1.h" __attribute__((weak)) void NN__maximum1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { diff --git a/nn/impl/cpu/min.c b/nn/impl/cpu/min.c index 57198c8..2154f54 100644 --- a/nn/impl/cpu/min.c +++ b/nn/impl/cpu/min.c @@ -1,4 +1,4 @@ -#include "min.h" +#include "impl/min.h" __attribute__((weak)) void NN__min_i8(size_t n, int8_t *result, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/minimum.c b/nn/impl/cpu/minimum.c index ede2df0..940ab9e 100644 --- a/nn/impl/cpu/minimum.c +++ b/nn/impl/cpu/minimum.c @@ -1,4 +1,4 @@ -#include "minimum.h" +#include "impl/minimum.h" __attribute__((weak)) void NN__minimum_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/minimum1.c b/nn/impl/cpu/minimum1.c index 08f110a..5b9f23f 100644 --- a/nn/impl/cpu/minimum1.c +++ b/nn/impl/cpu/minimum1.c @@ -1,4 +1,4 @@ -#include "minimum1.h" +#include "impl/minimum1.h" __attribute__((weak)) void NN__minimum1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { diff --git a/nn/impl/cpu/mul.c b/nn/impl/cpu/mul.c index 5fbf438..215aa49 100644 --- a/nn/impl/cpu/mul.c +++ b/nn/impl/cpu/mul.c @@ -1,4 +1,4 @@ -#include "mul.h" +#include "impl/mul.h" __attribute__((weak)) void NN__mul_i8(size_t n, int8_t *z, size_t incz, const int8_t *x, size_t incx, const int8_t *y, size_t incy) { diff --git a/nn/impl/cpu/mul1.c b/nn/impl/cpu/mul1.c index 597179a..ad81564 100644 --- a/nn/impl/cpu/mul1.c +++ b/nn/impl/cpu/mul1.c @@ -1,4 +1,4 @@ -#include "mul1.h" +#include "impl/mul1.h" __attribute__((weak)) void NN__mul1_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx, int8_t scalar) { diff --git a/nn/impl/cpu/neg.c b/nn/impl/cpu/neg.c index 6c3d09b..b6fa158 100644 --- a/nn/impl/cpu/neg.c +++ b/nn/impl/cpu/neg.c @@ -1,4 +1,4 @@ -#include "neg.h" +#include "impl/neg.h" __attribute__((weak)) void NN__neg_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/norm.c b/nn/impl/cpu/norm.c index a066f04..4ff6a06 100644 --- a/nn/impl/cpu/norm.c +++ b/nn/impl/cpu/norm.c @@ -1,4 +1,4 @@ -#include "norm.h" +#include "impl/norm.h" __attribute__((weak)) void NN__norm_f32(size_t n, float *result, const float *x, size_t incx) { diff --git a/nn/impl/cpu/rms_norm.c b/nn/impl/cpu/rms_norm.c index e3bce60..59f68a6 100644 --- a/nn/impl/cpu/rms_norm.c +++ b/nn/impl/cpu/rms_norm.c @@ -1,4 +1,4 @@ -#include "rms_norm.h" +#include "impl/rms_norm.h" __attribute__((weak)) void NN__rms_norm_f32(size_t n, float* y, size_t incy, const float* x, size_t incx, const float* w, size_t incw, float eps) { diff --git a/nn/impl/cpu/sgn.c b/nn/impl/cpu/sgn.c index 860d420..c4be269 100644 --- a/nn/impl/cpu/sgn.c +++ b/nn/impl/cpu/sgn.c @@ -1,4 +1,4 @@ -#include "sgn.h" +#include "impl/sgn.h" __attribute__((weak)) void NN__sgn_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { diff --git a/nn/impl/cpu/softmax.c b/nn/impl/cpu/softmax.c index 1af8422..99ae6d7 100644 --- a/nn/impl/cpu/softmax.c +++ b/nn/impl/cpu/softmax.c @@ -1,4 +1,4 @@ -#include "softmax.h" +#include "impl/softmax.h" __attribute__((weak)) void NN__softmax_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { diff --git a/nn/impl/cpu/sqr.c b/nn/impl/cpu/sqr.c index 320ab83..bb5d949 100644 --- a/nn/impl/cpu/sqr.c +++ b/nn/impl/cpu/sqr.c @@ -1,4 +1,4 @@ -#include "sqr.h" +#include "impl/sqr.h" __attribute__((weak)) void NN__sqr_i8(size_t n, int8_t *y, size_t incy, const int8_t *x, size_t incx) { diff --git a/nn/impl/cpu/sqrt.c b/nn/impl/cpu/sqrt.c index 65ef517..5a338b7 100644 --- a/nn/impl/cpu/sqrt.c +++ b/nn/impl/cpu/sqrt.c @@ -1,4 +1,4 @@ -#include "sqrt.h" +#include "impl/sqrt.h" __attribute__((weak)) void NN__sqrt_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { diff --git a/nn/impl/cpu/sub.c b/nn/impl/cpu/sub.c index 738e765..19e49a8 100644 --- a/nn/impl/cpu/sub.c +++ b/nn/impl/cpu/sub.c @@ -1,4 +1,4 @@ -#include "sub.h" +#include "impl/sub.h" __attribute__((weak)) void NN__sub_u8(size_t n, uint8_t *z, size_t incz, const uint8_t *x, size_t incx, const uint8_t *y, size_t incy) { diff --git a/nn/impl/cpu/sum.c b/nn/impl/cpu/sum.c index 8d7fb52..7557fc5 100644 --- a/nn/impl/cpu/sum.c +++ b/nn/impl/cpu/sum.c @@ -1,4 +1,4 @@ -#include "sum.h" +#include "impl/sum.h" __attribute__((weak)) void NN__sum_u8_to_i32(size_t n, int32_t *result, const uint8_t *x, size_t incx) { diff --git a/nn/impl/cpu/transpose.c b/nn/impl/cpu/transpose.c index f3ba509..15f3e63 100644 --- a/nn/impl/cpu/transpose.c +++ b/nn/impl/cpu/transpose.c @@ -1,4 +1,4 @@ -#include "transpose.h" +#include "impl/transpose.h" __attribute__((weak)) void NN__transpose_i8(size_t m, size_t n, int8_t *y, const int8_t *x) { diff --git a/nn/impl/rvv/abs.c b/nn/impl/rvv/abs.c index ea168a2..f76ae0d 100644 --- a/nn/impl/rvv/abs.c +++ b/nn/impl/rvv/abs.c @@ -1,5 +1,5 @@ #include "riscv_vector.h" -#include "abs.h" +#include "impl/abs.h" #ifdef RVV @@ -45,17 +45,19 @@ void NN__abs_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t inc } } -void NN__abs_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__abs_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__abs_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { while (n > 0) { diff --git a/nn/impl/rvv/acc.c b/nn/impl/rvv/acc.c index 5502f96..2e911f1 100644 --- a/nn/impl/rvv/acc.c +++ b/nn/impl/rvv/acc.c @@ -1,5 +1,5 @@ #include -#include "acc.h" +#include "impl/acc.h" #ifdef RVV @@ -42,18 +42,20 @@ void NN__acc_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t inc } } -void NN__acc_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vec_y = __riscv_vfadd_vv_f16m1(vec_y, vec_x, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__acc_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vec_y = __riscv_vfadd_vv_f16m1(vec_y, vec_x, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__acc_f32(size_t n, float *y, size_t incy, const float *x, size_t incx) { while (n > 0) { diff --git a/nn/impl/rvv/acc1.c b/nn/impl/rvv/acc1.c index 7823b29..5f73276 100644 --- a/nn/impl/rvv/acc1.c +++ b/nn/impl/rvv/acc1.c @@ -1,5 +1,5 @@ #include -#include "acc1.h" +#include "impl/acc1.h" #ifdef RVV @@ -39,17 +39,19 @@ void NN__acc1_i32(size_t n, int32_t *result, size_t incx, int32_t scalar) { } } -void NN__acc1_f16(size_t n, float16_t *result, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_r = __riscv_vlse16_v_f16m1(result, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - vec_r = __riscv_vfadd_vv_f16m1(vec_r, vec_s, vl); - __riscv_vse16_v_f16m1(result, vec_r, vl); - result += vl; - n -= vl; +#ifdef ZVFH + void NN__acc1_f16(size_t n, float16_t *result, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_r = __riscv_vlse16_v_f16m1(result, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vec_r = __riscv_vfadd_vv_f16m1(vec_r, vec_s, vl); + __riscv_vse16_v_f16m1(result, vec_r, vl); + result += vl; + n -= vl; + } } -} +#endif void NN__acc1_f32(size_t n, float *result, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/add.c b/nn/impl/rvv/add.c index 869a61c..92af49a 100644 --- a/nn/impl/rvv/add.c +++ b/nn/impl/rvv/add.c @@ -1,5 +1,5 @@ #include -#include "add.h" +#include "impl/add.h" #ifdef RVV @@ -45,19 +45,21 @@ void NN__add_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t inc } } -void NN__add_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfadd_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__add_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfadd_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__add_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/add1.c b/nn/impl/rvv/add1.c index 8b0d5e8..d8c96d7 100644 --- a/nn/impl/rvv/add1.c +++ b/nn/impl/rvv/add1.c @@ -1,5 +1,5 @@ #include -#include "add1.h" +#include "impl/add1.h" #ifdef RVV @@ -42,18 +42,20 @@ void NN__add1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t in } } -void NN__add1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x, vec_s, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__add1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__add1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/div.c b/nn/impl/rvv/div.c index 976975b..e2bb104 100644 --- a/nn/impl/rvv/div.c +++ b/nn/impl/rvv/div.c @@ -1,5 +1,5 @@ #include -#include "div.h" +#include "impl/div.h" #ifdef RVV @@ -45,19 +45,21 @@ void NN__div_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t inc } } -void NN__div_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfdiv_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__div_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfdiv_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__div_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/dot.c b/nn/impl/rvv/dot.c index fabb4f7..ba35780 100644 --- a/nn/impl/rvv/dot.c +++ b/nn/impl/rvv/dot.c @@ -1,5 +1,5 @@ #include -#include "dot.h" +#include "impl/dot.h" #ifdef RVV @@ -63,25 +63,27 @@ void NN__dot_i32(size_t n, int32_t *result, const int32_t *x, size_t incx, const *result = __riscv_vmv_x_s_i32m1_i32(vec_r); } -void NN__dot_f16(size_t n, float16_t *result, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - size_t vlmax = __riscv_vsetvlmax_e16m1(); - - vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax); - vfloat16m1_t vec_r = __riscv_vfmv_v_f_f16m1(0, vlmax); - - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vec_r = __riscv_vfmacc_vv_f16m1(vec_r, vec_x, vec_y, vl); - - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__dot_f16(size_t n, float16_t *result, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + size_t vlmax = __riscv_vsetvlmax_e16m1(); + + vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax); + vfloat16m1_t vec_r = __riscv_vfmv_v_f_f16m1(0, vlmax); + + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vec_r = __riscv_vfmacc_vv_f16m1(vec_r, vec_x, vec_y, vl); + + x += vl; + y += vl; + n -= vl; + } + vec_r = __riscv_vfredusum_vs_f16m1_f16m1(vec_r, vec_zero, vlmax); + *result = __riscv_vfmv_f_s_f16m1_f16(vec_r); } - vec_r = __riscv_vfredusum_vs_f16m1_f16m1(vec_r, vec_zero, vlmax); - *result = __riscv_vfmv_f_s_f16m1_f16(vec_r); -} +#endif void NN__dot_f32(size_t n, float *result, const float *x, size_t incx, const float *y, size_t incy) { size_t vlmax = __riscv_vsetvlmax_e32m1(); diff --git a/nn/impl/rvv/fill.c b/nn/impl/rvv/fill.c index 1de7e39..12ca699 100644 --- a/nn/impl/rvv/fill.c +++ b/nn/impl/rvv/fill.c @@ -1,5 +1,5 @@ #include -#include "fill.h" +#include "impl/fill.h" #ifdef RVV @@ -33,15 +33,17 @@ void NN__fill_i32(size_t n, int32_t *x, size_t incx, int32_t scalar) { } } -void NN__fill_f16(size_t n, float16_t *x, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vfmv_v_f_f16m1(scalar, vl); - __riscv_vsse16_v_f16m1(x, sizeof(float16_t) * incx, vec_x, vl); - x += vl; - n -= vl; +#ifdef ZVFH + void NN__fill_f16(size_t n, float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vfmv_v_f_f16m1(scalar, vl); + __riscv_vsse16_v_f16m1(x, sizeof(float16_t) * incx, vec_x, vl); + x += vl; + n -= vl; + } } -} +#endif void NN__fill_f32(size_t n, float *x, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/max.c b/nn/impl/rvv/max.c index 8e88adf..5edd8f1 100644 --- a/nn/impl/rvv/max.c +++ b/nn/impl/rvv/max.c @@ -1,5 +1,5 @@ #include -#include "max.h" +#include "impl/max.h" #ifdef RVV @@ -39,17 +39,19 @@ void NN__max_i32(size_t n, int32_t *result, const int32_t *x, size_t incx) { *result = __riscv_vmv_x_s_i32m1_i32(vec_max); } -void NN__max_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { - vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT_MAX, 1); - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl); - x += vl; - n -= vl; +#ifdef ZVFH + void NN__max_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { + vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT_MAX, 1); + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl); + x += vl; + n -= vl; + } + *result = __riscv_vfmv_f_s_f16m1_f16(vec_max); } - *result = __riscv_vfmv_f_s_f16m1_f16(vec_max); -} +#endif void NN__max_f32(size_t n, float *result, const float *x, size_t incx) { vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1); diff --git a/nn/impl/rvv/maximum.c b/nn/impl/rvv/maximum.c index f877d7b..e2aa770 100644 --- a/nn/impl/rvv/maximum.c +++ b/nn/impl/rvv/maximum.c @@ -1,5 +1,5 @@ #include -#include "maximum.h" +#include "impl/maximum.h" #ifdef RVV @@ -45,19 +45,21 @@ void NN__maximum_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t } } -void NN__maximum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfmax_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__maximum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfmax_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__maximum_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/maximum1.c b/nn/impl/rvv/maximum1.c index aeb2ef0..5f9f4b1 100644 --- a/nn/impl/rvv/maximum1.c +++ b/nn/impl/rvv/maximum1.c @@ -1,5 +1,5 @@ #include -#include "maximum1.h" +#include "impl/maximum1.h" #ifdef RVV @@ -42,18 +42,20 @@ void NN__maximum1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_ } } -void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - vfloat16m1_t vec_y = __riscv_vfmax_vv_f16m1(vec_x, vec_s, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfmax_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__maximum1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/min.c b/nn/impl/rvv/min.c index d17794d..35fafe3 100644 --- a/nn/impl/rvv/min.c +++ b/nn/impl/rvv/min.c @@ -1,5 +1,5 @@ #include -#include "min.h" +#include "impl/min.h" #ifdef RVV @@ -39,17 +39,19 @@ void NN__min_i32(size_t n, int32_t *result, const int32_t *x, size_t incx) { *result = __riscv_vmv_x_s_i32m1_i32(vec_max); } -void NN__min_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { - vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT_MAX, 1); - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vec_max = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_max, vl); - x += vl; - n -= vl; +#ifdef ZVFH + void NN__min_f16(size_t n, float16_t *result, const float16_t *x, size_t incx) { + vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT_MAX, 1); + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vec_max = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_max, vl); + x += vl; + n -= vl; + } + *result = __riscv_vfmv_f_s_f16m1_f16(vec_max); } - *result = __riscv_vfmv_f_s_f16m1_f16(vec_max); -} +#endif void NN__min_f32(size_t n, float *result, const float *x, size_t incx) { vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1); diff --git a/nn/impl/rvv/minimum.c b/nn/impl/rvv/minimum.c index ff93f1a..5301535 100644 --- a/nn/impl/rvv/minimum.c +++ b/nn/impl/rvv/minimum.c @@ -1,5 +1,5 @@ #include -#include "minimum.h" +#include "impl/minimum.h" #ifdef RVV @@ -45,19 +45,21 @@ void NN__minimum_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t } } -void NN__minimum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfmin_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__minimum_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfmin_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__minimum_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/minimum1.c b/nn/impl/rvv/minimum1.c index 737ff92..eadf127 100644 --- a/nn/impl/rvv/minimum1.c +++ b/nn/impl/rvv/minimum1.c @@ -1,5 +1,5 @@ #include -#include "minimum1.h" +#include "impl/minimum1.h" #ifdef RVV @@ -42,18 +42,20 @@ void NN__minimum1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_ } } -void NN__minimum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - vfloat16m1_t vec_y = __riscv_vfmin_vv_f16m1(vec_x, vec_s, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__minimum1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfmin_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__minimum1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/mul.c b/nn/impl/rvv/mul.c index 71bb296..b39487b 100644 --- a/nn/impl/rvv/mul.c +++ b/nn/impl/rvv/mul.c @@ -1,5 +1,5 @@ #include -#include "mul.h" +#include "impl/mul.h" #ifdef RVV @@ -45,19 +45,21 @@ void NN__mul_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t inc } } -void NN__mul_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfmul_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__mul_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfmul_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__mul_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/mul1.c b/nn/impl/rvv/mul1.c index 829250d..e082ab9 100644 --- a/nn/impl/rvv/mul1.c +++ b/nn/impl/rvv/mul1.c @@ -1,5 +1,5 @@ #include -#include "mul1.h" +#include "impl/mul1.h" #ifdef RVV @@ -42,18 +42,20 @@ void NN__mul1_i32(size_t n, int32_t *y, size_t incy, const int32_t *x, size_t in } } -void NN__mul1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x, vec_s, vl); - __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - x += vl; - y += vl; - n -= vl; +#ifdef ZVFH + void NN__mul1_f16(size_t n, float16_t *y, size_t incy, const float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } } -} +#endif void NN__mul1_f32(size_t n, float *y, size_t incy, const float *x, size_t incx, float scalar) { while (n > 0) { diff --git a/nn/impl/rvv/neg.c b/nn/impl/rvv/neg.c index ec3f56a..9846a10 100644 --- a/nn/impl/rvv/neg.c +++ b/nn/impl/rvv/neg.c @@ -1,5 +1,5 @@ #include -#include "neg.h" +#include "impl/neg.h" #ifdef RVV diff --git a/nn/impl/rvv/rms_norm.c b/nn/impl/rvv/rms_norm.c index 7f6e2b0..521cb9f 100644 --- a/nn/impl/rvv/rms_norm.c +++ b/nn/impl/rvv/rms_norm.c @@ -1,5 +1,5 @@ #include -#include "rms_norm.h" +#include "impl/rms_norm.h" #ifdef RVV diff --git a/nn/impl/rvv/sub.c b/nn/impl/rvv/sub.c index 0d494ee..d353b31 100644 --- a/nn/impl/rvv/sub.c +++ b/nn/impl/rvv/sub.c @@ -1,5 +1,5 @@ #include -#include "sub.h" +#include "impl/sub.h" #ifdef RVV @@ -59,19 +59,21 @@ void NN__sub_i32(size_t n, int32_t *z, size_t incz, const int32_t *x, size_t inc } } -void NN__sub_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - vfloat16m1_t vec_z = __riscv_vfsub_vv_f16m1(vec_x, vec_y, vl); - __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; +#ifdef ZVFH + void NN__sub_f16(size_t n, float16_t *z, size_t incz, const float16_t *x, size_t incx, const float16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfsub_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } } -} +#endif void NN__sub_f32(size_t n, float *z, size_t incz, const float *x, size_t incx, const float *y, size_t incy) { while (n > 0) { diff --git a/nn/impl/rvv/transpose.c b/nn/impl/rvv/transpose.c index a542dcc..4cf8e9d 100644 --- a/nn/impl/rvv/transpose.c +++ b/nn/impl/rvv/transpose.c @@ -1,5 +1,5 @@ #include -#include "neg.h" +#include "impl/neg.h" #ifdef RVV diff --git a/nn/nn.h b/nn/nn.h index cc178c4..0ab5298 100644 --- a/nn/nn.h +++ b/nn/nn.h @@ -5,37 +5,37 @@ #include "nn_float16.h" #include "nn_tensor.h" -#include "nn_tensor_creation.h" -#include "nn_print.h" -#include "nn_abs.h" -#include "nn_add.h" -#include "nn_batch_norm2d.h" -#include "nn_clip.h" -#include "nn_conv2d.h" -#include "nn_copy.h" -#include "nn_div.h" -#include "nn_elu.h" -#include "nn_fill.h" -#include "nn_interpolate.h" -#include "nn_layer_norm.h" -#include "nn_linear.h" -#include "nn_matmul.h" -#include "nn_norm.h" -#include "nn_max.h" -#include "nn_mm.h" -#include "nn_maximum.h" -#include "nn_min.h" -#include "nn_minimum.h" -#include "nn_mul.h" -#include "nn_mv.h" -#include "nn_neg.h" -#include "nn_relu.h" -#include "nn_relu6.h" -#include "nn_rms_norm.h" -#include "nn_softmax.h" -#include "nn_silu.h" -#include "nn_sub.h" -#include "nn_sum.h" +#include "functional/nn_tensor_creation.h" +#include "functional/nn_print.h" +#include "functional/nn_abs.h" +#include "functional/nn_add.h" +#include "functional/nn_batch_norm2d.h" +#include "functional/nn_clip.h" +#include "functional/nn_conv2d.h" +#include "functional/nn_copy.h" +#include "functional/nn_div.h" +#include "functional/nn_elu.h" +#include "functional/nn_fill.h" +#include "functional/nn_interpolate.h" +#include "functional/nn_layer_norm.h" +#include "functional/nn_linear.h" +#include "functional/nn_matmul.h" +#include "functional/nn_norm.h" +#include "functional/nn_max.h" +#include "functional/nn_mm.h" +#include "functional/nn_maximum.h" +#include "functional/nn_min.h" +#include "functional/nn_minimum.h" +#include "functional/nn_mul.h" +#include "functional/nn_mv.h" +#include "functional/nn_neg.h" +#include "functional/nn_relu.h" +#include "functional/nn_relu6.h" +#include "functional/nn_rms_norm.h" +#include "functional/nn_softmax.h" +#include "functional/nn_silu.h" +#include "functional/nn_sub.h" +#include "functional/nn_sum.h" // http://elm-chan.org/junk/32bit/binclude.html From a755bf62dc36da8a6b67ec2a747e49eb5664387b Mon Sep 17 00:00:00 2001 From: "-T.K.-" Date: Sat, 20 Jul 2024 13:14:49 -0700 Subject: [PATCH 4/4] ADD: update tests --- tests/CMakeLists.txt | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 306f0b2..462a412 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,5 @@ +cmake_minimum_required(VERSION 3.10) + project(tests LANGUAGES C) add_executable(tests src/generated.c) @@ -12,15 +14,6 @@ elseif (RISCV) endif() -# include_directories( -# ../nn -# ../nn/functional -# ../nn/impl) - -# find_library(LIB_TO_INCLUDE nn ./) - -# target_link_libraries(tests PUBLIC ${LIB_TO_INCLUDE}) - target_link_libraries(tests PUBLIC nn) target_link_libraries(tests PUBLIC m)