diff --git a/CMakeLists.txt b/CMakeLists.txt index 9775019..94dccda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ set(WRAP_SPECS_FILE "htif_wrap.specs") set(SPECS_FILE "htif_nano.specs") set(LIBGLOSS_DIR "$ENV{RISCV}/riscv64-unknown-elf/lib/") -set(MARCH "rv64gcv_zfh_zvfh") +set(MARCH "rv64gcv_zfh_zvfh_zvfhmin") set(MABI "lp64d") set(MCMODEL "medany") @@ -60,6 +60,8 @@ target_compile_options(target-riscv INTERFACE -march=${MARCH} -mabi=${MABI} -mcm target_compile_options(target-riscv INTERFACE -Wl,-Map=output.map -specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE}) target_compile_options(target-riscv INTERFACE -T ${LINKER_SCRIPT}) +target_compile_definitions(target-riscv INTERFACE FLT16_MAX=65504.0f) + target_link_options(target-riscv INTERFACE -static) target_link_options(target-riscv INTERFACE -march=${MARCH} -mabi=${MABI} -mcmodel=${MCMODEL}) target_link_options(target-riscv INTERFACE -Wl,-Map=output.map -specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE}) diff --git a/nn/impl/rvv/abs.c b/nn/impl/rvv/abs.c index 5a64229..6e745f2 100644 --- a/nn/impl/rvv/abs.c +++ b/nn/impl/rvv/abs.c @@ -45,17 +45,17 @@ void NN__abs_i32(size_t n, int32_t *y, size_t incy, int32_t *x, size_t incx) { } } -// void NN__abs_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx) { -// while (n > 0) { -// size_t vl = __riscv_vsetvl_e16m1(n); -// vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); -// vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl); -// __riscv_vse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); -// x += vl; -// y += vl; -// n -= vl; -// } -// } +void NN__abs_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} void NN__abs_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { while (n > 0) { diff --git a/nn/impl/rvv/add.c b/nn/impl/rvv/add.c index c39aa16..cd0269a 100644 --- a/nn/impl/rvv/add.c +++ b/nn/impl/rvv/add.c @@ -20,25 +20,11 @@ void NN__add_i8(size_t n, int8_t *z, size_t incz, int8_t *x, size_t incx, int8_t void NN__add_f16(size_t n, float16_t *z, size_t incz, float16_t *x, size_t incx, float16_t *y, size_t incy) { while (n > 0) { - size_t vl; - - printf("hi\n"); - - // size_t vl = __riscv_vsetvl_e16m1(n); - asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); - - // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - asm volatile("vlse16.v v24, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); - - // vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); - asm volatile("vlse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); - - // // vfloat16m1_t vec_z = __riscv_vfadd_vv_f16m1(vec_x, vec_y, vl); - asm volatile("vfadd.vv v24, v24, v25"); - - // __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); - asm volatile("vsse16.v v24, (%0), %1" : : "r"(z), "r"(sizeof(float16_t) * incz)); - + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + vfloat16m1_t vec_z = __riscv_vfadd_vv_f16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); x += vl; y += vl; z += vl; diff --git a/nn/impl/rvv/maximum1.c b/nn/impl/rvv/maximum1.c index ab908b7..cb4b06f 100644 --- a/nn/impl/rvv/maximum1.c +++ b/nn/impl/rvv/maximum1.c @@ -6,22 +6,11 @@ void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx, float16_t scalar) { while (n > 0) { - size_t vl; - // size_t vl = __riscv_vsetvl_e16m1(n); - asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); - - // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - asm volatile("vlse16.v v26, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); - - // vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - asm volatile("vmv.v.x v25, %0" : : "r"(scalar)); - - // vfloat16m1_t vec_y = __riscv_vfmax_vv_f16m1(vec_x, vec_s, vl); - asm volatile("vfmax.vv v25, v26, v25"); - - // __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - asm volatile("vsse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); - + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfmax_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); x += vl; y += vl; n -= vl; diff --git a/nn/impl/rvv/minimum1.c b/nn/impl/rvv/minimum1.c index 040cbda..135bfe4 100644 --- a/nn/impl/rvv/minimum1.c +++ b/nn/impl/rvv/minimum1.c @@ -6,22 +6,11 @@ void NN__minimum1_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx, float16_t scalar) { while (n > 0) { - size_t vl; - // size_t vl = __riscv_vsetvl_e16m1(n); - asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); - - // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); - asm volatile("vlse16.v v26, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); - - // vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); - asm volatile("vmv.v.x v25, %0" : : "r"(scalar)); - - // vfloat16m1_t vec_y = __riscv_vfmin_vv_f16m1(vec_x, vec_s, vl); - asm volatile("vfmin.vv v25, v26, v25"); - - // __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); - asm volatile("vsse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); - + size_t vl = __riscv_vsetvl_e16m1(n); + vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + vfloat16m1_t vec_y = __riscv_vfmin_vv_f16m1(vec_x, vec_s, vl); + __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); x += vl; y += vl; n -= vl; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae32572..e2b9ffc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,6 +12,15 @@ elseif (RISCV) endif() -target_link_libraries(tests PUBLIC nn) +# include_directories( +# ../nn +# ../nn/functional +# ../nn/impl) + +find_library(LIB_TO_INCLUDE nn ./) + +target_link_libraries(tests PUBLIC ${LIB_TO_INCLUDE}) + +# target_link_libraries(tests PUBLIC nn) target_link_libraries(tests PUBLIC m) diff --git a/tests/src/generate_test.py b/tests/src/generate_test.py index bd37d49..ca795a6 100644 --- a/tests/src/generate_test.py +++ b/tests/src/generate_test.py @@ -41,9 +41,9 @@ def functional_rms_norm(x, w, eps): test_pattern = [ - # ("abs", lambda a: torch.abs(a), [("a", rand((7, 7))), ]), - # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((6, 7))) ]), - # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((1, 7))) ]), + ("abs", lambda a: torch.abs(a), [("a", rand((7, 7))), ]), + ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((6, 7))) ]), + ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((1, 7))) ]), # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((6, 1))) ]), # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((7, ))) ]), # ("add_inplace", lambda a, b: a + b, [("actual", torch.zeros((7, 7))), ("b", rand((7, 7))) ]), diff --git a/tests/src/generated.c b/tests/src/generated.c index 4433723..5dd4b06 100644 --- a/tests/src/generated.c +++ b/tests/src/generated.c @@ -16,12 +16,87 @@ int main() { { printf("abs: "); - // [[-0.332 -0.454 0.6143 0.1875 -0.1846 -0.255 0.6904]] - Tensor *a = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0x50,0xb5,0x44,0xb7,0xea,0x38,0x0,0x32,0xe8,0xb1,0x14,0xb4,0x86,0x39 }); + // [[-0.0374341 2.682218 -4.115226 -3.6796951 -1.9257718 1.3407868 -0.09 + Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (uint8_t[]){ 0x80,0x54,0x19,0xbd,0x76,0xa9,0x2b,0x40,0xee,0xaf,0x83,0xc0,0x20,0x80,0x6b,0xc0,0xb1,0x7f,0xf6,0xbf,0xe7,0x9e,0xab,0x3f,0x10,0xe3,0xca,0xbd,0x82,0xb9,0x7d,0x40,0x4c,0x2f,0xe3,0xbe,0x1f,0x5a,0xa9,0x3f,0x97,0x6a,0xc1,0xbf,0x8c,0x9a,0x7b,0xbf,0x13,0xdb,0x98,0xc0,0x26,0xee,0x53,0xc0,0x51,0xe9,0x3,0xc0,0xc0,0xa9,0x3d,0x3e,0xb8,0x3,0xfd,0x3f,0xde,0x1,0x40,0x40,0xef,0xf0,0x58,0xc0,0x1e,0x59,0xb,0xc0,0x7e,0x75,0xe8,0x3f,0xb1,0xdc,0x84,0x40,0x4d,0xb6,0x83,0xbf,0xb3,0x75,0x6f,0x40,0x90,0x50,0x4e,0xbf,0x2a,0x71,0x7,0x3f,0x4e,0xe0,0x90,0x40,0x61,0x6d,0x94,0xc0,0xc0,0x73,0x49,0xc0,0x98,0x6,0xa2,0xbf,0xd3,0x78,0xf9,0xbf,0x79,0x3d,0x8a,0x40,0xe0,0x6a,0x4f,0xc0,0x78,0x4e,0x13,0xc0,0xa0,0x90,0x5f,0xc0,0x8a,0xd9,0x95,0xc0,0x5,0xcc,0x3a,0xc0,0x23,0x89,0x89,0x40,0x36,0xca,0xe,0x40,0x60,0x18,0x1b,0x40,0x68,0xa2,0x86,0x3e,0x8,0xf,0x24,0xc0,0x70,0x8e,0x58,0x3f,0x23,0x64,0x95,0xc0,0xa0,0x38,0x67,0xc0,0x38,0xf8,0x24,0xc0,0x70,0xe6,0x49,0x40,0x70,0x9f,0x3b,0x40,0x1d,0xeb,0xd,0xc0 }); - // [[0.332 0.454 0.6143 0.1875 0.1846 0.255 0.6904]] - Tensor *golden = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0x50,0x35,0x44,0x37,0xea,0x38,0x0,0x32,0xe8,0x31,0x14,0x34,0x86,0x39 }); + // [[0.0374341 2.682218 4.115226 3.6796951 1.9257718 1.3407868 0.0990659 ] + Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (uint8_t[]){ 0x80,0x54,0x19,0x3d,0x76,0xa9,0x2b,0x40,0xee,0xaf,0x83,0x40,0x20,0x80,0x6b,0x40,0xb1,0x7f,0xf6,0x3f,0xe7,0x9e,0xab,0x3f,0x10,0xe3,0xca,0x3d,0x82,0xb9,0x7d,0x40,0x4c,0x2f,0xe3,0x3e,0x1f,0x5a,0xa9,0x3f,0x97,0x6a,0xc1,0x3f,0x8c,0x9a,0x7b,0x3f,0x13,0xdb,0x98,0x40,0x26,0xee,0x53,0x40,0x51,0xe9,0x3,0x40,0xc0,0xa9,0x3d,0x3e,0xb8,0x3,0xfd,0x3f,0xde,0x1,0x40,0x40,0xef,0xf0,0x58,0x40,0x1e,0x59,0xb,0x40,0x7e,0x75,0xe8,0x3f,0xb1,0xdc,0x84,0x40,0x4d,0xb6,0x83,0x3f,0xb3,0x75,0x6f,0x40,0x90,0x50,0x4e,0x3f,0x2a,0x71,0x7,0x3f,0x4e,0xe0,0x90,0x40,0x61,0x6d,0x94,0x40,0xc0,0x73,0x49,0x40,0x98,0x6,0xa2,0x3f,0xd3,0x78,0xf9,0x3f,0x79,0x3d,0x8a,0x40,0xe0,0x6a,0x4f,0x40,0x78,0x4e,0x13,0x40,0xa0,0x90,0x5f,0x40,0x8a,0xd9,0x95,0x40,0x5,0xcc,0x3a,0x40,0x23,0x89,0x89,0x40,0x36,0xca,0xe,0x40,0x60,0x18,0x1b,0x40,0x68,0xa2,0x86,0x3e,0x8,0xf,0x24,0x40,0x70,0x8e,0x58,0x3f,0x23,0x64,0x95,0x40,0xa0,0x38,0x67,0x40,0x38,0xf8,0x24,0x40,0x70,0xe6,0x49,0x40,0x70,0x9f,0x3b,0x40,0x1d,0xeb,0xd,0x40 }); + Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32); + + cycles = read_cycles(); + NN_abs(actual, a); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-4) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(a); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + + { + printf("add: "); + + // [[-0.18041193 3.1978035 4.9706655 1.9844109 0.6754643 3.3524318 -2.94 + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (uint8_t[]){ 0xe8,0xbd,0x38,0xbe,0xd0,0xa8,0x4c,0x40,0xb1,0xf,0x9f,0x40,0x2d,0x1,0xfe,0x3f,0x3a,0xeb,0x2c,0x3f,0x3e,0x8e,0x56,0x40,0xb0,0x6a,0x3c,0xc0,0x36,0x85,0x6e,0x3f,0x7,0x19,0x78,0xc0,0x9e,0xc9,0x5d,0xc0,0x86,0x4e,0x25,0xc0,0x98,0xca,0x10,0x40,0xfb,0xb0,0x0,0x40,0x84,0x8d,0x3d,0xc0,0x3a,0x59,0xc1,0x3f,0xca,0xab,0x2f,0x40,0xe8,0x8e,0x21,0xbf,0x50,0x7d,0x43,0x3e,0x80,0x4a,0x94,0x3f,0x40,0x85,0x46,0x40,0x8d,0xa1,0x99,0x40,0x7c,0x99,0x76,0xc0,0x67,0x8a,0xea,0xbf,0xbe,0x86,0xfb,0x3f,0x62,0x91,0x84,0x40,0xb1,0x3b,0x8b,0x40,0x55,0x2d,0x8d,0x40,0x16,0xbd,0x7e,0x3f,0x1b,0x22,0x8b,0xc0,0x34,0x80,0xeb,0x3e,0x97,0x31,0x48,0xc0,0xd8,0x1c,0x95,0xc0,0xa7,0x28,0x8e,0x40,0xac,0x50,0x73,0x40,0xbf,0x9a,0x9f,0xc0,0x8a,0x94,0x6f,0x3f,0xfa,0xa0,0x57,0xbf,0x66,0xa3,0x52,0xbf,0x72,0x7b,0x12,0xc0,0xaf,0x1d,0xf6,0x3f,0x81,0x89,0x3d,0xc0,0x53,0x9e,0xea,0x3f }); + // [[ 2.5285406 3.5793579 1.8695557 -4.948676 -3.2434845 2.496575 1.046 + Tensor *b = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (uint8_t[]){ 0x9c,0xd3,0x21,0x40,0x33,0x14,0x65,0x40,0x9a,0x4d,0xef,0x3f,0x8e,0x5b,0x9e,0xc0,0x40,0x95,0x4f,0xc0,0xe3,0xc7,0x1f,0x40,0xef,0xf3,0x85,0x3f,0x7c,0xa0,0x79,0xc0,0x22,0x43,0x38,0xc0,0x17,0x85,0x96,0x40,0x29,0x9f,0x57,0x40,0x2e,0x87,0xb,0xc0,0x9,0x14,0xa1,0xbf,0x6b,0x6a,0x98,0xc0,0x20,0xe,0xb8,0xbd,0x96,0xfa,0x70,0xc0,0x8a,0xd5,0x76,0xc0,0x14,0xe,0x8d,0xbe,0x88,0x2f,0x40,0x3f,0xb8,0xc,0x3,0xc0,0x7e,0xe1,0x3d,0x40,0x86,0xbb,0x42,0xc0,0xe1,0x2d,0x91,0x40,0xc4,0x4b,0x5b,0x40,0xde,0xec,0x86,0xc0,0x35,0x49,0x9f,0xbf,0x20,0x7,0x67,0x3e,0xde,0xc0,0x3a,0x3f,0xa2,0xca,0x97,0x3f,0x73,0x27,0xfb,0x3f,0x2c,0x58,0x99,0x3e,0x20,0x23,0x1c,0xc0,0xa4,0x6b,0x17,0x40,0xd6,0x7a,0x99,0xc0,0x88,0xaa,0x3d,0xc0,0xb,0x36,0xa0,0xbf,0x54,0xe0,0x1b,0xc0,0xb3,0xe4,0xdf,0xbf,0xb4,0x23,0x83,0xc0,0x40,0x23,0x88,0xbf,0xdb,0xcd,0x88,0x3f,0x14,0x78,0x50,0xc0 }); + + + // [[ 2.3481288 6.7771616 6.8402214 -2.9642653 -2.5680203 5.8490067 -1.89 + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (uint8_t[]){ 0xbe,0x47,0x16,0x40,0x82,0xde,0xd8,0x40,0x18,0xe3,0xda,0x40,0x86,0xb6,0x3d,0xc0,0x72,0x5a,0x24,0xc0,0x10,0x2b,0xbb,0x40,0x71,0xe1,0xf2,0xbf,0x2e,0xff,0x3d,0xc0,0x14,0x2e,0xd8,0xc0,0x20,0x81,0x9e,0x3f,0x8c,0x42,0x49,0x3f,0x40,0x6d,0xa8,0x3d,0xda,0x9b,0x40,0x3f,0x2d,0x31,0xf7,0xc0,0x58,0xd8,0xb5,0x3f,0x98,0x9d,0x82,0xbf,0xa2,0x9c,0x8f,0xc0,0xb0,0x3d,0xad,0xbd,0x44,0x62,0xf4,0x3f,0x10,0xf1,0x86,0x3f,0x4c,0x92,0xf8,0x40,0x81,0xaa,0xdc,0xc0,0x8e,0x16,0x2d,0x40,0x92,0x87,0xac,0x40,0x0,0xdf,0x96,0xbd,0xc8,0xd2,0x46,0x40,0x8e,0x65,0x94,0x40,0xfa,0xbe,0xdc,0x3f,0xe5,0x5e,0x4a,0xc0,0xc0,0x3,0x1b,0x40,0x92,0x6,0x35,0xc0,0x68,0x2e,0xe3,0xc0,0x79,0xde,0xd9,0x40,0x0,0x94,0x7e,0xbf,0x3,0x70,0xfe,0xc0,0x18,0xaf,0xa1,0xbe,0x92,0xc8,0x51,0xc0,0x33,0x9b,0x24,0xc0,0x6d,0x61,0xcc,0xc0,0xde,0xf4,0x5b,0x3f,0x27,0x45,0xf2,0xbf,0xd5,0x51,0xb6,0xbf }); + Tensor *actual = NN_zeros(2, (size_t[]){ 6, 7 }, DTYPE_F32); + + cycles = read_cycles(); + NN_add(actual, a, b); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-4) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(a); + NN_delete_tensor(b); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + + { + printf("add: "); + + // [[-0.2565968 3.5792542 -0.51400125 0.13896108 -0.4313445 1.0119069 3.17 + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (uint8_t[]){ 0xa8,0x60,0x83,0xbe,0x80,0x12,0x65,0x40,0x96,0x95,0x3,0xbf,0xd0,0x4b,0xe,0x3e,0x30,0xd9,0xdc,0xbe,0x2a,0x86,0x81,0x3f,0xf8,0x77,0x4b,0x40,0x34,0x8f,0x97,0x40,0xc8,0x37,0x4b,0x40,0xfa,0xe7,0x97,0x40,0xb8,0x24,0xb9,0xbe,0x40,0xbb,0x8f,0xc0,0x68,0xb4,0x17,0xc0,0xc2,0xe3,0x59,0x40,0xe0,0xc2,0x4,0xbd,0xa,0xe,0x1f,0xc0,0x42,0x38,0x75,0xc0,0x80,0xbc,0x95,0xc0,0x94,0xa,0x87,0xc0,0xc6,0xd0,0x81,0xbf,0x6c,0x7d,0x2f,0x40,0x51,0x1,0x2d,0x40,0x21,0x4f,0x9a,0xc0,0x3a,0x9c,0x47,0x40,0x2c,0x67,0x7a,0xc0,0x76,0x4d,0x87,0xbf,0x52,0xc0,0x1,0xc0,0x2c,0x8c,0x76,0xbf,0x94,0x51,0x7b,0xbf,0x74,0x93,0x8f,0xc0,0x6a,0x26,0x8a,0xc0,0x32,0x4b,0x48,0xbf,0xf0,0x6c,0x84,0x3d,0x33,0x5e,0x11,0xc0,0x66,0x16,0xf1,0x3f,0x64,0x2,0x90,0xc0,0x70,0xc4,0xac,0xbe,0x5,0xb5,0x8c,0x40,0x83,0x86,0x2,0xc0,0x2,0x7b,0x90,0x40,0x46,0xc7,0xe7,0x3f,0xcd,0x64,0x90,0xc0 }); + // [[ 3.1634867 -0.57697237 -2.2320342 3.9982665 -4.040476 0.53652465 -1.0 + Tensor *b = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F32, (uint8_t[]){ 0x91,0x76,0x4a,0x40,0x76,0xb4,0x13,0xbf,0xa6,0xd9,0xe,0xc0,0x99,0xe3,0x7f,0x40,0x94,0x4b,0x81,0xc0,0xae,0x59,0x9,0x3f,0xf8,0xfe,0x85,0xbf }); + + + // [[ 2.90689 3.0022817 -2.7460356 4.1372275 -4.4718204 1.5484315 2.13 + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (uint8_t[]){ 0x7c,0xa,0x3a,0x40,0x62,0x25,0x40,0x40,0xc,0xbf,0x2f,0xc0,0x2b,0x64,0x84,0x40,0x27,0x19,0x8f,0xc0,0x1,0x33,0xc6,0x3f,0x7c,0x78,0x8,0x40,0x7c,0xca,0xfc,0x40,0xaa,0x4a,0x26,0x40,0x4e,0xf6,0x20,0x40,0x2,0xbf,0x68,0x40,0x6a,0x83,0x8,0xc1,0xf9,0xbb,0xea,0xbf,0x46,0xe4,0x16,0x40,0x86,0x63,0x48,0x40,0x28,0xfb,0x43,0xc0,0xf4,0x8,0xc2,0xc0,0x9c,0x55,0x2e,0xbf,0x14,0x2b,0x4,0xc1,0xbc,0x8f,0xf4,0xbe,0xe0,0xfb,0xd8,0x3f,0xf1,0xbb,0xbb,0x40,0xb0,0xc5,0xac,0xc0,0x50,0xa,0x63,0x3f,0xa0,0x8d,0xaf,0x3d,0xf2,0x1e,0xa3,0xc0,0xcd,0xd3,0xbe,0xbf,0x87,0xa2,0x0,0xc0,0x2c,0xa2,0xb,0x40,0x3,0xa,0xa2,0xc0,0x3d,0x93,0xd1,0xc0,0xcc,0xd0,0x4d,0x40,0xc0,0x73,0x7e,0xc0,0x8f,0xf,0xde,0xbf,0xdc,0x2e,0x56,0x3f,0x6e,0x1c,0xab,0xbf,0xae,0x16,0x6a,0xbf,0x64,0x90,0xa,0x40,0x2c,0xba,0xfa,0x3f,0xe0,0xf6,0xf2,0x3e,0xe,0x3a,0x16,0x40,0x8b,0xe4,0xb1,0xc0 }); + Tensor *actual = NN_zeros(2, (size_t[]){ 6, 7 }, DTYPE_F32); + + cycles = read_cycles(); + NN_add(actual, a, b); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-4) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(a); + NN_delete_tensor(b); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + + { + printf("abs: "); + + // [[-0.9893 -0.248 -0.7 0.0781 -0.9688 0.1777 0.125 ]] + Tensor *a = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0xea,0xbb,0xf0,0xb3,0x9a,0xb9,0x0,0x2d,0xc0,0xbb,0xb0,0x31,0x0,0x30 }); + + + // [[0.9893 0.248 0.7 0.0781 0.9688 0.1777 0.125 ]] + Tensor *golden = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0xea,0x3b,0xf0,0x33,0x9a,0x39,0x0,0x2d,0xc0,0x3b,0xb0,0x31,0x0,0x30 }); Tensor *actual = NN_zeros(2, (size_t[]){ 1, 7 }, DTYPE_F16); cycles = read_cycles(); @@ -39,14 +114,14 @@ int main() { { printf("add: "); - // [[ 0.3506 0.00879 0.706 -0.7295 0.7363 0.785 -0.415 ] [ 0.06836 0. - Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x9c,0x35,0x80,0x20,0xa6,0x39,0xd6,0xb9,0xe4,0x39,0x48,0x3a,0xa4,0xb6,0x60,0x2c,0x60,0x37,0xa0,0xb6,0x30,0x36,0x8c,0xb9,0xfc,0xb4,0x98,0x37,0x64,0xb7,0x52,0xbb,0xa4,0xba,0xb0,0x38,0x78,0xb1,0x4a,0x39,0x9c,0xb7,0x98,0xb1,0x70,0xbb,0xb8,0xb3,0x28,0x3b,0x1a,0xbb,0x0,0xaa,0x34,0xb4,0x8a,0x39,0xfc,0x3b,0x9e,0x3a,0xbc,0x36,0x80,0xb8,0x70,0xb1,0x3a,0xbb,0xb0,0x3b,0x88,0x31,0x34,0xb4,0x18,0xb3,0xb4,0xb5,0x98,0x30,0x20,0xaf }); - // [[-0.587 0.2812 0.9385 0.2588 -0.3193 -0.376 -0.4688 ] [-0.4697 -0. - Tensor *b = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xb2,0xb8,0x80,0x34,0x82,0x3b,0x24,0x34,0x1c,0xb5,0x4,0xb6,0x80,0xb7,0x84,0xb7,0xd8,0xb4,0x18,0xba,0xd2,0xba,0x74,0xb5,0x4,0xb8,0x2c,0x37,0x70,0xb0,0xb8,0x31,0xc8,0xbb,0xf0,0xb2,0x0,0xbb,0x0,0xbb,0x48,0x3b,0xa0,0x2a,0xa,0x3b,0x4c,0x38,0xa0,0x37,0x30,0xb4,0xde,0xb9,0x62,0xb9,0xf8,0xb8,0xd2,0x38,0x40,0x29,0xe8,0x36,0x84,0xb7,0x40,0xb0,0xfe,0xb9,0x4,0x35,0xce,0x3b,0x52,0x3b,0x8e,0xbb,0xba,0xb9,0xcc,0x3a,0x70,0x2f }); + // [[-0.2129 -0.7314 0.1699 0.541 0.3623 0.1797 -0.5664 ] [-0.52 + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xd0,0xb2,0xda,0xb9,0x70,0x31,0x54,0x38,0xcc,0x35,0xc0,0x31,0x88,0xb8,0x30,0xb8,0x9a,0x38,0xc4,0xb9,0x60,0x34,0xc,0xb6,0xf0,0xb3,0x6,0x38,0x44,0xbb,0x10,0x33,0xd6,0x3a,0x34,0x34,0xe0,0x3a,0xb0,0xb9,0x20,0x35,0xc0,0x24,0x84,0xb6,0x90,0x3a,0x68,0x36,0x7c,0xb8,0xc0,0x33,0x98,0xba,0x7a,0xb9,0x0,0xa5,0x7a,0x3a,0x1c,0x3a,0xc0,0x3a,0x10,0x34,0x86,0x3b,0x4c,0xb4,0xea,0xba,0x0,0x9c,0xa8,0xb6,0x34,0x37,0x38,0xb0,0xda,0x38 }); + // [[-0.1768 -0.3203 0.5156 0.1484 -0.0967 -0.8545 0.1074 ] [-0.9756 0. + Tensor *b = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa8,0xb1,0x20,0xb5,0x20,0x38,0xc0,0x30,0x30,0xae,0xd6,0xba,0xe0,0x2e,0xce,0xbb,0x40,0x37,0x40,0xac,0xf2,0x3a,0x50,0xb0,0x50,0x2f,0xf4,0x36,0xa6,0x38,0x7c,0xb5,0xd0,0x3a,0x0,0xae,0x20,0x33,0xec,0x37,0x14,0xb4,0xe,0xbb,0x18,0x35,0x56,0xb8,0xc2,0xb9,0xcc,0xb7,0x8e,0x3a,0x70,0x39,0xc0,0x30,0xc4,0xb6,0xae,0xbb,0xdc,0xb9,0x50,0x38,0xd8,0x36,0x62,0xba,0xa8,0xb3,0xb4,0xba,0x22,0xbb,0x74,0x35,0xf2,0x3b,0x8,0x30,0x84,0xb4 }); - // [[-0.2363 0.29 1.645 -0.4707 0.417 0.4092 -0.884 ] [-0.40 - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x90,0xb3,0xa4,0x34,0x94,0x3e,0x88,0xb7,0xac,0x36,0x8c,0x36,0x12,0xbb,0x6c,0xb6,0x10,0x31,0xb4,0xbc,0x74,0xb7,0x23,0xbc,0x82,0xba,0x62,0x3b,0xce,0xb8,0xe4,0xb9,0x36,0xbf,0xe8,0x35,0x2f,0xbc,0xd8,0xb2,0xf4,0x36,0xe0,0xaf,0x60,0xaa,0xbc,0x34,0x7c,0x3d,0x99,0xbc,0x3e,0xba,0x7c,0xbb,0x90,0x2c,0x67,0x3e,0xf2,0x3a,0xd2,0x3a,0x21,0xbc,0xd8,0xb4,0x9c,0xbe,0x19,0x3d,0x98,0x3c,0x38,0x39,0xaa,0xbc,0x4a,0xbc,0xf2,0x3b,0x0,0x1d }); + // [[-0.3896 -1.052 0.6855 0.6895 0.2656 -0.675 -0.459 ] [-1.499 1. + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x3c,0xb6,0x35,0xbc,0x7c,0x39,0x84,0x39,0x40,0x34,0x66,0xb9,0x58,0xb7,0xff,0xbd,0x1d,0x3c,0x4c,0xba,0x91,0x3c,0x1a,0xb8,0x48,0xb0,0x80,0x3b,0x3c,0xb5,0xd0,0xaf,0xd3,0x3e,0x68,0x31,0x54,0x3c,0xe8,0xb2,0x30,0x2c,0xe8,0xba,0xb0,0xad,0x74,0x34,0x1c,0xb5,0x31,0xbc,0x3f,0x3c,0xa0,0xb0,0x4a,0xb8,0x14,0xb7,0xd0,0xb0,0x0,0x28,0x88,0x3d,0x74,0x39,0x90,0x30,0x10,0xb8,0xcf,0xbe,0x2a,0xbb,0xd0,0xac,0xc6,0x3d,0x0,0x9e,0x30,0x35 }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 7 }, DTYPE_F16); cycles = read_cycles(); @@ -54,6 +129,9 @@ int main() { cycles = read_cycles() - cycles; printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles); + NN_printf(golden); + NN_printf(actual); + NN_delete_tensor(a); NN_delete_tensor(b); @@ -65,14 +143,14 @@ int main() { { printf("matmul_t: "); - // [[-0.2393 -0.58 0.0801 -0.911 0.875 -0.6113 -0.4033 ] [ 0.552 -0. - Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa8,0xb3,0xa4,0xb8,0x20,0x2d,0x4a,0xbb,0x0,0x3b,0xe4,0xb8,0x74,0xb6,0x6a,0x38,0x40,0xaf,0x1c,0xb6,0x54,0x3b,0x58,0xbb,0x6a,0xba,0x78,0xb9,0xd0,0xb3,0x80,0xab,0xe0,0x29,0x4,0xb6,0xe8,0x3b,0x18,0x30,0x80,0xaa,0xe0,0xb0,0xa0,0x31,0xd0,0x3b,0x9c,0x3a,0x78,0x30,0x28,0x39,0x18,0x33,0x60,0xac,0x48,0xb3,0xc4,0xb4,0x30,0xaf,0x16,0xb9,0xaa,0x3a,0x40,0xb7,0xc4,0x34,0x8a,0x38,0xae,0xba,0x46,0x39,0x0,0x37,0xf0,0x2d,0x28,0x32 }); - // [[ 0.592 -0.75 0.8604 0.674 0.2852 0.908 -0.2031 ] [ 0.841 -0. - Tensor *b = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0xbc,0x38,0x0,0xba,0xe2,0x3a,0x64,0x39,0x90,0x34,0x44,0x3b,0x80,0xb2,0xba,0x3a,0xfa,0xb8,0xc4,0x38,0x54,0x38,0x60,0xaa,0x20,0xb9,0xce,0x3b,0xb0,0xae,0x6e,0xba,0x2,0xb9,0x0,0x38,0x14,0x3b,0x1c,0xb9,0x54,0xb7,0x60,0xac,0xfc,0x35,0xd2,0xbb,0x76,0x39,0xfc,0xba,0xe,0xb9,0x3c,0xbb,0x7c,0x38,0xd0,0xad,0x88,0xb4,0xa,0xbb,0x86,0x3b,0xb8,0xb6,0x5c,0x3b }); + // [[ 0.865 -0.207 -0.789 0.372 -0.9766 -0.9346 -0.04492 ] [-0.01 + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xec,0x3a,0xa0,0xb2,0x50,0xba,0xf4,0x35,0xd0,0xbb,0x7a,0xbb,0xc0,0xa9,0x40,0xa4,0x6,0x3a,0x2c,0xba,0x98,0x39,0xcc,0x3b,0xba,0xb9,0xac,0x39,0x0,0x98,0xc,0xb5,0x8a,0x39,0xa4,0xb4,0x54,0xb7,0xc0,0xbb,0x80,0x21,0xd0,0x38,0x58,0xba,0x50,0x30,0x6c,0x39,0xd6,0x3b,0x94,0x36,0xfa,0x38,0x38,0x39,0x44,0xb6,0xa0,0x37,0xf0,0xb3,0x36,0x38,0xa6,0x3b,0xc8,0x32,0xa4,0xb5,0x58,0xb2,0x9a,0xb9,0x70,0x31,0xa0,0x30,0xb0,0xac,0x3a,0x38 }); + // [[ 0.5654 -0.4658 -0.2119 0.3838 -0.9287 -0.71 -0.10254] [ 0.878 0. + Tensor *b = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0x86,0x38,0x74,0xb7,0xc8,0xb2,0x24,0x36,0x6e,0xbb,0xae,0xb9,0x90,0xae,0x6,0x3b,0x5c,0x3a,0x98,0xb4,0x64,0x3b,0xa0,0xb0,0x6,0x3a,0x3e,0x3a,0xb8,0xb3,0x28,0x36,0x36,0x38,0xa6,0xb8,0x72,0xba,0x9c,0x36,0xa0,0x2b,0x56,0xb9,0xf8,0xb0,0x80,0x29,0xf0,0xb2,0x44,0x3b,0x80,0x2d,0xbe,0xbb,0xec,0xb5,0x30,0xba,0x4,0xb4,0xb8,0xbb,0x0,0x3a,0x2c,0x37,0xbc,0xb9 }); - // [[-0.4753 -0.331 1.335 -0.9136 1.406 ] [-0.1509 0.695 0.743 2. - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x9b,0xb7,0x4c,0xb5,0x57,0x3d,0x4f,0xbb,0xa0,0x3d,0xd4,0xb0,0x8f,0x39,0xf2,0x39,0xb0,0x41,0x23,0xbe,0x3,0x2e,0x34,0xb8,0x60,0x39,0xd1,0xbc,0x10,0x3c,0x5,0x3f,0xb4,0x38,0xb5,0xb9,0x2d,0xbc,0x29,0xbc,0x73,0x37,0x65,0xbc,0x89,0xb8,0x95,0x38,0xcc,0xbc,0xa8,0xb5,0x61,0xb0,0xd7,0x38,0xbf,0x3a,0x24,0x35 }); + // [[ 2.47 0.568 -0.522 -1.583 -1.44 ] [-0.398 1.325 -1.55 + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0xf1,0x40,0x8b,0x38,0x2d,0xb8,0x55,0xbe,0xc3,0xbd,0x5e,0xb6,0x4d,0x3d,0x3b,0xbe,0xe6,0xb0,0x9f,0xbc,0x0,0x3c,0x7f,0xbd,0x1b,0x36,0xde,0xb5,0xf4,0xb6,0x30,0xb5,0x8e,0x3c,0x6c,0xbd,0x2b,0xae,0xa1,0x31,0xad,0xba,0xa3,0x39,0x82,0x2c,0xd,0x2b,0xd2,0x3a,0xa1,0xa7,0x45,0x33,0x97,0xb8,0xff,0xb1,0xb8,0x9f }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); cycles = read_cycles(); @@ -91,14 +169,14 @@ int main() { { printf("matmul: "); - // [[-0.2783 0.1445 -0.796 -0.4512 0.4014 -0.2363 0.04004] [ 0.05664 -0. - Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x74,0xb4,0xa0,0x30,0x5e,0xba,0x38,0xb7,0x6c,0x36,0x90,0xb3,0x20,0x29,0x40,0x2b,0xfc,0xb4,0x80,0x28,0xb0,0xb3,0x54,0xbb,0x60,0x29,0xc0,0xb0,0xea,0xbb,0xf0,0xb3,0x9a,0xb9,0x0,0x2d,0xc0,0xbb,0xb0,0x31,0x0,0x30,0xd0,0xb2,0xda,0xb9,0x70,0x31,0x54,0x38,0xcc,0x35,0xc0,0x31,0x88,0xb8,0x30,0xb8,0x9a,0x38,0xc4,0xb9,0x60,0x34,0xc,0xb6,0xf0,0xb3,0x6,0x38,0x44,0xbb,0x10,0x33,0xd6,0x3a,0x34,0x34,0xe0,0x3a,0xb0,0xb9,0x20,0x35 }); - // [[ 0.01855 -0.4072 0.8203 0.4004 -0.5605 ] [ 0.2422 -0.824 -0.68 - Tensor *b = NN_tensor(2, (size_t[]){ 7, 5 }, DTYPE_F16, (uint8_t[]){ 0xc0,0x24,0x84,0xb6,0x90,0x3a,0x68,0x36,0x7c,0xb8,0xc0,0x33,0x98,0xba,0x7a,0xb9,0x0,0xa5,0x7a,0x3a,0x1c,0x3a,0xc0,0x3a,0x10,0x34,0x86,0x3b,0x4c,0xb4,0xea,0xba,0x0,0x9c,0xa8,0xb6,0x34,0x37,0x38,0xb0,0xda,0x38,0xa8,0xb1,0x20,0xb5,0x20,0x38,0xc0,0x30,0x30,0xae,0xd6,0xba,0xe0,0x2e,0xce,0xbb,0x40,0x37,0x40,0xac,0xf2,0x3a,0x50,0xb0,0x50,0x2f,0xf4,0x36 }); + // [[-0.8076 -0.127 0.927 0.248 0.0781 0.4355 -0.867 ] [-0.8154 -0. + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x76,0xba,0x10,0xb0,0x6a,0x3b,0xf0,0x33,0x0,0x2d,0xf8,0x36,0xf0,0xba,0x86,0xba,0xfe,0xba,0x5e,0xba,0xc0,0x35,0xba,0xb9,0xe0,0x35,0x6c,0x35,0xb0,0xba,0xe8,0x3a,0xe0,0xb2,0xa0,0xba,0x32,0x38,0x7a,0xb9,0xce,0x3a,0x8,0xb8,0xe8,0xb2,0x20,0x2f,0x3c,0x3a,0x40,0x27,0xe2,0x39,0x84,0x34,0xdc,0xba,0x44,0x34,0x46,0xba,0x58,0xb9,0x6,0xb8,0xf8,0xb7,0x76,0xbb,0x94,0x35,0x40,0xaf,0x50,0xb7,0x58,0xb0,0xc,0xb4,0x8,0x35,0xde,0x38 }); + // [[-0.911 0.333 -0.587 0.538 0.948 ] [ 0.6465 0.4043 0.6416 -0. + Tensor *b = NN_tensor(2, (size_t[]){ 7, 5 }, DTYPE_F16, (uint8_t[]){ 0x4a,0xbb,0x54,0x35,0xb2,0xb8,0x4e,0x38,0x96,0x3b,0x2c,0x39,0x78,0x36,0x22,0x39,0xe8,0xb4,0x20,0xab,0xdc,0xb9,0x20,0xb1,0xaa,0xbb,0x30,0x2d,0x9e,0xb8,0xb8,0xb2,0x46,0xbb,0x70,0x31,0x54,0x36,0x6e,0xba,0xb2,0x3a,0x1c,0xb8,0x40,0xb8,0xb6,0x3b,0x6e,0xb9,0x38,0x38,0xd6,0x3b,0x5e,0xba,0x5c,0xb6,0x74,0x3b,0x0,0x3b,0xac,0x3b,0x7e,0x38,0x0,0x24,0x2c,0xbb }); - // [[ 0.07556 -0.51 -0.501 -0.6235 0.516 ] [-0.3896 0.2615 0.6865 -0. - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0xd6,0x2c,0x14,0xb8,0x2,0xb8,0xfd,0xb8,0x21,0x38,0x3c,0xb6,0x2f,0x34,0x7e,0x39,0x9d,0xb8,0x17,0xb7,0x2d,0xbd,0x9c,0x30,0x51,0xb8,0xb1,0xbe,0x2e,0x38,0x77,0xb4,0xc7,0x2f,0xeb,0x2f,0x78,0x34,0x9b,0xb9,0x2a,0xbb,0xf3,0xb0,0x5f,0xbc,0x5e,0xb9,0xbc,0x3b,0x1f,0x3c,0x91,0x3e,0xbc,0xbc,0xe9,0x3e,0xe4,0x35 }); + // [[-0.541 -1.14 -1.328 -0.3337 -0.3628 ] [ 0.5767 0.2281 1.02 + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x54,0xb8,0x8f,0xbc,0x50,0xbd,0x57,0xb5,0xce,0xb5,0x9d,0x38,0x4d,0x33,0x14,0x3c,0x64,0xbb,0xa9,0xa7,0xf2,0x40,0xdf,0x39,0x6a,0x3f,0x4c,0xb4,0x3b,0xbf,0xdd,0x39,0x28,0x9d,0x24,0xb4,0xbf,0xb0,0xf3,0xb9,0x65,0x31,0x8d,0xb8,0xcf,0x3d,0xb0,0xbc,0x0,0x3b,0x57,0x37,0x27,0x3d,0xd0,0x35,0x49,0xb3,0x8,0x39 }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); cycles = read_cycles(); @@ -117,16 +195,16 @@ int main() { { printf("linear: "); - // [[ 0.581 -0.3428 0.8516 -0.09375 0.2227 0.495 -0.255 ] [-0.882 0. - Tensor *x = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa6,0x38,0x7c,0xb5,0xd0,0x3a,0x0,0xae,0x20,0x33,0xec,0x37,0x14,0xb4,0xe,0xbb,0x18,0x35,0x56,0xb8,0xc2,0xb9,0xcc,0xb7,0x8e,0x3a,0x70,0x39,0xc0,0x30,0xc4,0xb6,0xae,0xbb,0xdc,0xb9,0x50,0x38,0xd8,0x36,0x62,0xba,0xa8,0xb3,0xb4,0xba,0x22,0xbb,0x74,0x35,0xf2,0x3b,0x8,0x30,0x84,0xb4,0xec,0x3a,0xa0,0xb2,0x50,0xba,0xf4,0x35,0xd0,0xbb,0x7a,0xbb,0xc0,0xa9,0x40,0xa4,0x6,0x3a,0x2c,0xba,0x98,0x39,0xcc,0x3b,0xba,0xb9,0xac,0x39 }); - // [[-0.001953 -0.3154 0.6924 -0.29 -0.458 -0.9688 0.01074 ] [ 0.60 - Tensor *w = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0x0,0x98,0xc,0xb5,0x8a,0x39,0xa4,0xb4,0x54,0xb7,0xc0,0xbb,0x80,0x21,0xd0,0x38,0x58,0xba,0x50,0x30,0x6c,0x39,0xd6,0x3b,0x94,0x36,0xfa,0x38,0x38,0x39,0x44,0xb6,0xa0,0x37,0xf0,0xb3,0x36,0x38,0xa6,0x3b,0xc8,0x32,0xa4,0xb5,0x58,0xb2,0x9a,0xb9,0x70,0x31,0xa0,0x30,0xb0,0xac,0x3a,0x38,0x86,0x38,0x74,0xb7,0xc8,0xb2,0x24,0x36,0x6e,0xbb,0xae,0xb9,0x90,0xae }); - // [[ 0.878 0.795 -0.287 0.924 -0.1445]] - Tensor *b = NN_tensor(2, (size_t[]){ 1, 5 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3b,0x5c,0x3a,0x98,0xb4,0x64,0x3b,0xa0,0xb0 }); + // [[-0.84 -0.4082 0.5234 -0.3867 0.1865 0.1904 0.2998 ] [-0.86 + Tensor *x = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xb8,0xba,0x88,0xb6,0x30,0x38,0x30,0xb6,0xf8,0x31,0x18,0x32,0xcc,0x34,0xf0,0xba,0x80,0xac,0x28,0xb1,0xd0,0x2d,0xf6,0x3a,0x48,0xb4,0x70,0x2d,0xc0,0xba,0xda,0xba,0xe0,0xb6,0x72,0xb8,0x70,0x2d,0x80,0x22,0xe8,0x34,0xa2,0x38,0x80,0x2f,0xe8,0x3a,0x90,0xac,0x0,0xb2,0xd4,0xb4,0x90,0xb4,0xc0,0xad,0xce,0xb9,0x0,0xb3,0xa8,0x36,0x1c,0x3a,0x54,0xb8,0x70,0x34,0x50,0x34,0xa4,0x3b,0x10,0x35,0x2c,0x38,0x18,0x33,0x30,0xb0,0xb4,0xb6 }); + // [[-0.3682 0.9453 -0.4404 -0.3623 0.505 -0.75 0.1328 ] [ 0.20 + Tensor *w = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0xe4,0xb5,0x90,0x3b,0xc,0xb7,0xcc,0xb5,0xa,0x38,0x0,0xba,0x40,0x30,0x78,0x32,0x62,0xb8,0x6a,0x3b,0x58,0x30,0x0,0x1c,0xb2,0xb8,0x48,0xb9,0x30,0xad,0x68,0x34,0xd8,0xb6,0x68,0x3a,0x80,0xa4,0xc4,0x36,0xd8,0x32,0x68,0xb2,0x74,0xb4,0xb8,0xb0,0x70,0xb3,0xd4,0x35,0x88,0x34,0x66,0x38,0x3c,0x39,0x96,0xb8,0x74,0x3a,0x56,0x3b,0x6e,0x3a,0xf0,0x37,0x0,0x9c }); + // [[-0.2324 -0.08105 -0.165 -0.5615 0.3184 ]] + Tensor *b = NN_tensor(2, (size_t[]){ 1, 5 }, DTYPE_F16, (uint8_t[]){ 0x70,0xb3,0x30,0xad,0x48,0xb1,0x7e,0xb8,0x18,0x35 }); - // [[ 1.018 1.73 1.191 0.036 -0.405 ] [ 0.04947 -0.2664 -0.396 1. - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x12,0x3c,0xec,0x3e,0xc4,0x3c,0x9c,0x28,0x7b,0xb6,0x55,0x2a,0x43,0xb4,0x56,0xb6,0xa1,0x3e,0x9b,0xbc,0x1b,0xaf,0x69,0x3a,0x24,0x33,0x83,0x3c,0x4f,0xb9,0xe9,0xb0,0x8d,0x40,0x43,0xa9,0x5e,0x3f,0x6c,0xb8,0x8f,0x3e,0x1b,0x34,0x1c,0xbe,0xb7,0x3c,0xa7,0x40,0x10,0x31,0xa3,0x3e,0x9f,0xbc,0x4,0x40,0x57,0xb8 }); + // [[-0.4084 0.09644 -0.6016 0.01962 0.3137 ] [ 0.7095 -0.2505 -0.08276 -0. + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x89,0xb6,0x2c,0x2e,0xd0,0xb8,0x6,0x25,0x5,0x35,0xad,0x39,0x2,0xb4,0x4c,0xad,0xfb,0xac,0xf9,0x34,0x45,0xb4,0x73,0xb7,0x32,0xb8,0xba,0x33,0x34,0xb8,0xc3,0xb8,0x82,0x3c,0x55,0xba,0x86,0xbc,0xb2,0x3b,0x22,0xaf,0xa2,0x34,0x48,0xaf,0x1d,0xb0,0xe6,0x3c,0x5b,0x36,0x64,0x31,0xb5,0x32,0xf1,0xbc,0x5d,0x3a }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); cycles = read_cycles(); @@ -146,12 +224,12 @@ int main() { { printf("relu: "); - // [[ 0.753 0.7803 -0.2412 0.3848 0.5264 -0.581 -0.8057 ] [ 0.413 0. - Tensor *x = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3a,0x3e,0x3a,0xb8,0xb3,0x28,0x36,0x36,0x38,0xa6,0xb8,0x72,0xba,0x9c,0x36,0xa0,0x2b,0x56,0xb9,0xf8,0xb0,0x80,0x29,0xf0,0xb2,0x44,0x3b,0x80,0x2d,0xbe,0xbb,0xec,0xb5,0x30,0xba,0x4,0xb4,0xb8,0xbb,0x0,0x3a,0x2c,0x37,0xbc,0xb9,0x76,0xba,0x10,0xb0,0x6a,0x3b,0xf0,0x33,0x0,0x2d,0xf8,0x36,0xf0,0xba,0x86,0xba,0xfe,0xba,0x5e,0xba,0xc0,0x35,0xba,0xb9,0xe0,0x35,0x6c,0x35,0xb0,0xba,0xe8,0x3a,0xe0,0xb2,0xa0,0xba,0x32,0x38,0x7a,0xb9,0xce,0x3a,0x8,0xb8,0xe8,0xb2,0x20,0x2f,0x3c,0x3a,0x40,0x27 }); + // [[ 0.4785 0.05176 -0.2031 0.0918 -0.9424 -0.672 0.6504 ] [ 0.62 + Tensor *x = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0xa8,0x37,0xa0,0x2a,0x80,0xb2,0xe0,0x2d,0x8a,0xbb,0x60,0xb9,0x34,0x39,0xf8,0x38,0xba,0xba,0x74,0x38,0xf0,0xac,0xa0,0x2e,0x6e,0xba,0xb0,0x34,0x1a,0x3a,0x38,0xb0,0x8c,0x3a,0x68,0xb7,0x4e,0x39,0x10,0x39,0x80,0x22,0xf0,0x3b,0x90,0xaf,0xa0,0x39,0x0,0xa0,0xa8,0xb7,0x10,0xb8,0x9e,0x3a,0xb2,0x3b,0x10,0x3a,0xc6,0x39,0xc,0x3a,0x88,0x32,0x1c,0x37,0x58,0xb8,0x64,0x37,0x84,0x3b,0x78,0x38,0xe0,0xb8,0x0,0x2b,0x70,0xb0,0xda,0x39,0xa4,0xb5,0xd0,0xb9,0x0,0x1c,0x70,0xb6,0x0,0xa5,0x9e,0xb9,0x52,0xb9 }); - // [[0.753 0.7803 0. 0.3848 0.5264 0. 0. ] [0.413 0.05957 0. - Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3a,0x3e,0x3a,0x0,0x0,0x28,0x36,0x36,0x38,0x0,0x0,0x0,0x0,0x9c,0x36,0xa0,0x2b,0x0,0x0,0x0,0x0,0x80,0x29,0x0,0x0,0x44,0x3b,0x80,0x2d,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3a,0x2c,0x37,0x0,0x0,0x0,0x0,0x0,0x0,0x6a,0x3b,0xf0,0x33,0x0,0x2d,0xf8,0x36,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xc0,0x35,0x0,0x0,0xe0,0x35,0x6c,0x35,0x0,0x0,0xe8,0x3a,0x0,0x0,0x0,0x0,0x32,0x38,0x0,0x0,0xce,0x3a,0x0,0x0,0x0,0x0,0x20,0x2f,0x3c,0x3a,0x40,0x27 }); + // [[0.4785 0.05176 0. 0.0918 0. 0. 0.6504 ] [0.621 0. + Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0xa8,0x37,0xa0,0x2a,0x0,0x0,0xe0,0x2d,0x0,0x0,0x0,0x0,0x34,0x39,0xf8,0x38,0x0,0x0,0x74,0x38,0x0,0x0,0xa0,0x2e,0x0,0x0,0xb0,0x34,0x1a,0x3a,0x0,0x0,0x8c,0x3a,0x0,0x0,0x4e,0x39,0x10,0x39,0x80,0x22,0xf0,0x3b,0x0,0x0,0xa0,0x39,0x0,0x0,0x0,0x0,0x0,0x0,0x9e,0x3a,0xb2,0x3b,0x10,0x3a,0xc6,0x39,0xc,0x3a,0x88,0x32,0x1c,0x37,0x0,0x0,0x64,0x37,0x84,0x3b,0x78,0x38,0x0,0x0,0x0,0x2b,0x0,0x0,0xda,0x39,0x0,0x0,0x0,0x0,0x0,0x1c,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0 }); Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F16); cycles = read_cycles(); diff --git a/tests/src/unittest.h b/tests/src/unittest.h index fbcc469..d762154 100644 --- a/tests/src/unittest.h +++ b/tests/src/unittest.h @@ -20,7 +20,7 @@ static void enable_accelerator_features() { // enable vector operation unsigned long mstatus; asm volatile("csrr %0, mstatus" : "=r"(mstatus)); - mstatus |= 0x00000600 | 0x00006000 | 0x00018000; + mstatus |= 0x00000400 | 0x00004000 | 0x00010000; asm volatile("csrw mstatus, %0"::"r"(mstatus)); #endif }