diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt
index 5a7bab7..d522071 100644
--- a/nn/CMakeLists.txt
+++ b/nn/CMakeLists.txt
@@ -7,12 +7,22 @@ set(INCLUDES
 set(SOURCES
   src/nn_tensor.c
   src/nn_print.c
+  src/add/nn_add.c
+  src/add/nn_add_rvv.c
+
+  src/copy/nn_copy.c
-  src/matmul/nn_matmul.c
-  src/add/nn_add_rvv.c
+  src/matmul/nn_matmul.c
+  src/matmul/nn_matmul_eaglex.c
   src/matmul/nn_matmul_rvv.c
+
+  src/max/nn_max.c
+  src/max/nn_max_rvv.c
+
+  src/min/nn_min.c
+  src/min/nn_min_rvv.c
+
 )
 
 add_library(nn ${SOURCES})
diff --git a/nn/inc/nn.h b/nn/inc/nn.h
index 1a8a0b3..f63a6c3 100644
--- a/nn/inc/nn.h
+++ b/nn/inc/nn.h
@@ -8,6 +8,8 @@
 #include "nn_add.h"
 #include "nn_copy.h"
 #include "nn_matmul.h"
+#include "nn_max.h"
+#include "nn_min.h"
 
 
 // http://elm-chan.org/junk/32bit/binclude.html
diff --git a/nn/inc/nn_linear.h b/nn/inc/nn_linear.h
index 4521b6b..552c463 100644
--- a/nn/inc/nn_linear.h
+++ b/nn/inc/nn_linear.h
@@ -14,8 +14,4 @@
  */
 void NN_linear_F32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
 
-void NN_linear_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
-
-void NN_linear_I8_I8_I8_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
-
 #endif // __NN_LINEAR_H
diff --git a/nn/inc/nn_matmul.h b/nn/inc/nn_matmul.h
index 4ae4279..0511fce 100644
--- a/nn/inc/nn_matmul.h
+++ b/nn/inc/nn_matmul.h
@@ -26,5 +26,7 @@
 void NN_matmul_I32(Tensor *out, Tensor *a, Tensor *b);
 
 void NN_matmul_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
 
+void NN_matmul_I8_I8_I32_EAGLEX(Tensor *out, Tensor *a, Tensor *b);
+
 #endif // __NN_MATMUL_H
diff --git a/nn/inc/nn_max.h b/nn/inc/nn_max.h
new file mode 100644
index 0000000..17946d6
--- /dev/null
+++ b/nn/inc/nn_max.h
@@ -0,0 +1,27 @@
+#ifndef __NN_MAX_H
+#define __NN_MAX_H
+
+#include <assert.h>
+#include <float.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns the maximum value of all elements in the input tensor.
+ *
+ * @param t: input tensor
+ */
+float NN_max(Tensor *t);
+
+float NN_max_F32(Tensor *t);
+
+float NN_max_F32_RVV(Tensor *t);
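+
+// Usage sketch (illustrative only; assumes NN_max dispatches on t->dtype):
+//   Tensor *t = NN_rand(2, (size_t[]){8, 8}, DTYPE_F32);
+//   float m = NN_max(t);             // generic entry point
+//   float m_rvv = NN_max_F32_RVV(t); // explicit RVV path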
+
+
+#endif // __NN_MAX_H
diff --git a/nn/inc/nn_min.h b/nn/inc/nn_min.h
new file mode 100644
index 0000000..d668275
--- /dev/null
+++ b/nn/inc/nn_min.h
@@ -0,0 +1,22 @@
+#ifndef __NN_MIN_H
+#define __NN_MIN_H
+
+#include <assert.h>
+#include <float.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns the minimum value of all elements in the input tensor.
+ *
+ * @param t: input tensor
+ */
+float NN_min(Tensor *t);
+
+float NN_min_F32(Tensor *t);
+
+float NN_min_F32_RVV(Tensor *t);
+
+
+#endif // __NN_MIN_H
diff --git a/nn/inc/nn_tensor.h b/nn/inc/nn_tensor.h
index 200173c..5b8027a 100644
--- a/nn/inc/nn_tensor.h
+++ b/nn/inc/nn_tensor.h
@@ -77,35 +77,80 @@ static inline const char *NN_getDataTypeName(DataType dtype) {
   }
 }
 
+/**
+ * Frees the memory allocated for the tensor data
+ */
 static inline void NN_freeTensorData(Tensor *t) {
   free(t->data);
 }
 
+/**
+ * Frees the memory allocated for the tensor
+ */
 static inline void NN_deleteTensor(Tensor *t) {
   free(t);
 }
 
-
 /**
- * Initialize a tensor
+ * Initialize a given tensor
  *
  * @param ndim: number of dimensions
  * @param shape: shape of tensor
- * @param dtype: DataType
+ * @param dtype: data type
  * @param data: pointer to data, if NULL, the data will be allocated
  */
 void NN_initTensor(Tensor *t, size_t ndim, size_t *shape, DataType dtype, void *data);
 
+/**
+ * Create a new tensor
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @param data: pointer to data, if NULL, the data will be allocated
+ * @return Tensor
+ */
 Tensor *NN_tensor(size_t ndim, size_t *shape, DataType dtype, void *data);
 
+/**
+ * Returns a tensor filled with the scalar value 0.
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_zeros(size_t ndim, size_t *shape, DataType dtype);
 
+/**
+ * Returns a tensor filled with the scalar value 1.
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_ones(size_t ndim, size_t *shape, DataType dtype);
 
+/**
+ * Returns a tensor filled with random numbers from a uniform distribution.
+ *
+ * The range of the random number is dependent on the data type:
+ * - For Float32, the range is [0, 1]
+ * - For Int8, the range is [0, 255]
+ * - For Int32, the range is [0, RAND_MAX]
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_rand(size_t ndim, size_t *shape, DataType dtype);
 
 /**
- * Convert tensor data type
+ * Returns this tensor cast to the given data type.
+ *
+ * This is a no-op if the tensor is already of the correct type.
  *
  * @param t: input tensor
  * @param dtype: target data type
diff --git a/nn/inc/rv.h b/nn/inc/rv.h
new file mode 100644
index 0000000..8169899
--- /dev/null
+++ b/nn/inc/rv.h
@@ -0,0 +1,100 @@
+/**
+ * @file rv.h
+ * @brief RISC-V Definitions
+ *
+ * This header file provides common definitions and operations for RISC-V core programming.
+ * It includes memory register attributes, bit operation definitions, RISC-V specific definitions,
+ * and common enumerations for state and status values.
+ *
+ * The memory register attributes define volatile permissions for read-only, write-only, and read/write access.
+ * The bit operation definitions provide macros for setting, clearing, reading, and writing specific bits in a register.
+ * The RISC-V specific definitions include macros for reading and writing control and status registers (CSRs),
+ * as well as operations to swap, set, and clear specific bits in a CSR.
+ * The common definitions include enumerations for state values (such as RESET and SET) and status values (such as OK and ERROR).
+ *
+ * @note This file should be included to access RISC-V core-specific definitions and perform common operations.
+ *
+ * @author -T.K.-
+ * @date 2023-05-20
+ */
+
+#ifndef __RV_H
+#define __RV_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+
+/* ================ Memory register attributes ================ */
+#ifdef __cplusplus
+  #define __I   volatile        /** Defines "read only" permissions */
+#else
+  #define __I   volatile const  /** Defines "read only" permissions */
+#endif
+#define __O     volatile        /** Defines "write only" permissions */
+#define __IO    volatile        /** Defines "read / write" permissions */
+
+/* following defines should be used for structure members */
+#define __IM    volatile const  /** Defines "read only" structure member permissions */
+#define __OM    volatile        /** Defines "write only" structure member permissions */
+#define __IOM   volatile        /** Defines "read / write" structure member permissions */
+
+
+/* ================ Bit Operation definitions ================ */
+#define SET_BITS(REG, BIT)                  ((REG) |= (BIT))
+#define CLEAR_BITS(REG, BIT)                ((REG) &= ~(BIT))
+#define READ_BITS(REG, BIT)                 ((REG) & (BIT))
+#define WRITE_BITS(REG, CLEARMASK, SETMASK) ((REG) = (((REG) & (~(CLEARMASK))) | (SETMASK)))
+
+
+/* ================ RISC-V specific definitions ================ */
+#define READ_CSR(REG) ({                          \
+  unsigned long __tmp;                            \
+  asm volatile ("csrr %0, " REG : "=r"(__tmp));   \
+  __tmp; })
+
+#define WRITE_CSR(REG, VAL) ({                    \
+  asm volatile ("csrw " REG ", %0" :: "rK"(VAL)); })
+
+#define SWAP_CSR(REG, VAL) ({                     \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrw %0, " REG ", %1" : "=r"(__tmp) : "rK"(VAL)); \
+  __tmp; })
+
+#define SET_CSR_BITS(REG, BIT) ({                 \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrs %0, " REG ", %1" : "=r"(__tmp) : "rK"(BIT)); \
+  __tmp; })
+
+#define CLEAR_CSR_BITS(REG, BIT) ({               \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrc %0, " REG ", %1" : "=r"(__tmp) : "rK"(BIT)); \
+  __tmp; })
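+
+/* Usage sketch (assumes the named CSR, e.g. mcycle, is readable at the
+ * current privilege level):
+ *   unsigned long start   = READ_CSR("mcycle");
+ *   unsigned long elapsed = READ_CSR("mcycle") - start;
+ */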
+
+
+/* ================ Common definitions ================ */
+typedef enum {
+  RESET = 0UL,
+  SET = !RESET,
+
+  DISABLE = RESET,
+  ENABLE = SET,
+
+  LOW = RESET,
+  HIGH = SET,
+} State;
+
+typedef enum {
+  OK = 0U,
+  ERROR,
+  BUSY,
+  TIMEOUT
+} Status;
+
+#endif /* __RV_H */
diff --git a/nn/src/add/nn_add_rvv.c b/nn/src/add/nn_add_rvv.c
index 49d4d97..d82b1e2 100644
--- a/nn/src/add/nn_add_rvv.c
+++ b/nn/src/add/nn_add_rvv.c
@@ -15,14 +15,18 @@ void NN_add_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
   float *a_data = (float *)a->data;
   float *b_data = (float *)b->data;
 
-  int k = out->shape[0] * out->shape[1];
-  int l = 0;
-  for (size_t vl; k > 0; k -= vl, l += vl) {
+  // TODO: add broadcasting support
+
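+  // Strip-mining: vsetvl returns vl <= k on each pass, so tails shorter than
+  // a full vector are handled by the same loop without a scalar cleanup.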
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = out->shape[0] * out->shape[1]; k > 0; k -= vl, i += vl) {
     vl = __riscv_vsetvl_e32m1(k);
-    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_data + l, vl);
-    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_data + l, vl);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_data + i, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_data + i, vl);
     vfloat32m1_t vec_c = __riscv_vfadd_vv_f32m1(vec_a, vec_b, vl);
-    __riscv_vse32_v_f32m1(out_data + l, vec_c, vl);
+    __riscv_vse32_v_f32m1(out_data + i, vec_c, vl);
   }
 }
diff --git a/nn/src/linear/nn_linear.c b/nn/src/linear/nn_linear.c
index 45879c2..e503c92 100644
--- a/nn/src/linear/nn_linear.c
+++ b/nn/src/linear/nn_linear.c
@@ -14,27 +14,3 @@ void NN_linear_F32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
   NN_matmul_F32(y, x, w);
   NN_add_F32(y, y, b);
 }
-
-void NN_linear_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
-  assert(x->shape[1] == w->shape[0]);
-  assert(y->shape[0] == x->shape[0]);
-  assert(y->shape[1] == w->shape[1]);
-  assert(b->shape[0] == w->shape[1]);
-  assert(x->dtype == DTYPE_I32);
-  assert(w->dtype == DTYPE_I32);
-  assert(b->dtype == DTYPE_I32);
-  assert(y->dtype == DTYPE_I32);
-
-  NN_matmul_I32(y, x, w);
-  NN_add_I32(y, y, b);
-}
-
-void NN_linear_I8_I8_I8_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
-  assert(x->dtype == DTYPE_I8);
-  assert(w->dtype == DTYPE_I8);
-  assert(b->dtype == DTYPE_I8);
-  assert(y->dtype == DTYPE_I32);
-
-  NN_matmul_I8_I8_I32(y, x, w);
-  NN_add_I32_I8_I32(y, y, b);
-}
diff --git a/nn/src/matmul/nn_matmul_eaglex.c b/nn/src/matmul/nn_matmul_eaglex.c
new file mode 100644
index 0000000..4711e83
--- /dev/null
+++ b/nn/src/matmul/nn_matmul_eaglex.c
@@ -0,0 +1,7 @@
+
+#include "nn_matmul.h"
+
+void NN_matmul_I8_I8_I32_EAGLEX(Tensor *out, Tensor *a, Tensor *b) {
+  // TODO: port to here
+}
+
diff --git a/nn/src/matmul/nn_matmul_rvv.c b/nn/src/matmul/nn_matmul_rvv.c
index d37ab61..4cb4f10 100644
--- a/nn/src/matmul/nn_matmul_rvv.c
+++ b/nn/src/matmul/nn_matmul_rvv.c
@@ -19,6 +19,7 @@ void NN_matmul_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
       float *ptr_a = (float *)a->data + i * a->shape[1];
       float *ptr_b = (float *)b->data + j;
       vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(0, vlmax);
+      size_t vl = 0;
 
       for (int k = a->shape[1]; k > 0; k -= vl, ptr_a += vl, ptr_b += vl) {
         vl = __riscv_vsetvl_e32m1(k);
diff --git a/nn/src/max/nn_max.c b/nn/src/max/nn_max.c
new file mode 100644
index 0000000..03cf1aa
--- /dev/null
+++ b/nn/src/max/nn_max.c
@@ -0,0 +1,20 @@
+
+#include "nn_max.h"
+
+
+float NN_max_F32(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float max = -FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  for (size_t i = 0; i < t->shape[0]; i += 1) {
+    for (size_t j = 0; j < t->shape[1]; j += 1) {
+      if (t_data[i * t->shape[1] + j] > max) {
+        max = t_data[i * t->shape[1] + j];
+      }
+    }
+  }
+
+  return max;
+}
diff --git a/nn/src/max/nn_max_rvv.c b/nn/src/max/nn_max_rvv.c
new file mode 100644
index 0000000..accb948
--- /dev/null
+++ b/nn/src/max/nn_max_rvv.c
@@ -0,0 +1,24 @@
+
+#include "nn_max.h"
+#include "riscv_vector.h"
+
+float NN_max_F32_RVV(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float max = -FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(max, 1);
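+  // vfredmax folds each loaded chunk into element 0 of the accumulator;
+  // seeding it with -FLT_MAX keeps the first reduction step correct.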
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = t->shape[0] * t->shape[1]; k > 0; k -= vl, i += vl) {
+    vl = __riscv_vsetvl_e32m1(k);
+    vfloat32m1_t vec_t = __riscv_vle32_v_f32m1(t_data + i, vl);
+    vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_t, vec_max, vl);
+  }
+  max = __riscv_vfmv_f_s_f32m1_f32(vec_max);
+  return max;
+}
+
diff --git a/nn/src/min/nn_min.c b/nn/src/min/nn_min.c
new file mode 100644
index 0000000..d61e727
--- /dev/null
+++ b/nn/src/min/nn_min.c
@@ -0,0 +1,20 @@
+
+#include "nn_min.h"
+
+
+float NN_min_F32(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float min = FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  for (size_t i = 0; i < t->shape[0]; i += 1) {
+    for (size_t j = 0; j < t->shape[1]; j += 1) {
+      if (t_data[i * t->shape[1] + j] < min) {
+        min = t_data[i * t->shape[1] + j];
+      }
+    }
+  }
+
+  return min;
+}
diff --git a/nn/src/min/nn_min_rvv.c b/nn/src/min/nn_min_rvv.c
new file mode 100644
index 0000000..4fe80ea
--- /dev/null
+++ b/nn/src/min/nn_min_rvv.c
@@ -0,0 +1,22 @@
+
+#include "nn_min.h"
+#include "riscv_vector.h"
+
+float NN_min_F32_RVV(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float min = FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(min, 1);
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = t->shape[0] * t->shape[1]; k > 0; k -= vl, i += vl) {
+    vl = __riscv_vsetvl_e32m1(k);
+    vfloat32m1_t vec_t = __riscv_vle32_v_f32m1(t_data + i, vl);
+    vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_t, vec_min, vl);
+  }
+  min = __riscv_vfmv_f_s_f32m1_f32(vec_min);
+  return min;
+}
+
diff --git a/nn/src/nn_tensor.c b/nn/src/nn_tensor.c
index 0ec6eac..ef9227f 100644
--- a/nn/src/nn_tensor.c
+++ b/nn/src/nn_tensor.c
@@ -98,7 +98,7 @@ Tensor *NN_rand(size_t ndim, size_t *shape, DataType dtype) {
   switch (dtype) {
     case DTYPE_I8:
       for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = rand() % 256 - 128;
+        ((int8_t *)t->data)[i] = rand() % 256;
       }
       break;
     case DTYPE_I32:
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 32cbe7b..edf38d1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,3 +1,7 @@
+
+# the CMake files are still very hacky
+# TODO: organize the build system properly
+
 cmake_minimum_required(VERSION 3.15)
 
 set(PROJECT_NAME "example")
@@ -23,8 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_BUILD_TYPE Debug)
 set(BUILD_SHARED_LIBS OFF)
 set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2")
 
 set(STATIC_LINKING TRUE)
 
@@ -49,11 +53,15 @@ set(CMAKE_SIZE "${TOOLCHAIN_PREFIX}-size")
 set(CMAKE_STRIP "${TOOLCHAIN_PREFIX}-ld")
 
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcmodel=medany -march=rv64gcv_zfh -mabi=lp64d -fno-common -fno-builtin-printf")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Og -mcmodel=medany -march=rv64gcv_zfh -mabi=lp64d -fno-common -fno-builtin-printf")
 set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS})
 set(CMAKE_EXE_LINKER_FLAGS "-static -lm -lstdc++ -Wl,-Map=output.map -L${LIBGLOSS_DIR} -specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE} -T ${CMAKE_SOURCE_DIR}/htif.ld")
 
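+# -u forces _printf_float to be linked in, so printf can format floats
+# (newlib builds its float formatting support as a separately linked object).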
"pass" : "fail", cycles); // NN_printf(golden); // NN_printf(actual); @@ -71,18 +72,62 @@ int main() { NN_deleteTensor(actual); } + // matvec + { + + + } + + // max and min + { + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + + printf("max:\t\t"); + float max_cpu = NN_max_F32(A); + cycles = READ_CSR("mcycle"); + float max_actual = NN_max_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(max_cpu, max_actual, 1e-6) ? "pass" : "fail", cycles); + + printf("min:\t\t"); + float min_cpu = NN_min_F32(A); + cycles = READ_CSR("mcycle"); + float min_actual = NN_min_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(min_cpu, min_actual, 1e-6) ? "pass" : "fail", cycles); + + NN_printf(A); + printf("max:"); + NN_printFloat(max_cpu, 6); + printf("\n"); + + NN_freeTensorData(A); + NN_deleteTensor(A); + } + + // matmulf + { + + } + + // matsub + { + + } + + // matadd { Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); - printf("matadd: "); + printf("matadd:\t\t"); NN_add_F32(golden, A, B); // start = read_cycles(); NN_add_F32_RVV(actual, A, B); // total = read_cycles() - start; - printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "pass" : "fail", total); + printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "pass" : "fail", cycles); // NN_printf(A); // NN_printf(B); @@ -100,6 +145,56 @@ int main() { NN_deleteTensor(actual); } + // matneg + { + + } + + // matcopy + { + + } + + // cwiseabs + { + + } + + // cwisemin + { + + } + + // cwisemax + { + + } + + // cwisemul + { + + } + + // matset + { + + } + + // matsetv + { + + } + + // matnorm + { + + } + + // transpose + { + + } + return 0; } \ No newline at end of file