diff --git a/.gitignore b/.gitignore index 42b9309..9011ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,7 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST + + +example/llama2/checkpoints/stories15M.bin + diff --git a/CMakeLists.txt b/CMakeLists.txt index 13d5e3f..087a301 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ set(WRAP_SPECS_FILE "htif_wrap.specs") set(SPECS_FILE "htif_nano.specs") set(LIBGLOSS_DIR "$ENV{RISCV}/riscv64-unknown-elf/lib/") -set(MARCH "rv64gcv_zfh") +set(MARCH "rv64gcv_zfh_zvfh") set(MABI "lp64d") set(MCMODEL "medany") diff --git a/barstools/src/barstools/converter.py b/barstools/src/barstools/converter.py index 42b8027..31a2edf 100644 --- a/barstools/src/barstools/converter.py +++ b/barstools/src/barstools/converter.py @@ -13,15 +13,15 @@ class TorchConverter(torch.fx.Interpreter): @staticmethod - def toNumpy(tensor: torch.Tensor): + def to_numpy(tensor: torch.Tensor): return tensor.cpu().detach().contiguous().numpy() @staticmethod - def toBytes(ndarray: np.ndarray): + def to_bytes(ndarray: np.ndarray): return ndarray.astype(np.float32).flatten().tobytes() @staticmethod - def dtypeToStr(dtype: torch.dtype): + def dtype_to_str(dtype: torch.dtype): if dtype == torch.float16: return "DTYPE_F16" elif dtype == torch.float32: @@ -67,12 +67,12 @@ def __init__(self, model): def print(self): self.gm.graph.print_tabular() - def getModuleInSequential(self, module, indicies): + def get_module_in_sequential(self, module, indicies): if len(indicies) == 0: return module - return self.getModuleInSequential(module[indicies[0]], indicies[1:]) + return self.get_module_in_sequential(module[indicies[0]], indicies[1:]) - def getModule(self, module_name): + def get_module(self, module_name): if "." in module_name: # if we have nn.Sequential layers target_hierarchy = module_name.split(".") @@ -82,36 +82,36 @@ def getModule(self, module_name): indicies = [int(x) for x in target_hierarchy[1:]] module = getattr(self.model, sequential_name) - return self.getModuleInSequential(module, indicies) + return self.get_module_in_sequential(module, indicies) return getattr(self.model, module_name) - def addDataTensor(self, name, tensor): + def add_data_tensor(self, name, tensor): self.model_struct += INDENT + "Tensor {name};\n".format( name=name ) - data = TorchConverter.toNumpy(tensor) + data = TorchConverter.to_numpy(tensor) - self.model_init += INDENT + "NN_initTensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, {dtype}, array_pointer);\n".format( + self.model_init += INDENT + "NN_init_tensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, {dtype}, array_pointer);\n".format( name=name, dim=len(tensor.shape), shape=", ".join(str(x) for x in tensor.shape), - dtype=TorchConverter.dtypeToStr(tensor.dtype) + dtype=TorchConverter.dtype_to_str(tensor.dtype) ) self.model_init += INDENT + "array_pointer += {increment};\n".format( increment=np.prod(tensor.shape) ) - self.weight_content += TorchConverter.toBytes(data) + self.weight_content += TorchConverter.to_bytes(data) - def addOutputTensor(self, name, shape, dtype=torch.float32): + def add_output_tensor(self, name, shape, dtype=torch.float32): self.model_struct += INDENT + "Tensor {name};\n".format( name=name ) - self.model_init += INDENT + "NN_initTensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, {dtype}, NULL);\n".format( + self.model_init += INDENT + "NN_init_tensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, {dtype}, NULL);\n".format( name=name, dim=len(shape), shape=", ".join(str(x) for x in shape), - 
dtype=TorchConverter.dtypeToStr(dtype) + dtype=TorchConverter.dtype_to_str(dtype) ) def placeholder(self, target, args, kwargs): @@ -134,7 +134,7 @@ def placeholder(self, target, args, kwargs): self.model_struct += INDENT + "Tensor {name};\n".format(name=name) - self.model_init += INDENT + "NN_initTensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, DTYPE_F32, NULL);\n".format( + self.model_init += INDENT + "NN_init_tensor(&model->{name}, {dim}, (size_t[]){{{shape}}}, DTYPE_F32, NULL);\n".format( name=name, dim=len(shape), shape=", ".join(str(x) for x in shape) @@ -160,7 +160,7 @@ def call_function(self, target, args, kwargs): layer_name=layer_name, input_names=self.node_info[layer_name][0] ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) elif target == torch.nn.functional.interpolate: layer_name = "interpolate_{count}".format(count=count) if count > 0 else "interpolate" @@ -170,12 +170,12 @@ def call_function(self, target, args, kwargs): input_names=self.node_info[layer_name][0], scale_factor=kwargs.get("scale_factor") ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) elif target == torch.nn.functional.relu: layer_name = "relu_{count}".format(count=count) if count > 0 else "relu" self.model_forward += INDENT + "// F.{layer_name}\n".format(layer_name=layer_name) - self.model_forward += INDENT + "NN_ReLU(&model->{layer_name}, &model->{input_names[0]});\n".format( + self.model_forward += INDENT + "NN_relu(&model->{layer_name}, &model->{input_names[0]});\n".format( layer_name=layer_name, input_names=self.node_info[layer_name][0] ) @@ -183,7 +183,7 @@ def call_function(self, target, args, kwargs): elif target == torch.nn.functional.relu6: layer_name = "relu6_{count}".format(count=count) if count > 0 else "relu6" self.model_forward += INDENT + "// F.{layer_name}\n".format(layer_name=layer_name) - self.model_forward += INDENT + "NN_ReLU6(&model->{layer_name}, &model->{input_names[0]});\n".format( + self.model_forward += INDENT + "NN_relu6(&model->{layer_name}, &model->{input_names[0]});\n".format( layer_name=layer_name, input_names=self.node_info[layer_name][0] ) @@ -206,7 +206,7 @@ def call_module(self, target, args, kwargs): if len(output_shape) == 4: output_shape = (output_shape[0], output_shape[2], output_shape[3], output_shape[1]) - module = self.getModule(target) + module = self.get_module(target) layer_name = target.replace(".", "_") input_names = self.node_info[layer_name][0] @@ -217,24 +217,24 @@ def call_module(self, target, args, kwargs): ) if type(module) == torch.nn.Linear: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_weight".format(layer_name=layer_name), module.state_dict().get("weight") ) if module.bias is not None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_bias".format(layer_name=layer_name), module.state_dict().get("bias") ) batch_size = int(output_shape[0]) - self.addOutputTensor( + self.add_output_tensor( layer_name, (batch_size, module.out_features) ) - self.model_forward += INDENT + "NN_Linear(&model->{layer_name}, &model->{input_names[0]}, {weight}, {bias});\n".format( + self.model_forward += INDENT + "NN_linear(&model->{layer_name}, &model->{input_names[0]}, {weight}, {bias});\n".format( layer_name=layer_name, input_names=input_names, weight="&model->{layer_name}_weight".format(layer_name=layer_name), @@ -243,30 +243,30 @@ def call_module(self, target, args, kwargs): elif type(module) == torch.nn.BatchNorm2d: if module.weight is not 
None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_weight".format(layer_name=layer_name), module.state_dict().get("weight") ) if module.bias is not None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_bias".format(layer_name=layer_name), module.state_dict().get("bias") ) if module.running_mean is not None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_running_mean".format(layer_name=layer_name), module.state_dict().get("running_mean") ) if module.running_var is not None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_running_var".format(layer_name=layer_name), module.state_dict().get("running_var") ) batch_size = int(output_shape[0]) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) - self.model_forward += INDENT + """NN_BatchNorm2d( + self.model_forward += INDENT + """NN_batch_norm2d( &model->{layer_name}, &model->{input_name[0]}, {weight}, {bias}, {eps}, {running_mean}, {running_var});\n""".format( @@ -282,19 +282,19 @@ def call_module(self, target, args, kwargs): elif type(module) == torch.nn.Conv2d: if module.weight is not None: # weight need to be converted from (out_ch, in_ch, kh, kw) to (kh, kw, in_ch, out_ch) - self.addDataTensor( + self.add_data_tensor( "{layer_name}_weight".format(layer_name=layer_name), module.state_dict().get("weight").permute(2, 3, 1, 0) ) if module.bias is not None: - self.addDataTensor( + self.add_data_tensor( "{layer_name}_bias".format(layer_name=layer_name), module.state_dict().get("bias") ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) - self.model_forward += INDENT + """NN_Conv2d( + self.model_forward += INDENT + """NN_conv2d( &model->{layer_name}, &model->{input_names[0]}, {weight}, {bias}, (size_t[]){{{stride}}}, (size_t[]){{{padding}}}, (size_t[]){{{dilation}}}, {groups});\n""".format( layer_name=layer_name, @@ -309,26 +309,26 @@ def call_module(self, target, args, kwargs): self.prev_layer_name = "{layer_name}".format(layer_name=layer_name) elif type(module) == torch.nn.ReLU: - self.model_forward += INDENT + "NN_ReLU(&model->{layer_name}, &model->{input_names[0]});\n".format( + self.model_forward += INDENT + "NN_relu(&model->{layer_name}, &model->{input_names[0]});\n".format( layer_name=layer_name, input_names=input_names ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) elif type(module) == torch.nn.ReLU6: - self.model_forward += INDENT + "NN_ReLU6(&model->{layer_name}, &model->{input_names[0]});\n".format( + self.model_forward += INDENT + "NN_relu6(&model->{layer_name}, &model->{input_names[0]});\n".format( layer_name=layer_name, input_names=input_names ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) elif type(module) == torch.nn.ELU: - self.model_forward += INDENT + "NN_ELU(&model->{layer_name}, &model->{input_names[0]}, {eps});\n".format( + self.model_forward += INDENT + "NN_elu(&model->{layer_name}, &model->{input_names[0]}, {eps});\n".format( layer_name=layer_name, input_names=input_names, eps=module.alpha ) - self.addOutputTensor(layer_name, output_shape) + self.add_output_tensor(layer_name, output_shape) else: print("[WARNING] Unsupported module call:", target) diff --git a/example/char-rnn/runtime_test_c/char-rnn.c b/example/char-rnn/runtime_test_c/char-rnn.c index a510980..309a977 100644 --- a/example/char-rnn/runtime_test_c/char-rnn.c +++ 
b/example/char-rnn/runtime_test_c/char-rnn.c @@ -57,7 +57,7 @@ int main() { for (int j=1; j %s\n", str); printf("score: ("); - NN_printFloat(output.data[index], 2); + NN_print_f32(output.data[index], 2); printf("), predicted: (%d, %s)\n", index, categories[index]); } diff --git a/example/char-rnn/runtime_test_c/model.h b/example/char-rnn/runtime_test_c/model.h index 03a3472..b89c5e9 100644 --- a/example/char-rnn/runtime_test_c/model.h +++ b/example/char-rnn/runtime_test_c/model.h @@ -35,9 +35,9 @@ static void forward(Matrix *output, Matrix *hidden, Matrix *input) { // Input Matrix *input_out = input; // Linear - NN_Linear(hidden, &i2h_weight_transposed, &i2h_bias, input_out); + NN_linear(hidden, &i2h_weight_transposed, &i2h_bias, input_out); // Linear - NN_Linear(output, &h2o_weight_transposed, &h2o_bias, hidden); + NN_linear(output, &h2o_weight_transposed, &h2o_bias, hidden); // Log Softmax NN_logSoftmax(output, output); } diff --git a/example/char-rnn/runtime_test_c/nn.h b/example/char-rnn/runtime_test_c/nn.h index 2d843fe..b6fdc01 100644 --- a/example/char-rnn/runtime_test_c/nn.h +++ b/example/char-rnn/runtime_test_c/nn.h @@ -31,7 +31,7 @@ void NN_assert(int condition, char *message) { * These functions assumes that printf is available. */ -void NN_printFloat(float v, int16_t num_digits) { +void NN_print_f32(float v, int16_t num_digits) { int32_t scale = 1; int32_t integer_part, fractional_part; while (num_digits != 0) { @@ -46,14 +46,14 @@ void NN_printFloat(float v, int16_t num_digits) { printf("%i.%i", integer_part, fractional_part); } -void NN_printShape(Matrix *a) { +void NN_print_shape(Matrix *a) { printf("(%d, %d)\n", a->rows, a->cols); } void NN_printMatrix(Matrix *a) { for (size_t i = 0; i < a->rows; i++) { for (size_t j = 0; j < a->cols; j++) { - NN_printFloat(a->data[i * a->cols + j], 2); + NN_print_f32(a->data[i * a->cols + j], 2); printf(" "); } printf("\n"); @@ -128,7 +128,7 @@ size_t NN_argmax(Matrix *a) { * ====== Operators ====== */ -void NN_Linear(Matrix *out, Matrix *weight, Matrix *bias, Matrix *input) { +void NN_linear(Matrix *out, Matrix *weight, Matrix *bias, Matrix *input) { NN_matmul(out, input, weight); NN_matadd(out, out, bias); } diff --git a/example/char-rnn/runtime_test_np/model.py b/example/char-rnn/runtime_test_np/model.py index 667ba68..607fc36 100644 --- a/example/char-rnn/runtime_test_np/model.py +++ b/example/char-rnn/runtime_test_np/model.py @@ -8,9 +8,9 @@ def forward(input): # Input input_out = input # Linear - i2h_out = NN_Linear(input_out, i2h_weight_transposed, i2h_bias) + i2h_out = NN_linear(input_out, i2h_weight_transposed, i2h_bias) # Linear - h2o_out = NN_Linear(i2h_out, h2o_weight_transposed, h2o_bias) + h2o_out = NN_linear(i2h_out, h2o_weight_transposed, h2o_bias) # Log Softmax softmax_out = nn_logsoftmax(h2o_out) return softmax_out, i2h_out diff --git a/example/char-rnn/runtime_test_np/nn.py b/example/char-rnn/runtime_test_np/nn.py index 233a995..1caa9b6 100644 --- a/example/char-rnn/runtime_test_np/nn.py +++ b/example/char-rnn/runtime_test_np/nn.py @@ -1,6 +1,6 @@ import numpy as np -def NN_Linear(input, weight_T, bias): +def NN_linear(input, weight_T, bias): return np.matmul(input, weight_T) + bias def nn_logsoftmax(input): diff --git a/example/diffuse-loco/model.h b/example/diffuse-loco/model.h index 7edefba..7e8021b 100644 --- a/example/diffuse-loco/model.h +++ b/example/diffuse-loco/model.h @@ -41,44 +41,44 @@ void forward(Model *model); void init(Model *model) { float *array_pointer = (float *)model_weight_data; - 
NN_initTensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL); + NN_init_tensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL); // : actor_0 - NN_initTensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer); array_pointer += 24576; - NN_initTensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); // : actor_1 - NN_initTensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); // : actor_2 - NN_initTensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer); array_pointer += 131072; - NN_initTensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); // : actor_3 - NN_initTensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); // : actor_4 - NN_initTensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer); array_pointer += 32768; - NN_initTensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer); array_pointer += 128; - NN_initTensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); // : actor_5 - NN_initTensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); // : actor_6 - NN_initTensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer); array_pointer += 1536; - NN_initTensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer); array_pointer += 12; - NN_initTensor(&model->actor_6, 2, (size_t[]){1, 12}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_6, 2, (size_t[]){1, 12}, DTYPE_F32, NULL); } @@ -87,13 +87,13 @@ void init(Model *model) { * Forward pass of the model */ void forward(Model *model) { - NN_Linear(&model->actor_0, &model->input_1, &model->actor_0_weight, &model->actor_0_bias); - NN_ELU(&model->actor_1, &model->actor_0, 1.0); - NN_Linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias); - NN_ELU(&model->actor_3, &model->actor_2, 1.0); - NN_Linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias); - NN_ELU(&model->actor_5, &model->actor_4, 1.0); - NN_Linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias); + NN_linear(&model->actor_0, &model->input_1, 
&model->actor_0_weight, &model->actor_0_bias); + NN_elu(&model->actor_1, &model->actor_0, 1.0); + NN_linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias); + NN_elu(&model->actor_3, &model->actor_2, 1.0); + NN_linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias); + NN_elu(&model->actor_5, &model->actor_4, 1.0); + NN_linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias); } diff --git a/example/fast-depth/CMakeLists.txt b/example/fast-depth/CMakeLists.txt index c3960ee..6648c9d 100644 --- a/example/fast-depth/CMakeLists.txt +++ b/example/fast-depth/CMakeLists.txt @@ -4,7 +4,7 @@ project(fast-depth LANGUAGES C) add_executable(fast-depth main.c) -target_include_directories(fast-depth PUBLIC inc) +target_include_directories(fast-depth PUBLIC inc ../utility) target_compile_features(fast-depth INTERFACE c_std_11) diff --git a/example/fast-depth/README.md b/example/fast-depth/README.md index facee2d..dcb6692 100644 --- a/example/fast-depth/README.md +++ b/example/fast-depth/README.md @@ -41,7 +41,7 @@ Building for spike: ```bash cd ./example/fast-depth/build/ -cmake .. -D RISCV=ON +cmake .. -D RISCV=ON -D RVV=ON cmake --build . spike --isa=rv64gcv_zicntr --varch=vlen:512,elen:32 ./fast-depth ``` @@ -49,6 +49,9 @@ spike --isa=rv64gcv_zicntr --varch=vlen:512,elen:32 ./fast-depth Running with Gemmini ```bash +cd ./example/fast-depth/build/ +cmake .. -D RISCV=ON -D GEMMINI=ON +cmake --build . spike --extension=gemmini ./fast-depth ``` diff --git a/example/fast-depth/main.c b/example/fast-depth/main.c index 925003f..7baf4a9 100644 --- a/example/fast-depth/main.c +++ b/example/fast-depth/main.c @@ -13,6 +13,7 @@ #include "nn.h" #include "model.h" +#include "termimg.h" // load the weight data block from the model.bin file INCLUDE_FILE(".rodata", "../input.bin", model_input); @@ -21,16 +22,21 @@ extern size_t model_input_start[]; extern size_t model_input_end[]; -// static void enable_vector_operations() { -// unsigned long mstatus; -// asm volatile("csrr %0, mstatus" : "=r"(mstatus)); -// mstatus |= 0x00000600 | 0x00006000 | 0x00018000; -// asm volatile("csrw mstatus, %0"::"r"(mstatus)); -// } - int main() { + #ifdef RVV + printf("Using RVV\n"); + + // enable vector instructions + unsigned long mstatus; + asm volatile("csrr %0, mstatus" : "=r"(mstatus)); + mstatus |= 0x00000600 | 0x00006000 | 0x00018000; + asm volatile("csrw mstatus, %0"::"r"(mstatus)); + #endif + + #ifdef GEMMINI + printf("Using Gemmini\n"); + #endif - // enable_vector_operations(); Model *model = malloc(sizeof(Model)); @@ -53,11 +59,8 @@ int main() { NN_interpolate(img, &model->decode_conv6_2, (float []){0.125, 0.25}); - - printf("output:\n"); - showASCIIImage(img); - // showASCIIImage(&model->decode_conv6_2); + show_ASCII_image(img, 0, 0); return 0; } diff --git a/example/fast-depth/model.h b/example/fast-depth/model.h index 17b48c5..aa4addb 100644 --- a/example/fast-depth/model.h +++ b/example/fast-depth/model.h @@ -339,737 +339,737 @@ void forward(Model *model); void init(Model *model) { float *array_pointer = (float *)model_weight_data; - NN_initTensor(&model->x, 4, (size_t[]){1, 224, 224, 3}, DTYPE_F32, NULL); + NN_init_tensor(&model->x, 4, (size_t[]){1, 224, 224, 3}, DTYPE_F32, NULL); // : conv0_0 - NN_initTensor(&model->conv0_0_weight, 4, (size_t[]){3, 3, 3, 16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv0_0_weight, 4, (size_t[]){3, 3, 3, 16}, DTYPE_F32, array_pointer); array_pointer += 432; - 
NN_initTensor(&model->conv0_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv0_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv0_1 - NN_initTensor(&model->conv0_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv0_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv0_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv0_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv0_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv0_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv0_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv0_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv0_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv0_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv0_2 - NN_initTensor(&model->conv0_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv0_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv1_0 - NN_initTensor(&model->conv1_0_weight, 4, (size_t[]){3, 3, 1, 16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_0_weight, 4, (size_t[]){3, 3, 1, 16}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv1_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv1_1 - NN_initTensor(&model->conv1_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv1_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv1_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv1_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->conv1_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv1_2 - NN_initTensor(&model->conv1_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : conv1_3 - NN_initTensor(&model->conv1_3_weight, 4, (size_t[]){1, 1, 16, 56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_3_weight, 4, (size_t[]){1, 1, 16, 56}, DTYPE_F32, array_pointer); array_pointer += 896; - NN_initTensor(&model->conv1_3, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_3, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : conv1_4 - NN_initTensor(&model->conv1_4_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_4_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv1_4_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + 
NN_init_tensor(&model->conv1_4_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv1_4_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_4_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv1_4_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv1_4_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv1_4, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_4, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : conv1_5 - NN_initTensor(&model->conv1_5, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv1_5, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : conv2_0 - NN_initTensor(&model->conv2_0_weight, 4, (size_t[]){3, 3, 1, 56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_0_weight, 4, (size_t[]){3, 3, 1, 56}, DTYPE_F32, array_pointer); array_pointer += 504; - NN_initTensor(&model->conv2_0, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_0, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); // : conv2_1 - NN_initTensor(&model->conv2_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv2_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv2_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv2_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->conv2_1, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_1, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); // : conv2_2 - NN_initTensor(&model->conv2_2, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_2, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); // : conv2_3 - NN_initTensor(&model->conv2_3_weight, 4, (size_t[]){1, 1, 56, 88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_3_weight, 4, (size_t[]){1, 1, 56, 88}, DTYPE_F32, array_pointer); array_pointer += 4928; - NN_initTensor(&model->conv2_3, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_3, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv2_4 - NN_initTensor(&model->conv2_4_weight, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_4_weight, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv2_4_bias, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_4_bias, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv2_4_running_mean, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_4_running_mean, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv2_4_running_var, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv2_4_running_var, 1, (size_t[]){88}, 
DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv2_4, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_4, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv2_5 - NN_initTensor(&model->conv2_5, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv2_5, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv3_0 - NN_initTensor(&model->conv3_0_weight, 4, (size_t[]){3, 3, 1, 88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_0_weight, 4, (size_t[]){3, 3, 1, 88}, DTYPE_F32, array_pointer); array_pointer += 792; - NN_initTensor(&model->conv3_0, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_0, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv3_1 - NN_initTensor(&model->conv3_1_weight, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_1_weight, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv3_1_bias, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_1_bias, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv3_1_running_mean, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_1_running_mean, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv3_1_running_var, 1, (size_t[]){88}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_1_running_var, 1, (size_t[]){88}, DTYPE_F32, array_pointer); array_pointer += 88; - NN_initTensor(&model->conv3_1, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_1, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv3_2 - NN_initTensor(&model->conv3_2, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_2, 4, (size_t[]){1, 56, 56, 88}, DTYPE_F32, NULL); // : conv3_3 - NN_initTensor(&model->conv3_3_weight, 4, (size_t[]){1, 1, 88, 120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_3_weight, 4, (size_t[]){1, 1, 88, 120}, DTYPE_F32, array_pointer); array_pointer += 10560; - NN_initTensor(&model->conv3_3, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_3, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : conv3_4 - NN_initTensor(&model->conv3_4_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_4_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv3_4_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_4_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv3_4_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_4_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv3_4_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv3_4_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv3_4, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_4, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : conv3_5 - NN_initTensor(&model->conv3_5, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv3_5, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : conv4_0 - NN_initTensor(&model->conv4_0_weight, 4, (size_t[]){3, 3, 
1, 120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_0_weight, 4, (size_t[]){3, 3, 1, 120}, DTYPE_F32, array_pointer); array_pointer += 1080; - NN_initTensor(&model->conv4_0, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_0, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); // : conv4_1 - NN_initTensor(&model->conv4_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv4_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv4_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv4_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->conv4_1, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_1, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); // : conv4_2 - NN_initTensor(&model->conv4_2, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_2, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); // : conv4_3 - NN_initTensor(&model->conv4_3_weight, 4, (size_t[]){1, 1, 120, 144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_3_weight, 4, (size_t[]){1, 1, 120, 144}, DTYPE_F32, array_pointer); array_pointer += 17280; - NN_initTensor(&model->conv4_3, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_3, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv4_4 - NN_initTensor(&model->conv4_4_weight, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_4_weight, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv4_4_bias, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_4_bias, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv4_4_running_mean, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_4_running_mean, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv4_4_running_var, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv4_4_running_var, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv4_4, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_4, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv4_5 - NN_initTensor(&model->conv4_5, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv4_5, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv5_0 - NN_initTensor(&model->conv5_0_weight, 4, (size_t[]){3, 3, 1, 144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_0_weight, 4, (size_t[]){3, 3, 1, 144}, DTYPE_F32, array_pointer); array_pointer += 1296; - NN_initTensor(&model->conv5_0, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_0, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv5_1 - NN_initTensor(&model->conv5_1_weight, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + 
NN_init_tensor(&model->conv5_1_weight, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv5_1_bias, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_1_bias, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv5_1_running_mean, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_1_running_mean, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv5_1_running_var, 1, (size_t[]){144}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_1_running_var, 1, (size_t[]){144}, DTYPE_F32, array_pointer); array_pointer += 144; - NN_initTensor(&model->conv5_1, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_1, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv5_2 - NN_initTensor(&model->conv5_2, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_2, 4, (size_t[]){1, 28, 28, 144}, DTYPE_F32, NULL); // : conv5_3 - NN_initTensor(&model->conv5_3_weight, 4, (size_t[]){1, 1, 144, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_3_weight, 4, (size_t[]){1, 1, 144, 256}, DTYPE_F32, array_pointer); array_pointer += 36864; - NN_initTensor(&model->conv5_3, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_3, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : conv5_4 - NN_initTensor(&model->conv5_4_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_4_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv5_4_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_4_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv5_4_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_4_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv5_4_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv5_4_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv5_4, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_4, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : conv5_5 - NN_initTensor(&model->conv5_5, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv5_5, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : conv6_0 - NN_initTensor(&model->conv6_0_weight, 4, (size_t[]){3, 3, 1, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_0_weight, 4, (size_t[]){3, 3, 1, 256}, DTYPE_F32, array_pointer); array_pointer += 2304; - NN_initTensor(&model->conv6_0, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_0, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); // : conv6_1 - NN_initTensor(&model->conv6_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv6_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv6_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + 
NN_init_tensor(&model->conv6_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv6_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->conv6_1, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_1, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); // : conv6_2 - NN_initTensor(&model->conv6_2, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_2, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); // : conv6_3 - NN_initTensor(&model->conv6_3_weight, 4, (size_t[]){1, 1, 256, 408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_3_weight, 4, (size_t[]){1, 1, 256, 408}, DTYPE_F32, array_pointer); array_pointer += 104448; - NN_initTensor(&model->conv6_3, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_3, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv6_4 - NN_initTensor(&model->conv6_4_weight, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_4_weight, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv6_4_bias, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_4_bias, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv6_4_running_mean, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_4_running_mean, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv6_4_running_var, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv6_4_running_var, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv6_4, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_4, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv6_5 - NN_initTensor(&model->conv6_5, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv6_5, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv7_0 - NN_initTensor(&model->conv7_0_weight, 4, (size_t[]){3, 3, 1, 408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_0_weight, 4, (size_t[]){3, 3, 1, 408}, DTYPE_F32, array_pointer); array_pointer += 3672; - NN_initTensor(&model->conv7_0, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_0, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv7_1 - NN_initTensor(&model->conv7_1_weight, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_1_weight, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv7_1_bias, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_1_bias, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv7_1_running_mean, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_1_running_mean, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv7_1_running_var, 1, (size_t[]){408}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_1_running_var, 1, (size_t[]){408}, DTYPE_F32, array_pointer); array_pointer += 408; - NN_initTensor(&model->conv7_1, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_1, 
4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv7_2 - NN_initTensor(&model->conv7_2, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_2, 4, (size_t[]){1, 14, 14, 408}, DTYPE_F32, NULL); // : conv7_3 - NN_initTensor(&model->conv7_3_weight, 4, (size_t[]){1, 1, 408, 376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_3_weight, 4, (size_t[]){1, 1, 408, 376}, DTYPE_F32, array_pointer); array_pointer += 153408; - NN_initTensor(&model->conv7_3, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_3, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv7_4 - NN_initTensor(&model->conv7_4_weight, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_4_weight, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv7_4_bias, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_4_bias, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv7_4_running_mean, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_4_running_mean, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv7_4_running_var, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv7_4_running_var, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv7_4, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_4, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv7_5 - NN_initTensor(&model->conv7_5, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv7_5, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv8_0 - NN_initTensor(&model->conv8_0_weight, 4, (size_t[]){3, 3, 1, 376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_0_weight, 4, (size_t[]){3, 3, 1, 376}, DTYPE_F32, array_pointer); array_pointer += 3384; - NN_initTensor(&model->conv8_0, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_0, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv8_1 - NN_initTensor(&model->conv8_1_weight, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_1_weight, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv8_1_bias, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_1_bias, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv8_1_running_mean, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_1_running_mean, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv8_1_running_var, 1, (size_t[]){376}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_1_running_var, 1, (size_t[]){376}, DTYPE_F32, array_pointer); array_pointer += 376; - NN_initTensor(&model->conv8_1, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_1, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv8_2 - NN_initTensor(&model->conv8_2, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_2, 4, (size_t[]){1, 14, 14, 376}, DTYPE_F32, NULL); // : conv8_3 - NN_initTensor(&model->conv8_3_weight, 4, (size_t[]){1, 1, 376, 272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_3_weight, 4, (size_t[]){1, 1, 376, 272}, DTYPE_F32, array_pointer); 
array_pointer += 102272; - NN_initTensor(&model->conv8_3, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_3, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv8_4 - NN_initTensor(&model->conv8_4_weight, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_4_weight, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv8_4_bias, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_4_bias, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv8_4_running_mean, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_4_running_mean, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv8_4_running_var, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv8_4_running_var, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv8_4, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_4, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv8_5 - NN_initTensor(&model->conv8_5, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv8_5, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv9_0 - NN_initTensor(&model->conv9_0_weight, 4, (size_t[]){3, 3, 1, 272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_0_weight, 4, (size_t[]){3, 3, 1, 272}, DTYPE_F32, array_pointer); array_pointer += 2448; - NN_initTensor(&model->conv9_0, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_0, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv9_1 - NN_initTensor(&model->conv9_1_weight, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_1_weight, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv9_1_bias, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_1_bias, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv9_1_running_mean, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_1_running_mean, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv9_1_running_var, 1, (size_t[]){272}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_1_running_var, 1, (size_t[]){272}, DTYPE_F32, array_pointer); array_pointer += 272; - NN_initTensor(&model->conv9_1, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_1, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv9_2 - NN_initTensor(&model->conv9_2, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_2, 4, (size_t[]){1, 14, 14, 272}, DTYPE_F32, NULL); // : conv9_3 - NN_initTensor(&model->conv9_3_weight, 4, (size_t[]){1, 1, 272, 288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_3_weight, 4, (size_t[]){1, 1, 272, 288}, DTYPE_F32, array_pointer); array_pointer += 78336; - NN_initTensor(&model->conv9_3, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_3, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv9_4 - NN_initTensor(&model->conv9_4_weight, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_4_weight, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv9_4_bias, 
1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_4_bias, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv9_4_running_mean, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_4_running_mean, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv9_4_running_var, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv9_4_running_var, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv9_4, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_4, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv9_5 - NN_initTensor(&model->conv9_5, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv9_5, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv10_0 - NN_initTensor(&model->conv10_0_weight, 4, (size_t[]){3, 3, 1, 288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_0_weight, 4, (size_t[]){3, 3, 1, 288}, DTYPE_F32, array_pointer); array_pointer += 2592; - NN_initTensor(&model->conv10_0, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_0, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv10_1 - NN_initTensor(&model->conv10_1_weight, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_1_weight, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv10_1_bias, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_1_bias, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv10_1_running_mean, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_1_running_mean, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv10_1_running_var, 1, (size_t[]){288}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_1_running_var, 1, (size_t[]){288}, DTYPE_F32, array_pointer); array_pointer += 288; - NN_initTensor(&model->conv10_1, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_1, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv10_2 - NN_initTensor(&model->conv10_2, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_2, 4, (size_t[]){1, 14, 14, 288}, DTYPE_F32, NULL); // : conv10_3 - NN_initTensor(&model->conv10_3_weight, 4, (size_t[]){1, 1, 288, 296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_3_weight, 4, (size_t[]){1, 1, 288, 296}, DTYPE_F32, array_pointer); array_pointer += 85248; - NN_initTensor(&model->conv10_3, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_3, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv10_4 - NN_initTensor(&model->conv10_4_weight, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_4_weight, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv10_4_bias, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_4_bias, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv10_4_running_mean, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_4_running_mean, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - 
NN_initTensor(&model->conv10_4_running_var, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv10_4_running_var, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv10_4, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_4, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv10_5 - NN_initTensor(&model->conv10_5, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv10_5, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv11_0 - NN_initTensor(&model->conv11_0_weight, 4, (size_t[]){3, 3, 1, 296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_0_weight, 4, (size_t[]){3, 3, 1, 296}, DTYPE_F32, array_pointer); array_pointer += 2664; - NN_initTensor(&model->conv11_0, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_0, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv11_1 - NN_initTensor(&model->conv11_1_weight, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_1_weight, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv11_1_bias, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_1_bias, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv11_1_running_mean, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_1_running_mean, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv11_1_running_var, 1, (size_t[]){296}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_1_running_var, 1, (size_t[]){296}, DTYPE_F32, array_pointer); array_pointer += 296; - NN_initTensor(&model->conv11_1, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_1, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv11_2 - NN_initTensor(&model->conv11_2, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_2, 4, (size_t[]){1, 14, 14, 296}, DTYPE_F32, NULL); // : conv11_3 - NN_initTensor(&model->conv11_3_weight, 4, (size_t[]){1, 1, 296, 328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_3_weight, 4, (size_t[]){1, 1, 296, 328}, DTYPE_F32, array_pointer); array_pointer += 97088; - NN_initTensor(&model->conv11_3, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_3, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); // : conv11_4 - NN_initTensor(&model->conv11_4_weight, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_4_weight, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv11_4_bias, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_4_bias, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv11_4_running_mean, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_4_running_mean, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv11_4_running_var, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv11_4_running_var, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv11_4, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_4, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); // : conv11_5 - 
NN_initTensor(&model->conv11_5, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv11_5, 4, (size_t[]){1, 14, 14, 328}, DTYPE_F32, NULL); // : conv12_0 - NN_initTensor(&model->conv12_0_weight, 4, (size_t[]){3, 3, 1, 328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_0_weight, 4, (size_t[]){3, 3, 1, 328}, DTYPE_F32, array_pointer); array_pointer += 2952; - NN_initTensor(&model->conv12_0, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_0, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); // : conv12_1 - NN_initTensor(&model->conv12_1_weight, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_1_weight, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv12_1_bias, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_1_bias, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv12_1_running_mean, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_1_running_mean, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv12_1_running_var, 1, (size_t[]){328}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_1_running_var, 1, (size_t[]){328}, DTYPE_F32, array_pointer); array_pointer += 328; - NN_initTensor(&model->conv12_1, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_1, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); // : conv12_2 - NN_initTensor(&model->conv12_2, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_2, 4, (size_t[]){1, 7, 7, 328}, DTYPE_F32, NULL); // : conv12_3 - NN_initTensor(&model->conv12_3_weight, 4, (size_t[]){1, 1, 328, 480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_3_weight, 4, (size_t[]){1, 1, 328, 480}, DTYPE_F32, array_pointer); array_pointer += 157440; - NN_initTensor(&model->conv12_3, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_3, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv12_4 - NN_initTensor(&model->conv12_4_weight, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_4_weight, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv12_4_bias, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_4_bias, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv12_4_running_mean, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_4_running_mean, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv12_4_running_var, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv12_4_running_var, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv12_4, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_4, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv12_5 - NN_initTensor(&model->conv12_5, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv12_5, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv13_0 - NN_initTensor(&model->conv13_0_weight, 4, (size_t[]){3, 3, 1, 480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_0_weight, 4, (size_t[]){3, 3, 1, 480}, DTYPE_F32, array_pointer); array_pointer += 4320; - 
NN_initTensor(&model->conv13_0, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_0, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv13_1 - NN_initTensor(&model->conv13_1_weight, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_1_weight, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv13_1_bias, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_1_bias, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv13_1_running_mean, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_1_running_mean, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv13_1_running_var, 1, (size_t[]){480}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_1_running_var, 1, (size_t[]){480}, DTYPE_F32, array_pointer); array_pointer += 480; - NN_initTensor(&model->conv13_1, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_1, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv13_2 - NN_initTensor(&model->conv13_2, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_2, 4, (size_t[]){1, 7, 7, 480}, DTYPE_F32, NULL); // : conv13_3 - NN_initTensor(&model->conv13_3_weight, 4, (size_t[]){1, 1, 480, 512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_3_weight, 4, (size_t[]){1, 1, 480, 512}, DTYPE_F32, array_pointer); array_pointer += 245760; - NN_initTensor(&model->conv13_3, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_3, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : conv13_4 - NN_initTensor(&model->conv13_4_weight, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_4_weight, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->conv13_4_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_4_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->conv13_4_running_mean, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_4_running_mean, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->conv13_4_running_var, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->conv13_4_running_var, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->conv13_4, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_4, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : conv13_5 - NN_initTensor(&model->conv13_5, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->conv13_5, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : decode_conv1_0_0 - NN_initTensor(&model->decode_conv1_0_0_weight, 4, (size_t[]){5, 5, 1, 512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_0_0_weight, 4, (size_t[]){5, 5, 1, 512}, DTYPE_F32, array_pointer); array_pointer += 12800; - NN_initTensor(&model->decode_conv1_0_0, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_0_0, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : decode_conv1_0_1 - NN_initTensor(&model->decode_conv1_0_1_weight, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_0_1_weight, 1, (size_t[]){512}, DTYPE_F32, array_pointer); 
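The decode_conv1_* tensors initialized here and just below form the first decoder block, and the forward() hunk further down wires them in a fixed sequence: a 5x5 depthwise NN_conv2d whose groups argument equals the channel count (512), a batch norm, a ReLU, then a 1x1 pointwise NN_conv2d with groups set to 1 that maps 512 channels down to 200. As a hedged sketch, the block spelled out with the call forms used later in this diff:

```c
// Sketch of the first decoder block (decode_conv1_* in this file). Weight
// shapes follow the init above: {5, 5, 1, 512} for the depthwise step and
// {1, 1, 512, 200} for the pointwise step.

// 5x5 depthwise conv: groups == 512, one filter per input channel, padding 2.
NN_conv2d(&model->decode_conv1_0_0, &model->conv13_5, &model->decode_conv1_0_0_weight,
          NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 512);
NN_batch_norm2d(&model->decode_conv1_0_1, &model->decode_conv1_0_0,
                &model->decode_conv1_0_1_weight, &model->decode_conv1_0_1_bias, 1e-05,
                &model->decode_conv1_0_1_running_mean, &model->decode_conv1_0_1_running_var);
NN_relu(&model->decode_conv1_0_2, &model->decode_conv1_0_1);

// 1x1 pointwise conv: groups == 1, mixing 512 channels down to 200.
NN_conv2d(&model->decode_conv1_1_0, &model->decode_conv1_0_2, &model->decode_conv1_1_0_weight,
          NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1);
```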
array_pointer += 512; - NN_initTensor(&model->decode_conv1_0_1_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_0_1_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->decode_conv1_0_1_running_mean, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_0_1_running_mean, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->decode_conv1_0_1_running_var, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_0_1_running_var, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->decode_conv1_0_1, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_0_1, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : decode_conv1_0_2 - NN_initTensor(&model->decode_conv1_0_2, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_0_2, 4, (size_t[]){1, 7, 7, 512}, DTYPE_F32, NULL); // : decode_conv1_1_0 - NN_initTensor(&model->decode_conv1_1_0_weight, 4, (size_t[]){1, 1, 512, 200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_1_0_weight, 4, (size_t[]){1, 1, 512, 200}, DTYPE_F32, array_pointer); array_pointer += 102400; - NN_initTensor(&model->decode_conv1_1_0, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_1_0, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); // : decode_conv1_1_1 - NN_initTensor(&model->decode_conv1_1_1_weight, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_1_1_weight, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv1_1_1_bias, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_1_1_bias, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv1_1_1_running_mean, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_1_1_running_mean, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv1_1_1_running_var, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv1_1_1_running_var, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv1_1_1, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_1_1, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); // : decode_conv1_1_2 - NN_initTensor(&model->decode_conv1_1_2, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); - NN_initTensor(&model->interpolate, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv1_1_2, 4, (size_t[]){1, 7, 7, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->interpolate, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); // : decode_conv2_0_0 - NN_initTensor(&model->decode_conv2_0_0_weight, 4, (size_t[]){5, 5, 1, 200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_0_0_weight, 4, (size_t[]){5, 5, 1, 200}, DTYPE_F32, array_pointer); array_pointer += 5000; - NN_initTensor(&model->decode_conv2_0_0, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_0_0, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); // : decode_conv2_0_1 - NN_initTensor(&model->decode_conv2_0_1_weight, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + 
NN_init_tensor(&model->decode_conv2_0_1_weight, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv2_0_1_bias, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_0_1_bias, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv2_0_1_running_mean, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_0_1_running_mean, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv2_0_1_running_var, 1, (size_t[]){200}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_0_1_running_var, 1, (size_t[]){200}, DTYPE_F32, array_pointer); array_pointer += 200; - NN_initTensor(&model->decode_conv2_0_1, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_0_1, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); // : decode_conv2_0_2 - NN_initTensor(&model->decode_conv2_0_2, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_0_2, 4, (size_t[]){1, 14, 14, 200}, DTYPE_F32, NULL); // : decode_conv2_1_0 - NN_initTensor(&model->decode_conv2_1_0_weight, 4, (size_t[]){1, 1, 200, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_1_0_weight, 4, (size_t[]){1, 1, 200, 256}, DTYPE_F32, array_pointer); array_pointer += 51200; - NN_initTensor(&model->decode_conv2_1_0, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_1_0, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); // : decode_conv2_1_1 - NN_initTensor(&model->decode_conv2_1_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_1_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv2_1_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_1_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv2_1_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_1_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv2_1_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv2_1_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv2_1_1, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_1_1, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); // : decode_conv2_1_2 - NN_initTensor(&model->decode_conv2_1_2, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); - NN_initTensor(&model->interpolate_1, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); - NN_initTensor(&model->add, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv2_1_2, 4, (size_t[]){1, 14, 14, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->interpolate_1, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->add, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : decode_conv3_0_0 - NN_initTensor(&model->decode_conv3_0_0_weight, 4, (size_t[]){5, 5, 1, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_0_0_weight, 4, (size_t[]){5, 5, 1, 256}, DTYPE_F32, array_pointer); array_pointer += 6400; - NN_initTensor(&model->decode_conv3_0_0, 4, (size_t[]){1, 28, 
28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_0_0, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : decode_conv3_0_1 - NN_initTensor(&model->decode_conv3_0_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_0_1_weight, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv3_0_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_0_1_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv3_0_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_0_1_running_mean, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv3_0_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_0_1_running_var, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->decode_conv3_0_1, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_0_1, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : decode_conv3_0_2 - NN_initTensor(&model->decode_conv3_0_2, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_0_2, 4, (size_t[]){1, 28, 28, 256}, DTYPE_F32, NULL); // : decode_conv3_1_0 - NN_initTensor(&model->decode_conv3_1_0_weight, 4, (size_t[]){1, 1, 256, 120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_1_0_weight, 4, (size_t[]){1, 1, 256, 120}, DTYPE_F32, array_pointer); array_pointer += 30720; - NN_initTensor(&model->decode_conv3_1_0, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_1_0, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); // : decode_conv3_1_1 - NN_initTensor(&model->decode_conv3_1_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_1_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv3_1_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_1_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv3_1_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_1_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv3_1_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv3_1_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv3_1_1, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_1_1, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); // : decode_conv3_1_2 - NN_initTensor(&model->decode_conv3_1_2, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); - NN_initTensor(&model->interpolate_2, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); - NN_initTensor(&model->add_1, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv3_1_2, 4, (size_t[]){1, 28, 28, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->interpolate_2, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->add_1, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : decode_conv4_0_0 - NN_initTensor(&model->decode_conv4_0_0_weight, 4, 
(size_t[]){5, 5, 1, 120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_0_0_weight, 4, (size_t[]){5, 5, 1, 120}, DTYPE_F32, array_pointer); array_pointer += 3000; - NN_initTensor(&model->decode_conv4_0_0, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_0_0, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : decode_conv4_0_1 - NN_initTensor(&model->decode_conv4_0_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_0_1_weight, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv4_0_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_0_1_bias, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv4_0_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_0_1_running_mean, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv4_0_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_0_1_running_var, 1, (size_t[]){120}, DTYPE_F32, array_pointer); array_pointer += 120; - NN_initTensor(&model->decode_conv4_0_1, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_0_1, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : decode_conv4_0_2 - NN_initTensor(&model->decode_conv4_0_2, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_0_2, 4, (size_t[]){1, 56, 56, 120}, DTYPE_F32, NULL); // : decode_conv4_1_0 - NN_initTensor(&model->decode_conv4_1_0_weight, 4, (size_t[]){1, 1, 120, 56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_1_0_weight, 4, (size_t[]){1, 1, 120, 56}, DTYPE_F32, array_pointer); array_pointer += 6720; - NN_initTensor(&model->decode_conv4_1_0, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_1_0, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); // : decode_conv4_1_1 - NN_initTensor(&model->decode_conv4_1_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_1_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv4_1_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_1_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv4_1_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_1_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv4_1_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv4_1_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv4_1_1, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_1_1, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); // : decode_conv4_1_2 - NN_initTensor(&model->decode_conv4_1_2, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); - NN_initTensor(&model->interpolate_3, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); - NN_initTensor(&model->add_2, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv4_1_2, 4, (size_t[]){1, 56, 56, 56}, DTYPE_F32, NULL); + 
NN_init_tensor(&model->interpolate_3, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->add_2, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : decode_conv5_0_0 - NN_initTensor(&model->decode_conv5_0_0_weight, 4, (size_t[]){5, 5, 1, 56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_0_0_weight, 4, (size_t[]){5, 5, 1, 56}, DTYPE_F32, array_pointer); array_pointer += 1400; - NN_initTensor(&model->decode_conv5_0_0, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_0_0, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : decode_conv5_0_1 - NN_initTensor(&model->decode_conv5_0_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_0_1_weight, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv5_0_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_0_1_bias, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv5_0_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_0_1_running_mean, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv5_0_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_0_1_running_var, 1, (size_t[]){56}, DTYPE_F32, array_pointer); array_pointer += 56; - NN_initTensor(&model->decode_conv5_0_1, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_0_1, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : decode_conv5_0_2 - NN_initTensor(&model->decode_conv5_0_2, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_0_2, 4, (size_t[]){1, 112, 112, 56}, DTYPE_F32, NULL); // : decode_conv5_1_0 - NN_initTensor(&model->decode_conv5_1_0_weight, 4, (size_t[]){1, 1, 56, 16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_1_0_weight, 4, (size_t[]){1, 1, 56, 16}, DTYPE_F32, array_pointer); array_pointer += 896; - NN_initTensor(&model->decode_conv5_1_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_1_0, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : decode_conv5_1_1 - NN_initTensor(&model->decode_conv5_1_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_1_1_weight, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->decode_conv5_1_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_1_1_bias, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->decode_conv5_1_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_1_1_running_mean, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->decode_conv5_1_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv5_1_1_running_var, 1, (size_t[]){16}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->decode_conv5_1_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_1_1, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); // : decode_conv5_1_2 - NN_initTensor(&model->decode_conv5_1_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); - NN_initTensor(&model->interpolate_4, 
4, (size_t[]){1, 224, 224, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv5_1_2, 4, (size_t[]){1, 112, 112, 16}, DTYPE_F32, NULL); + NN_init_tensor(&model->interpolate_4, 4, (size_t[]){1, 224, 224, 16}, DTYPE_F32, NULL); // : decode_conv6_0 - NN_initTensor(&model->decode_conv6_0_weight, 4, (size_t[]){1, 1, 16, 1}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv6_0_weight, 4, (size_t[]){1, 1, 16, 1}, DTYPE_F32, array_pointer); array_pointer += 16; - NN_initTensor(&model->decode_conv6_0, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv6_0, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); // : decode_conv6_1 - NN_initTensor(&model->decode_conv6_1_weight, 1, (size_t[]){1}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv6_1_weight, 1, (size_t[]){1}, DTYPE_F32, array_pointer); array_pointer += 1; - NN_initTensor(&model->decode_conv6_1_bias, 1, (size_t[]){1}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv6_1_bias, 1, (size_t[]){1}, DTYPE_F32, array_pointer); array_pointer += 1; - NN_initTensor(&model->decode_conv6_1_running_mean, 1, (size_t[]){1}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv6_1_running_mean, 1, (size_t[]){1}, DTYPE_F32, array_pointer); array_pointer += 1; - NN_initTensor(&model->decode_conv6_1_running_var, 1, (size_t[]){1}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->decode_conv6_1_running_var, 1, (size_t[]){1}, DTYPE_F32, array_pointer); array_pointer += 1; - NN_initTensor(&model->decode_conv6_1, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv6_1, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); // : decode_conv6_2 - NN_initTensor(&model->decode_conv6_2, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); + NN_init_tensor(&model->decode_conv6_2, 4, (size_t[]){1, 224, 224, 1}, DTYPE_F32, NULL); } @@ -1078,334 +1078,334 @@ void init(Model *model) { * Forward pass of the model */ void forward(Model *model) { - NN_Conv2d( + NN_conv2d( &model->conv0_0, &model->x, &model->conv0_0_weight, NULL, (size_t[]){2, 2}, (size_t[]){1, 1}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv0_1, &model->conv0_0, &model->conv0_1_weight, &model->conv0_1_bias, 1e-05, &model->conv0_1_running_mean, &model->conv0_1_running_var); - NN_ReLU6(&model->conv0_2, &model->conv0_1); - NN_Conv2d( + NN_relu6(&model->conv0_2, &model->conv0_1); + NN_conv2d( &model->conv1_0, &model->conv0_2, &model->conv1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 16); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv1_1, &model->conv1_0, &model->conv1_1_weight, &model->conv1_1_bias, 1e-05, &model->conv1_1_running_mean, &model->conv1_1_running_var); - NN_ReLU6(&model->conv1_2, &model->conv1_1); - NN_Conv2d( + NN_relu6(&model->conv1_2, &model->conv1_1); + NN_conv2d( &model->conv1_3, &model->conv1_2, &model->conv1_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv1_4, &model->conv1_3, &model->conv1_4_weight, &model->conv1_4_bias, 1e-05, &model->conv1_4_running_mean, &model->conv1_4_running_var); - NN_ReLU6(&model->conv1_5, &model->conv1_4); - NN_Conv2d( + NN_relu6(&model->conv1_5, &model->conv1_4); + NN_conv2d( &model->conv2_0, &model->conv1_5, &model->conv2_0_weight, NULL, (size_t[]){2, 2}, (size_t[]){1, 1}, (size_t[]){1, 1}, 56); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv2_1, &model->conv2_0, &model->conv2_1_weight, &model->conv2_1_bias, 1e-05, 
&model->conv2_1_running_mean, &model->conv2_1_running_var); - NN_ReLU6(&model->conv2_2, &model->conv2_1); - NN_Conv2d( + NN_relu6(&model->conv2_2, &model->conv2_1); + NN_conv2d( &model->conv2_3, &model->conv2_2, &model->conv2_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv2_4, &model->conv2_3, &model->conv2_4_weight, &model->conv2_4_bias, 1e-05, &model->conv2_4_running_mean, &model->conv2_4_running_var); - NN_ReLU6(&model->conv2_5, &model->conv2_4); - NN_Conv2d( + NN_relu6(&model->conv2_5, &model->conv2_4); + NN_conv2d( &model->conv3_0, &model->conv2_5, &model->conv3_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 88); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv3_1, &model->conv3_0, &model->conv3_1_weight, &model->conv3_1_bias, 1e-05, &model->conv3_1_running_mean, &model->conv3_1_running_var); - NN_ReLU6(&model->conv3_2, &model->conv3_1); - NN_Conv2d( + NN_relu6(&model->conv3_2, &model->conv3_1); + NN_conv2d( &model->conv3_3, &model->conv3_2, &model->conv3_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv3_4, &model->conv3_3, &model->conv3_4_weight, &model->conv3_4_bias, 1e-05, &model->conv3_4_running_mean, &model->conv3_4_running_var); - NN_ReLU6(&model->conv3_5, &model->conv3_4); - NN_Conv2d( + NN_relu6(&model->conv3_5, &model->conv3_4); + NN_conv2d( &model->conv4_0, &model->conv3_5, &model->conv4_0_weight, NULL, (size_t[]){2, 2}, (size_t[]){1, 1}, (size_t[]){1, 1}, 120); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv4_1, &model->conv4_0, &model->conv4_1_weight, &model->conv4_1_bias, 1e-05, &model->conv4_1_running_mean, &model->conv4_1_running_var); - NN_ReLU6(&model->conv4_2, &model->conv4_1); - NN_Conv2d( + NN_relu6(&model->conv4_2, &model->conv4_1); + NN_conv2d( &model->conv4_3, &model->conv4_2, &model->conv4_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv4_4, &model->conv4_3, &model->conv4_4_weight, &model->conv4_4_bias, 1e-05, &model->conv4_4_running_mean, &model->conv4_4_running_var); - NN_ReLU6(&model->conv4_5, &model->conv4_4); - NN_Conv2d( + NN_relu6(&model->conv4_5, &model->conv4_4); + NN_conv2d( &model->conv5_0, &model->conv4_5, &model->conv5_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 144); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv5_1, &model->conv5_0, &model->conv5_1_weight, &model->conv5_1_bias, 1e-05, &model->conv5_1_running_mean, &model->conv5_1_running_var); - NN_ReLU6(&model->conv5_2, &model->conv5_1); - NN_Conv2d( + NN_relu6(&model->conv5_2, &model->conv5_1); + NN_conv2d( &model->conv5_3, &model->conv5_2, &model->conv5_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv5_4, &model->conv5_3, &model->conv5_4_weight, &model->conv5_4_bias, 1e-05, &model->conv5_4_running_mean, &model->conv5_4_running_var); - NN_ReLU6(&model->conv5_5, &model->conv5_4); - NN_Conv2d( + NN_relu6(&model->conv5_5, &model->conv5_4); + NN_conv2d( &model->conv6_0, &model->conv5_5, &model->conv6_0_weight, NULL, (size_t[]){2, 2}, (size_t[]){1, 1}, (size_t[]){1, 1}, 256); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv6_1, &model->conv6_0, &model->conv6_1_weight, &model->conv6_1_bias, 1e-05, &model->conv6_1_running_mean, &model->conv6_1_running_var); - NN_ReLU6(&model->conv6_2, &model->conv6_1); - NN_Conv2d( + NN_relu6(&model->conv6_2, 
&model->conv6_1); + NN_conv2d( &model->conv6_3, &model->conv6_2, &model->conv6_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv6_4, &model->conv6_3, &model->conv6_4_weight, &model->conv6_4_bias, 1e-05, &model->conv6_4_running_mean, &model->conv6_4_running_var); - NN_ReLU6(&model->conv6_5, &model->conv6_4); - NN_Conv2d( + NN_relu6(&model->conv6_5, &model->conv6_4); + NN_conv2d( &model->conv7_0, &model->conv6_5, &model->conv7_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 408); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv7_1, &model->conv7_0, &model->conv7_1_weight, &model->conv7_1_bias, 1e-05, &model->conv7_1_running_mean, &model->conv7_1_running_var); - NN_ReLU6(&model->conv7_2, &model->conv7_1); - NN_Conv2d( + NN_relu6(&model->conv7_2, &model->conv7_1); + NN_conv2d( &model->conv7_3, &model->conv7_2, &model->conv7_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv7_4, &model->conv7_3, &model->conv7_4_weight, &model->conv7_4_bias, 1e-05, &model->conv7_4_running_mean, &model->conv7_4_running_var); - NN_ReLU6(&model->conv7_5, &model->conv7_4); - NN_Conv2d( + NN_relu6(&model->conv7_5, &model->conv7_4); + NN_conv2d( &model->conv8_0, &model->conv7_5, &model->conv8_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 376); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv8_1, &model->conv8_0, &model->conv8_1_weight, &model->conv8_1_bias, 1e-05, &model->conv8_1_running_mean, &model->conv8_1_running_var); - NN_ReLU6(&model->conv8_2, &model->conv8_1); - NN_Conv2d( + NN_relu6(&model->conv8_2, &model->conv8_1); + NN_conv2d( &model->conv8_3, &model->conv8_2, &model->conv8_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv8_4, &model->conv8_3, &model->conv8_4_weight, &model->conv8_4_bias, 1e-05, &model->conv8_4_running_mean, &model->conv8_4_running_var); - NN_ReLU6(&model->conv8_5, &model->conv8_4); - NN_Conv2d( + NN_relu6(&model->conv8_5, &model->conv8_4); + NN_conv2d( &model->conv9_0, &model->conv8_5, &model->conv9_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 272); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv9_1, &model->conv9_0, &model->conv9_1_weight, &model->conv9_1_bias, 1e-05, &model->conv9_1_running_mean, &model->conv9_1_running_var); - NN_ReLU6(&model->conv9_2, &model->conv9_1); - NN_Conv2d( + NN_relu6(&model->conv9_2, &model->conv9_1); + NN_conv2d( &model->conv9_3, &model->conv9_2, &model->conv9_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv9_4, &model->conv9_3, &model->conv9_4_weight, &model->conv9_4_bias, 1e-05, &model->conv9_4_running_mean, &model->conv9_4_running_var); - NN_ReLU6(&model->conv9_5, &model->conv9_4); - NN_Conv2d( + NN_relu6(&model->conv9_5, &model->conv9_4); + NN_conv2d( &model->conv10_0, &model->conv9_5, &model->conv10_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 288); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv10_1, &model->conv10_0, &model->conv10_1_weight, &model->conv10_1_bias, 1e-05, &model->conv10_1_running_mean, &model->conv10_1_running_var); - NN_ReLU6(&model->conv10_2, &model->conv10_1); - NN_Conv2d( + NN_relu6(&model->conv10_2, &model->conv10_1); + NN_conv2d( &model->conv10_3, &model->conv10_2, &model->conv10_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, 
(size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv10_4, &model->conv10_3, &model->conv10_4_weight, &model->conv10_4_bias, 1e-05, &model->conv10_4_running_mean, &model->conv10_4_running_var); - NN_ReLU6(&model->conv10_5, &model->conv10_4); - NN_Conv2d( + NN_relu6(&model->conv10_5, &model->conv10_4); + NN_conv2d( &model->conv11_0, &model->conv10_5, &model->conv11_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 296); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv11_1, &model->conv11_0, &model->conv11_1_weight, &model->conv11_1_bias, 1e-05, &model->conv11_1_running_mean, &model->conv11_1_running_var); - NN_ReLU6(&model->conv11_2, &model->conv11_1); - NN_Conv2d( + NN_relu6(&model->conv11_2, &model->conv11_1); + NN_conv2d( &model->conv11_3, &model->conv11_2, &model->conv11_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv11_4, &model->conv11_3, &model->conv11_4_weight, &model->conv11_4_bias, 1e-05, &model->conv11_4_running_mean, &model->conv11_4_running_var); - NN_ReLU6(&model->conv11_5, &model->conv11_4); - NN_Conv2d( + NN_relu6(&model->conv11_5, &model->conv11_4); + NN_conv2d( &model->conv12_0, &model->conv11_5, &model->conv12_0_weight, NULL, (size_t[]){2, 2}, (size_t[]){1, 1}, (size_t[]){1, 1}, 328); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv12_1, &model->conv12_0, &model->conv12_1_weight, &model->conv12_1_bias, 1e-05, &model->conv12_1_running_mean, &model->conv12_1_running_var); - NN_ReLU6(&model->conv12_2, &model->conv12_1); - NN_Conv2d( + NN_relu6(&model->conv12_2, &model->conv12_1); + NN_conv2d( &model->conv12_3, &model->conv12_2, &model->conv12_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv12_4, &model->conv12_3, &model->conv12_4_weight, &model->conv12_4_bias, 1e-05, &model->conv12_4_running_mean, &model->conv12_4_running_var); - NN_ReLU6(&model->conv12_5, &model->conv12_4); - NN_Conv2d( + NN_relu6(&model->conv12_5, &model->conv12_4); + NN_conv2d( &model->conv13_0, &model->conv12_5, &model->conv13_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 480); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv13_1, &model->conv13_0, &model->conv13_1_weight, &model->conv13_1_bias, 1e-05, &model->conv13_1_running_mean, &model->conv13_1_running_var); - NN_ReLU6(&model->conv13_2, &model->conv13_1); - NN_Conv2d( + NN_relu6(&model->conv13_2, &model->conv13_1); + NN_conv2d( &model->conv13_3, &model->conv13_2, &model->conv13_3_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->conv13_4, &model->conv13_3, &model->conv13_4_weight, &model->conv13_4_bias, 1e-05, &model->conv13_4_running_mean, &model->conv13_4_running_var); - NN_ReLU6(&model->conv13_5, &model->conv13_4); - NN_Conv2d( + NN_relu6(&model->conv13_5, &model->conv13_4); + NN_conv2d( &model->decode_conv1_0_0, &model->conv13_5, &model->decode_conv1_0_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 512); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv1_0_1, &model->decode_conv1_0_0, &model->decode_conv1_0_1_weight, &model->decode_conv1_0_1_bias, 1e-05, &model->decode_conv1_0_1_running_mean, &model->decode_conv1_0_1_running_var); - NN_ReLU(&model->decode_conv1_0_2, &model->decode_conv1_0_1); - NN_Conv2d( + NN_relu(&model->decode_conv1_0_2, &model->decode_conv1_0_1); + NN_conv2d( &model->decode_conv1_1_0, &model->decode_conv1_0_2, 
&model->decode_conv1_1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv1_1_1, &model->decode_conv1_1_0, &model->decode_conv1_1_1_weight, &model->decode_conv1_1_1_bias, 1e-05, &model->decode_conv1_1_1_running_mean, &model->decode_conv1_1_1_running_var); - NN_ReLU(&model->decode_conv1_1_2, &model->decode_conv1_1_1); + NN_relu(&model->decode_conv1_1_2, &model->decode_conv1_1_1); // F.interpolate NN_interpolate(&model->interpolate, &model->decode_conv1_1_2, (float []){2, 2}); - NN_Conv2d( + NN_conv2d( &model->decode_conv2_0_0, &model->interpolate, &model->decode_conv2_0_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 200); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv2_0_1, &model->decode_conv2_0_0, &model->decode_conv2_0_1_weight, &model->decode_conv2_0_1_bias, 1e-05, &model->decode_conv2_0_1_running_mean, &model->decode_conv2_0_1_running_var); - NN_ReLU(&model->decode_conv2_0_2, &model->decode_conv2_0_1); - NN_Conv2d( + NN_relu(&model->decode_conv2_0_2, &model->decode_conv2_0_1); + NN_conv2d( &model->decode_conv2_1_0, &model->decode_conv2_0_2, &model->decode_conv2_1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv2_1_1, &model->decode_conv2_1_0, &model->decode_conv2_1_1_weight, &model->decode_conv2_1_1_bias, 1e-05, &model->decode_conv2_1_1_running_mean, &model->decode_conv2_1_1_running_var); - NN_ReLU(&model->decode_conv2_1_2, &model->decode_conv2_1_1); + NN_relu(&model->decode_conv2_1_2, &model->decode_conv2_1_1); // F.interpolate_1 NN_interpolate(&model->interpolate_1, &model->decode_conv2_1_2, (float []){2, 2}); // F.add NN_add(&model->add, &model->interpolate_1, &model->conv5_5); - NN_Conv2d( + NN_conv2d( &model->decode_conv3_0_0, &model->add, &model->decode_conv3_0_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 256); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv3_0_1, &model->decode_conv3_0_0, &model->decode_conv3_0_1_weight, &model->decode_conv3_0_1_bias, 1e-05, &model->decode_conv3_0_1_running_mean, &model->decode_conv3_0_1_running_var); - NN_ReLU(&model->decode_conv3_0_2, &model->decode_conv3_0_1); - NN_Conv2d( + NN_relu(&model->decode_conv3_0_2, &model->decode_conv3_0_1); + NN_conv2d( &model->decode_conv3_1_0, &model->decode_conv3_0_2, &model->decode_conv3_1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv3_1_1, &model->decode_conv3_1_0, &model->decode_conv3_1_1_weight, &model->decode_conv3_1_1_bias, 1e-05, &model->decode_conv3_1_1_running_mean, &model->decode_conv3_1_1_running_var); - NN_ReLU(&model->decode_conv3_1_2, &model->decode_conv3_1_1); + NN_relu(&model->decode_conv3_1_2, &model->decode_conv3_1_1); // F.interpolate_2 NN_interpolate(&model->interpolate_2, &model->decode_conv3_1_2, (float []){2, 2}); // F.add_1 NN_add(&model->add_1, &model->interpolate_2, &model->conv3_5); - NN_Conv2d( + NN_conv2d( &model->decode_conv4_0_0, &model->add_1, &model->decode_conv4_0_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 120); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv4_0_1, &model->decode_conv4_0_0, &model->decode_conv4_0_1_weight, &model->decode_conv4_0_1_bias, 1e-05, &model->decode_conv4_0_1_running_mean, &model->decode_conv4_0_1_running_var); - NN_ReLU(&model->decode_conv4_0_2, &model->decode_conv4_0_1); - NN_Conv2d( + 
NN_relu(&model->decode_conv4_0_2, &model->decode_conv4_0_1); + NN_conv2d( &model->decode_conv4_1_0, &model->decode_conv4_0_2, &model->decode_conv4_1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv4_1_1, &model->decode_conv4_1_0, &model->decode_conv4_1_1_weight, &model->decode_conv4_1_1_bias, 1e-05, &model->decode_conv4_1_1_running_mean, &model->decode_conv4_1_1_running_var); - NN_ReLU(&model->decode_conv4_1_2, &model->decode_conv4_1_1); + NN_relu(&model->decode_conv4_1_2, &model->decode_conv4_1_1); // F.interpolate_3 NN_interpolate(&model->interpolate_3, &model->decode_conv4_1_2, (float []){2, 2}); // F.add_2 NN_add(&model->add_2, &model->interpolate_3, &model->conv1_5); - NN_Conv2d( + NN_conv2d( &model->decode_conv5_0_0, &model->add_2, &model->decode_conv5_0_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){2, 2}, (size_t[]){1, 1}, 56); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv5_0_1, &model->decode_conv5_0_0, &model->decode_conv5_0_1_weight, &model->decode_conv5_0_1_bias, 1e-05, &model->decode_conv5_0_1_running_mean, &model->decode_conv5_0_1_running_var); - NN_ReLU(&model->decode_conv5_0_2, &model->decode_conv5_0_1); - NN_Conv2d( + NN_relu(&model->decode_conv5_0_2, &model->decode_conv5_0_1); + NN_conv2d( &model->decode_conv5_1_0, &model->decode_conv5_0_2, &model->decode_conv5_1_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv5_1_1, &model->decode_conv5_1_0, &model->decode_conv5_1_1_weight, &model->decode_conv5_1_1_bias, 1e-05, &model->decode_conv5_1_1_running_mean, &model->decode_conv5_1_1_running_var); - NN_ReLU(&model->decode_conv5_1_2, &model->decode_conv5_1_1); + NN_relu(&model->decode_conv5_1_2, &model->decode_conv5_1_1); // F.interpolate_4 NN_interpolate(&model->interpolate_4, &model->decode_conv5_1_2, (float []){2, 2}); - NN_Conv2d( + NN_conv2d( &model->decode_conv6_0, &model->interpolate_4, &model->decode_conv6_0_weight, NULL, (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1); - NN_BatchNorm2d( + NN_batch_norm2d( &model->decode_conv6_1, &model->decode_conv6_0, &model->decode_conv6_1_weight, &model->decode_conv6_1_bias, 1e-05, &model->decode_conv6_1_running_mean, &model->decode_conv6_1_running_var); - NN_ReLU(&model->decode_conv6_2, &model->decode_conv6_1); + NN_relu(&model->decode_conv6_2, &model->decode_conv6_1); } diff --git a/example/fast-depth/termimg.h b/example/fast-depth/termimg.h deleted file mode 100644 index 1dd1aa4..0000000 --- a/example/fast-depth/termimg.h +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "nn_tensor.h" - - -const size_t n_mapping = 92; - -const uint8_t ascii_map[] = {0X20, 0X60, 0X2E, 0X2D, 0X27, 0X3A, 0X5F, 0X2C, 0X5E, 0X3D, 0X3B, 0X3E, 0X3C, 0X2B, 0X21, 0X72, 0X63, 0X2A, 0X2F, 0X7A, 0X3F, 0X73, 0X4C, 0X54, 0X76, 0X29, 0X4A, 0X37, 0X28, 0X7C, 0X46, 0X69, 0X7B, 0X43, 0X7D, 0X66, 0X49, 0X33, 0X31, 0X74, 0X6C, 0X75, 0X5B, 0X6E, 0X65, 0X6F, 0X5A, 0X35, 0X59, 0X78, 0X6A, 0X79, 0X61, 0X5D, 0X32, 0X45, 0X53, 0X77, 0X71, 0X6B, 0X50, 0X36, 0X68, 0X39, 0X64, 0X34, 0X56, 0X70, 0X4F, 0X47, 0X62, 0X55, 0X41, 0X4B, 0X58, 0X48, 0X6D, 0X38, 0X52, 0X44, 0X23, 0X24, 0X42, 0X67, 0X30, 0X4D, 0X4E, 0X57, 0X51, 0X25, 0X26, 0X40}; - -const float brightness_map[] = {0, 0.0751, 0.0829, 0.0848, 0.1227, 0.1403, 0.1559, 0.185, 0.2183, 0.2417, 0.2571, 0.2852, 0.2902, 0.2919, 0.3099, 0.3192, 0.3232, 0.3294, 0.3384, 0.3609, 0.3619, 0.3667, 0.3737, 
0.3747, 0.3838, 0.3921, 0.396, 0.3984, 0.3993, 0.4075, 0.4091, 0.4101, 0.42, 0.423, 0.4247, 0.4274, 0.4293, 0.4328, 0.4382, 0.4385, 0.442, 0.4473, 0.4477, 0.4503, 0.4562, 0.458, 0.461, 0.4638, 0.4667, 0.4686, 0.4693, 0.4703, 0.4833, 0.4881, 0.4944, 0.4953, 0.4992, 0.5509, 0.5567, 0.5569, 0.5591, 0.5602, 0.5602, 0.565, 0.5776, 0.5777, 0.5818, 0.587, 0.5972, 0.5999, 0.6043, 0.6049, 0.6093, 0.6099, 0.6465, 0.6561, 0.6595, 0.6631, 0.6714, 0.6759, 0.6809, 0.6816, 0.6925, 0.7039, 0.7086, 0.7235, 0.7302, 0.7332, 0.7602, 0.7834, 0.8037, 0.9999}; - -void showASCIIImage(Tensor *tensor) { - assert(tensor->ndim == 4); - assert(tensor->shape[0] == 1); - assert(tensor->shape[3] == 1); - float min = 1000; - float max = -1000; - for (size_t h = 0; h < tensor->shape[1]; h += 1) { - for (size_t w = 0; w < tensor->shape[2]; w += 1) { - float pixel_value = ((float *)tensor->data)[h * tensor->shape[2] + w]; - if (pixel_value < min) { - min = pixel_value; - } - if (pixel_value > max) { - max = pixel_value; - } - } - } - - for (size_t h = 0; h < tensor->shape[1]; h += 1) { - for (size_t w = 0; w < tensor->shape[2]; w += 1) { - float pixel_value = ((float *)tensor->data)[h * tensor->shape[2] + w]; - - // normalize the pixel value to the range [0, 1] - pixel_value = (pixel_value - min) / (max - min); - - // find the closest brightness value in the brightness_map - size_t brightness_index = 0; - for (size_t i = 0; i < n_mapping; i += 1) { - if (pixel_value < brightness_map[i]) { - break; - } - brightness_index = i; - } - - // find the corresponding ASCII character - uint8_t ascii = ascii_map[brightness_index]; - printf("%c", ascii); - } - printf("\n"); - } -} diff --git a/example/llama2/README.md b/example/llama2/README.md index 085a2f8..84c0f83 100644 --- a/example/llama2/README.md +++ b/example/llama2/README.md @@ -1 +1,32 @@ -make && spike --isa=rv64gcv_zicntr --varch=vlen:512,elen:32 --misaligned ./llama2 \ No newline at end of file +make && spike --isa=rv64gcv_zicntr --varch=vlen:512,elen:32 --misaligned ./llama2 + + +```bash +wget -P checkpoints/ https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin +``` + + + + +# Performance Benchmark + +Native impl + +``` +[100%] Built target llama2 +Llama 2: a small transformer model for text generation +forward taking 458394741 cycles +Once +forward taking 458545220 cycles + upon +forward taking 458648904 cycles + a +forward taking 458744673 cycles + time +forward taking 458850691 cycles +, +forward taking 458942021 cycles + there +forward taking 459044968 cycles + was +``` \ No newline at end of file diff --git a/example/llama2/main.c b/example/llama2/main.c index f94fd19..0e48cdd 100644 --- a/example/llama2/main.c +++ b/example/llama2/main.c @@ -13,6 +13,7 @@ // Transformer model #include "nn.h" +#include "rv.h" // load the weight data block from the model.bin file @@ -173,7 +174,7 @@ void read_checkpoint(Config* config, TransformerWeights* weights, // *data = mmap(NULL, *file_size, PROT_READ, MAP_PRIVATE, *fd, 0); // if (*data == MAP_FAILED) { fprintf(stderr, "mmap failed!\n"); exit(EXIT_FAILURE); } - *data = checkpoint_data; + *data = (float *)checkpoint_data; float* weights_ptr = *data + sizeof(Config)/sizeof(float); memory_map_weights(weights, config, weights_ptr, shared_weights); @@ -213,52 +214,50 @@ void rmsnorm(float* o, float* x, float* weight, int size) { } void softmax(float* x, int size) { - // // find max value (for numerical stability) - // float max_val = x[0]; - // for (int i = 1; i < size; i++) { - // if (x[i] > max_val) { - // max_val 
= x[i]; - // } - // } - // // exp and sum - // float sum = 0.0f; - // for (int i = 0; i < size; i++) { - // x[i] = expf(x[i] - max_val); - // sum += x[i]; - // } - // // normalize - // for (int i = 0; i < size; i++) { - // x[i] /= sum; - // } - Tensor *out = NN_tensor(2, (size_t[]){1, size}, DTYPE_F32, x); - NN_Softmax(out, out, 0); + // find max value (for numerical stability) + float max_val = x[0]; + for (int i = 1; i < size; i++) { + if (x[i] > max_val) { + max_val = x[i]; + } + } + // exp and sum + float sum = 0.0f; + for (int i = 0; i < size; i++) { + x[i] = expf(x[i] - max_val); + sum += x[i]; + } + // normalize + for (int i = 0; i < size; i++) { + x[i] /= sum; + } + // Tensor *out = NN_tensor(2, (size_t[]){1, size}, DTYPE_F32, x); + // NN_softmax(out, out, 1); } void matmul(float* xout, float* x, float* w, int n, int d) { // W (d,n) @ x (n,) -> xout (d,) // by far the most amount of time is spent inside this little function - Tensor *out = NN_tensor(2, (size_t[]){d, 1}, DTYPE_F32, xout); - Tensor *a = NN_tensor(2, (size_t[]){d, n}, DTYPE_F32, w); - Tensor *b = NN_tensor(2, (size_t[]){1, n}, DTYPE_F32, x); - - - NN_matmulT(out, a, b); + + int i; + #pragma omp parallel for private(i) + for (i = 0; i < d; i++) { + float val = 0.0f; + for (int j = 0; j < n; j++) { + val += w[i * n + j] * x[j]; + } + xout[i] = val; + } + // Tensor *out = NN_tensor(2, (size_t[]){d, 1}, DTYPE_F32, xout); + // Tensor *a = NN_tensor(2, (size_t[]){d, n}, DTYPE_F32, w); + // Tensor *b = NN_tensor(2, (size_t[]){1, n}, DTYPE_F32, x); - // int i; - // #pragma omp parallel for private(i) - // for (i = 0; i < d; i++) { - // float val = 0.0f; - // for (int j = 0; j < n; j++) { - // val += w[i * n + j] * x[j]; - // } - // xout[i] = val; - // } + // NN_matmul_t(out, a, b); } float* forward(Transformer* transformer, int token, int pos) { - // a few convenience variables Config* p = &transformer->config; TransformerWeights* w = &transformer->weights; @@ -286,6 +285,13 @@ float* forward(Transformer* transformer, int token, int pos) { s->v = s->value_cache + loff + pos * kv_dim; // qkv matmuls for this position + + // Tensor *tensor_s_q = NN_tensor(2, (size_t[]){dim, 1}, DTYPE_F32, s->q); + // Tensor *tensor_s_xb = NN_tensor(2, (size_t[]){dim, dim}, DTYPE_F32, s->xb); + // Tensor *tensor_w_wq = NN_tensor(2, (size_t[]){1, dim}, DTYPE_F32, w->wq + l*dim*dim); + + // NN_matmul_t(tensor_s_q, tensor_s_xb, tensor_w_wq); + matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim); matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim); matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim); @@ -330,7 +336,10 @@ float* forward(Transformer* transformer, int token, int pos) { } // softmax the scores to get attention weights, from 0..pos inclusively - softmax(att, pos + 1); + // softmax(att, pos + 1); + + Tensor *att_tensor = NN_tensor(2, (size_t[]){1, pos + 1}, DTYPE_F32, att); + NN_softmax(att_tensor, att_tensor, 1); // weighted sum of the values, store back into xb float* xb = s->xb + h * head_size; @@ -747,7 +756,11 @@ int sample(Sampler* sampler, float* logits) { // apply the temperature to the logits for (int q=0; qvocab_size; q++) { logits[q] /= sampler->temperature; } // apply softmax to the logits to get the probabilities for next token - softmax(logits, sampler->vocab_size); + // softmax(logits, sampler->vocab_size); + + Tensor *logits_tensor = NN_tensor(2, (size_t[]){1, sampler->vocab_size}, DTYPE_F32, logits); + NN_softmax(logits_tensor, logits_tensor, 1); + // flip a (float) coin (this is our source of entropy for sampling) 
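The attention-score and logits changes above share one pattern: wrap an existing float buffer in a 1 x N tensor view with NN_tensor (no copy is made) and run NN_softmax on that view in place along dimension 1, instead of the hand-rolled max/exp/normalize loop. A minimal sketch of the pattern, assuming only the NN_tensor and NN_softmax signatures already used in this hunk (the helper name is illustrative):

```c
#include <stddef.h>
#include "nn.h"  // assumed to declare Tensor, DTYPE_F32, NN_tensor, NN_softmax

// Sketch: in-place softmax over a caller-owned float buffer via a tensor view.
// NN_tensor records shape/dtype and aliases `scores`, so the normalized
// probabilities land back in the original array.
static void softmax_inplace(float *scores, size_t n) {
  Tensor *view = NN_tensor(2, (size_t[]){1, n}, DTYPE_F32, scores);
  NN_softmax(view, view, 1);  // dim 1 is the length-n axis, as in this diff
  // Note: the hunks above create these temporary views inside hot loops and
  // never delete them; whether NN_delete_tensor is required here depends on
  // how nn.h manages the view struct.
}
```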
float coin = random_f32(&sampler->rng_state); // we sample from this distribution to get the next token @@ -796,7 +809,10 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, while (pos < steps) { // forward the transformer to get logits for the next token + size_t cycles = READ_CSR("mcycle"); float* logits = forward(transformer, token, pos); + cycles = READ_CSR("mcycle") - cycles; + printf("forward taking %d cycles\n", cycles); // advance the state machine if (pos < num_prompt_tokens - 1) { diff --git a/example/mlp/model.h b/example/mlp/model.h index 7edefba..7e8021b 100644 --- a/example/mlp/model.h +++ b/example/mlp/model.h @@ -41,44 +41,44 @@ void forward(Model *model); void init(Model *model) { float *array_pointer = (float *)model_weight_data; - NN_initTensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL); + NN_init_tensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL); // : actor_0 - NN_initTensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer); array_pointer += 24576; - NN_initTensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer); array_pointer += 512; - NN_initTensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); // : actor_1 - NN_initTensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL); // : actor_2 - NN_initTensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer); array_pointer += 131072; - NN_initTensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer); array_pointer += 256; - NN_initTensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); // : actor_3 - NN_initTensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL); // : actor_4 - NN_initTensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer); array_pointer += 32768; - NN_initTensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer); array_pointer += 128; - NN_initTensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); // : actor_5 - NN_initTensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL); // : actor_6 - NN_initTensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer); array_pointer += 1536; - NN_initTensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer); + NN_init_tensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer); array_pointer += 12; - NN_initTensor(&model->actor_6, 
2, (size_t[]){1, 12}, DTYPE_F32, NULL); + NN_init_tensor(&model->actor_6, 2, (size_t[]){1, 12}, DTYPE_F32, NULL); } @@ -87,13 +87,13 @@ void init(Model *model) { * Forward pass of the model */ void forward(Model *model) { - NN_Linear(&model->actor_0, &model->input_1, &model->actor_0_weight, &model->actor_0_bias); - NN_ELU(&model->actor_1, &model->actor_0, 1.0); - NN_Linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias); - NN_ELU(&model->actor_3, &model->actor_2, 1.0); - NN_Linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias); - NN_ELU(&model->actor_5, &model->actor_4, 1.0); - NN_Linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias); + NN_linear(&model->actor_0, &model->input_1, &model->actor_0_weight, &model->actor_0_bias); + NN_elu(&model->actor_1, &model->actor_0, 1.0); + NN_linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias); + NN_elu(&model->actor_3, &model->actor_2, 1.0); + NN_linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias); + NN_elu(&model->actor_5, &model->actor_4, 1.0); + NN_linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias); } diff --git a/example/mnist/main.c b/example/mnist/main.c index f89f016..c86aa00 100644 --- a/example/mnist/main.c +++ b/example/mnist/main.c @@ -55,19 +55,19 @@ typedef struct { } Model; void init(Model *model) { - NN_initTensor(&model->input, 2, (size_t[]){ 1, 10 }, DTYPE_F32, (float *)malloc(10 * sizeof(float))); + NN_init_tensor(&model->input, 2, (size_t[]){ 1, 10 }, DTYPE_F32, (float *)malloc(10 * sizeof(float))); - NN_initTensor(&model->fc1_weight, 2, (size_t[]){ 10, 15 }, DTYPE_F32, (float *)fc1_weight_data); - NN_initTensor(&model->fc1_bias, 2, (size_t[]){ 1, 15 }, DTYPE_F32, (float *)fc1_bias_data); - NN_initTensor(&model->fc1_out, 2, (size_t[]){ 1, 15 }, DTYPE_F32, (float *)malloc(15 * sizeof(float))); + NN_init_tensor(&model->fc1_weight, 2, (size_t[]){ 10, 15 }, DTYPE_F32, (float *)fc1_weight_data); + NN_init_tensor(&model->fc1_bias, 2, (size_t[]){ 1, 15 }, DTYPE_F32, (float *)fc1_bias_data); + NN_init_tensor(&model->fc1_out, 2, (size_t[]){ 1, 15 }, DTYPE_F32, (float *)malloc(15 * sizeof(float))); - NN_initTensor(&model->fc2_weight, 2, (size_t[]){ 15, 20 }, DTYPE_F32, (float *)fc2_weight_data); - NN_initTensor(&model->fc2_bias, 2, (size_t[]){ 1, 20 }, DTYPE_F32, (float *)fc2_bias_data); - NN_initTensor(&model->fc2_out, 2, (size_t[]){ 1, 20 }, DTYPE_F32, (float *)malloc(20 * sizeof(float))); + NN_init_tensor(&model->fc2_weight, 2, (size_t[]){ 15, 20 }, DTYPE_F32, (float *)fc2_weight_data); + NN_init_tensor(&model->fc2_bias, 2, (size_t[]){ 1, 20 }, DTYPE_F32, (float *)fc2_bias_data); + NN_init_tensor(&model->fc2_out, 2, (size_t[]){ 1, 20 }, DTYPE_F32, (float *)malloc(20 * sizeof(float))); - NN_initTensor(&model->fc3_weight, 2, (size_t[]){ 20, 5 }, DTYPE_F32, (float *)fc3_weight_data); - NN_initTensor(&model->fc3_bias, 2, (size_t[]){ 1, 5 }, DTYPE_F32, (float *)fc3_bias_data); - NN_initTensor(&model->output, 2, (size_t[]){ 1, 5 }, DTYPE_F32, (float *)malloc(5 * sizeof(float))); + NN_init_tensor(&model->fc3_weight, 2, (size_t[]){ 20, 5 }, DTYPE_F32, (float *)fc3_weight_data); + NN_init_tensor(&model->fc3_bias, 2, (size_t[]){ 1, 5 }, DTYPE_F32, (float *)fc3_bias_data); + NN_init_tensor(&model->output, 2, (size_t[]){ 1, 5 }, DTYPE_F32, (float *)malloc(5 * sizeof(float))); } void forward(Model *model) { diff --git a/example/ppo/main.c 
b/example/ppo/main.c index c0a5540..d290f89 100644 --- a/example/ppo/main.c +++ b/example/ppo/main.c @@ -46,32 +46,32 @@ typedef struct { void init(Model *model) { uint8_t *array_pointer = externdata; - NN_initTensor(&model->input, 2, (size_t[]){ 1, N_OBS }, DTYPE_F32, (float *)malloc(N_OBS * sizeof(float))); + NN_init_tensor(&model->input, 2, (size_t[]){ 1, N_OBS }, DTYPE_F32, (float *)malloc(N_OBS * sizeof(float))); - NN_initTensor(&model->fc1_weight, 2, (size_t[]){ N_OBS, FC1_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc1_weight, 2, (size_t[]){ N_OBS, FC1_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += N_OBS * FC1_SIZE * sizeof(float); - NN_initTensor(&model->fc1_bias, 2, (size_t[]){ 1, FC1_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc1_bias, 2, (size_t[]){ 1, FC1_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC1_SIZE * sizeof(float); - NN_initTensor(&model->fc1_out, 2, (size_t[]){ 1, FC1_SIZE }, DTYPE_F32, (float *)malloc(FC1_SIZE * sizeof(float))); + NN_init_tensor(&model->fc1_out, 2, (size_t[]){ 1, FC1_SIZE }, DTYPE_F32, (float *)malloc(FC1_SIZE * sizeof(float))); - NN_initTensor(&model->fc2_weight, 2, (size_t[]){ FC1_SIZE, FC2_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc2_weight, 2, (size_t[]){ FC1_SIZE, FC2_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC1_SIZE * FC2_SIZE * sizeof(float); - NN_initTensor(&model->fc2_bias, 2, (size_t[]){ 1, FC2_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc2_bias, 2, (size_t[]){ 1, FC2_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC2_SIZE * sizeof(float); - NN_initTensor(&model->fc2_out, 2, (size_t[]){ 1, FC2_SIZE }, DTYPE_F32, (float *)malloc(FC2_SIZE * sizeof(float))); + NN_init_tensor(&model->fc2_out, 2, (size_t[]){ 1, FC2_SIZE }, DTYPE_F32, (float *)malloc(FC2_SIZE * sizeof(float))); - NN_initTensor(&model->fc3_weight, 2, (size_t[]){ FC2_SIZE, FC3_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc3_weight, 2, (size_t[]){ FC2_SIZE, FC3_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC2_SIZE * FC3_SIZE * sizeof(float); - NN_initTensor(&model->fc3_bias, 2, (size_t[]){ 1, FC3_SIZE }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc3_bias, 2, (size_t[]){ 1, FC3_SIZE }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC3_SIZE * sizeof(float); - NN_initTensor(&model->fc3_out, 2, (size_t[]){ 1, FC3_SIZE }, DTYPE_F32, (float *)malloc(FC3_SIZE * sizeof(float))); + NN_init_tensor(&model->fc3_out, 2, (size_t[]){ 1, FC3_SIZE }, DTYPE_F32, (float *)malloc(FC3_SIZE * sizeof(float))); - NN_initTensor(&model->fc4_weight, 2, (size_t[]){ FC3_SIZE, N_ACS }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc4_weight, 2, (size_t[]){ FC3_SIZE, N_ACS }, DTYPE_F32, (float *)(array_pointer)); array_pointer += FC3_SIZE * N_ACS * sizeof(float); printf("ptr: %d\n", (int)array_pointer - (int)externdata_start); - NN_initTensor(&model->fc4_bias, 2, (size_t[]){ 1, N_ACS }, DTYPE_F32, (float *)(array_pointer)); + NN_init_tensor(&model->fc4_bias, 2, (size_t[]){ 1, N_ACS }, DTYPE_F32, (float *)(array_pointer)); array_pointer += N_ACS * sizeof(float); - NN_initTensor(&model->output, 2, (size_t[]){ 1, N_ACS }, DTYPE_F32, (float *)malloc(N_ACS * sizeof(float))); + NN_init_tensor(&model->output, 2, (size_t[]){ 1, N_ACS }, DTYPE_F32, (float *)malloc(N_ACS * sizeof(float))); printf("fc4_bias: \n"); NN_printf(&model->fc4_bias); diff 
--git a/example/simple/main.c b/example/simple/main.c index 14e67f0..0693fe4 100644 --- a/example/simple/main.c +++ b/example/simple/main.c @@ -30,10 +30,10 @@ Tensor D; * Initialize the required tensors for the model */ void init(Tensor *A, Tensor *B, Tensor *C, Tensor *D) { - NN_initTensor(A, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)malloc(9 * sizeof(float))); - NN_initTensor(B, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)(weights_data + 3 * sizeof(float))); - NN_initTensor(C, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)malloc(9 * sizeof(float))); - NN_initTensor(D, 1, (size_t[]){3}, DTYPE_F32, (float *)(weights_data + 0 * sizeof(float))); + NN_init_tensor(A, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)malloc(9 * sizeof(float))); + NN_init_tensor(B, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)(weights_data + 3 * sizeof(float))); + NN_init_tensor(C, 2, (size_t[]){3, 3}, DTYPE_F32, (float *)malloc(9 * sizeof(float))); + NN_init_tensor(D, 1, (size_t[]){3}, DTYPE_F32, (float *)(weights_data + 0 * sizeof(float))); } /** diff --git a/example/stereo-block-matching/CMakeLists.txt b/example/stereo-block-matching/CMakeLists.txt index 507352d..5ec929a 100644 --- a/example/stereo-block-matching/CMakeLists.txt +++ b/example/stereo-block-matching/CMakeLists.txt @@ -4,7 +4,7 @@ project(stereoBM LANGUAGES C) add_executable(stereoBM main.c) -target_include_directories(stereoBM PUBLIC inc) +target_include_directories(stereoBM PUBLIC inc ../utility) target_compile_features(stereoBM INTERFACE c_std_11) diff --git a/example/stereo-block-matching/main.c b/example/stereo-block-matching/main.c index ed15369..0ac656d 100644 --- a/example/stereo-block-matching/main.c +++ b/example/stereo-block-matching/main.c @@ -60,13 +60,8 @@ Tensor* compute_dispartiy(Tensor *left, Tensor *right, int min_disparity, int ma int sad_iop = 0; - signed char *disparity = (signed char *)calloc(s_w*s_h, sizeof(signed char)); - if (!disparity) { - printf("Error: Memory allocation failed\n"); - return NULL; - } - - + Tensor *disparity_img = NN_zeros(4, (const size_t[]){1, s_h, s_w, 1}, DTYPE_U8); + Tensor *left_block = NN_tensor(2, (const size_t[]){1, 2*half_block_size}, DTYPE_U8, (uint8_t *)left->data); Tensor *right_block = NN_tensor(2, (const size_t[]){1, 2*half_block_size}, DTYPE_U8, (uint8_t *)right->data); Tensor *left_block_signed = NN_tensor(2, (const size_t[]){1, 2*half_block_size}, DTYPE_U32, NULL); @@ -75,26 +70,20 @@ Tensor* compute_dispartiy(Tensor *left, Tensor *right, int min_disparity, int ma Tensor *diff_wide = NN_tensor(2, (const size_t[]){1, 2*half_block_size}, DTYPE_I32, NULL); Tensor *out = NN_tensor(1, (const size_t[]){1}, DTYPE_I32, NULL); - // Tensor *left_block = NN_tensor(2, (const size_t[]){1, 1}, DTYPE_U8, (uint8_t *)left->data); - // Tensor *right_block = NN_tensor(2, (const size_t[]){1, 1}, DTYPE_U8, (uint8_t *)right->data); - // Tensor *left_block_signed = NN_tensor(2, (const size_t[]){1, 1}, DTYPE_U32, NULL); - // Tensor *right_block_signed = NN_tensor(2, (const size_t[]){1, 1}, DTYPE_U32, NULL); - // Tensor *diff = NN_tensor(2, (const size_t[]){1, 1}, DTYPE_I32, NULL); - // Tensor *out = NN_tensor(1, (const size_t[]){1}, DTYPE_I32, NULL); - // compute disparity // outer loop iterating over blocks for (int i = half_block_size; i < height-half_block_size; i += 1) { printf("i: %d / %d\n", i, height-half_block_size); for (int j = half_block_size - min_disparity; j < width-half_block_size - max_disparity; j += 1) { - // printf("j: %d / %d\n", j, width-half_block_size - max_disparity); // middle loop per block min_SAD = 
INT32_MAX; - for (int offset = min_disparity; offsetdata)[row * width + col] - ((uint8_t *)right->data)[row * width + col + offset])); @@ -103,71 +92,49 @@ Tensor* compute_dispartiy(Tensor *left, Tensor *right, int min_disparity, int ma // } // } - // tensor version - size_t row = i - half_block_size; size_t col = j - half_block_size; for (size_t row = i - half_block_size; row < half_block_size + i; row += 1) { - // for (size_t col = j - half_block_size; col < half_block_size + j; col += 1) { left_block->data = ((uint8_t *)left->data) + row*width + col; right_block->data = ((uint8_t *)right->data) + row*width + col + offset; NN_sub(diff, left_block, right_block); - // // NN_printf(diff); - diff->dtype = DTYPE_I8; - - NN_asType(diff_wide, diff); + NN_copy(diff_wide, diff); diff->dtype = DTYPE_U8; - NN_absInplace(diff_wide); - - // NN_printf(diff); + NN_abs_inplace(diff_wide); NN_sum(out, diff_wide); SAD += ((int32_t *)out->data)[0]; - - // } } - // printf("SAD: %d\n", SAD); - // return NULL; - - // reduction step if (SAD < min_SAD) { - // for debugging - // if (i == half_block_size) { - // printf("Updated min_SAD: %x, SAD: %x, j: %d, offset: %d\n", min_SAD, SAD, j, offset); - // } min_SAD = SAD; - disparity[(i-half_block_size)*(s_w)+j-half_block_size] = offset; + ((uint8_t *)disparity_img->data)[(i-half_block_size)*(s_w)+j-half_block_size] = offset; } } - // if (j > half_block_size - min_disparity + 2) - // return NULL; } } - NN_freeTensorData(left_block_signed); - NN_freeTensorData(right_block_signed); - NN_freeTensorData(diff); - NN_freeTensorData(out); - NN_deleteTensor(left_block_signed); - NN_deleteTensor(right_block_signed); - NN_deleteTensor(diff); - NN_deleteTensor(out); - NN_deleteTensor(left_block); - NN_deleteTensor(right_block); + NN_free_tensor_data(left_block_signed); + NN_free_tensor_data(right_block_signed); + NN_free_tensor_data(diff); + NN_free_tensor_data(out); + NN_delete_tensor(left_block_signed); + NN_delete_tensor(right_block_signed); + NN_delete_tensor(diff); + NN_delete_tensor(out); + NN_delete_tensor(left_block); + NN_delete_tensor(right_block); printf("SAD IOPs: %d\n", sad_iop); - - Tensor *disparity_image = NN_tensor(4, (const size_t[]){1, s_h, s_w, 1}, DTYPE_U8, disparity); - return disparity_image; + return disparity_img; } int main() { @@ -178,27 +145,21 @@ int main() { Tensor *right_image = NN_tensor(4, (const size_t[]){1, IMG_HEIGHT, IMG_WIDTH, 1}, DTYPE_U8, right_data); size_t cycles = READ_CSR("cycle"); - - Tensor *disparity_image = compute_dispartiy(left_image, right_image, 0, 32, 4); - + Tensor *disparity_img = compute_dispartiy(left_image, right_image, 0, 32, 4); cycles = READ_CSR("cycle") - cycles; + printf("Cycles: %lu\n", cycles); // Save the disparity image - - // write only the data - printf("printing result\n"); - NN_printShape(disparity_image); + printf("Result:\n"); + NN_print_shape(disparity_img); printf("\n"); - Tensor *img_small = NN_zeros(4, (const size_t[]){1, disparity_image->shape[1] / 4, disparity_image->shape[2] / 2, 1}, DTYPE_U8); - - - - NN_interpolate(img_small, disparity_image, (float []){0.25, 0.5}); + Tensor *disparity_img_small = NN_zeros(4, (const size_t[]){1, disparity_img->shape[1] / 4, disparity_img->shape[2] / 2, 1}, DTYPE_U8); + NN_interpolate(disparity_img_small, disparity_img, (float []){0.25, 0.5}); - // showASCIIImage(img_small); + show_ASCII_image(disparity_img_small, 0, 32); return 0; } diff --git a/example/stereo-block-matching/termimg.h b/example/stereo-block-matching/termimg.h deleted file mode 100644 index 
930d9aa..0000000 --- a/example/stereo-block-matching/termimg.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "nn_tensor.h" - - -const size_t n_mapping = 92; - -const uint8_t ascii_map[] = {0X20, 0X60, 0X2E, 0X2D, 0X27, 0X3A, 0X5F, 0X2C, 0X5E, 0X3D, 0X3B, 0X3E, 0X3C, 0X2B, 0X21, 0X72, 0X63, 0X2A, 0X2F, 0X7A, 0X3F, 0X73, 0X4C, 0X54, 0X76, 0X29, 0X4A, 0X37, 0X28, 0X7C, 0X46, 0X69, 0X7B, 0X43, 0X7D, 0X66, 0X49, 0X33, 0X31, 0X74, 0X6C, 0X75, 0X5B, 0X6E, 0X65, 0X6F, 0X5A, 0X35, 0X59, 0X78, 0X6A, 0X79, 0X61, 0X5D, 0X32, 0X45, 0X53, 0X77, 0X71, 0X6B, 0X50, 0X36, 0X68, 0X39, 0X64, 0X34, 0X56, 0X70, 0X4F, 0X47, 0X62, 0X55, 0X41, 0X4B, 0X58, 0X48, 0X6D, 0X38, 0X52, 0X44, 0X23, 0X24, 0X42, 0X67, 0X30, 0X4D, 0X4E, 0X57, 0X51, 0X25, 0X26, 0X40}; - -const float brightness_map[] = {0, 0.0751, 0.0829, 0.0848, 0.1227, 0.1403, 0.1559, 0.185, 0.2183, 0.2417, 0.2571, 0.2852, 0.2902, 0.2919, 0.3099, 0.3192, 0.3232, 0.3294, 0.3384, 0.3609, 0.3619, 0.3667, 0.3737, 0.3747, 0.3838, 0.3921, 0.396, 0.3984, 0.3993, 0.4075, 0.4091, 0.4101, 0.42, 0.423, 0.4247, 0.4274, 0.4293, 0.4328, 0.4382, 0.4385, 0.442, 0.4473, 0.4477, 0.4503, 0.4562, 0.458, 0.461, 0.4638, 0.4667, 0.4686, 0.4693, 0.4703, 0.4833, 0.4881, 0.4944, 0.4953, 0.4992, 0.5509, 0.5567, 0.5569, 0.5591, 0.5602, 0.5602, 0.565, 0.5776, 0.5777, 0.5818, 0.587, 0.5972, 0.5999, 0.6043, 0.6049, 0.6093, 0.6099, 0.6465, 0.6561, 0.6595, 0.6631, 0.6714, 0.6759, 0.6809, 0.6816, 0.6925, 0.7039, 0.7086, 0.7235, 0.7302, 0.7332, 0.7602, 0.7834, 0.8037, 0.9999}; - -void showASCIIImage(Tensor *tensor) { - size_t width; - size_t height; - - if (tensor->ndim == 4) { - height = tensor->shape[1]; - width = tensor->shape[2]; - } else if (tensor->ndim == 3) { - height = tensor->shape[1]; - width = tensor->shape[2]; - } else if (tensor->ndim == 2) { - height = tensor->shape[0]; - width = tensor->shape[1]; - } else { - printf("[ERROR] Invalid tensor shape\n"); - } - - float min; - float max; - - if (tensor->dtype == DTYPE_F32) { - min = 1000; - max = -1000; - for (size_t h = 0; h < height; h += 1) { - for (size_t w = 0; w < width; w += 1) { - float pixel_value = ((float *)tensor->data)[h * width + w]; - - if (pixel_value < min) { - min = pixel_value; - } - if (pixel_value > max) { - max = pixel_value; - } - } - } - } - else if (tensor->dtype == DTYPE_U8) { - min = 0x00; - // max = 0xFF; - max = 32; - } else { - printf("[ERROR] Unsupported data type\n"); - } - - for (size_t h = 0; h < height; h += 1) { - for (size_t w = 0; w < width; w += 1) { - float pixel_value; - if (tensor->dtype == DTYPE_F32) { - pixel_value = ((float *)tensor->data)[h * width + w]; - } - else if (tensor->dtype == DTYPE_U8) { - pixel_value = (float)((uint8_t *)tensor->data)[h * width + w]; - } - - // normalize the pixel value to the range [0, 1] - pixel_value = (pixel_value - min) / (max - min); - - // find the closest brightness value in the brightness_map - size_t brightness_index = 0; - for (size_t i = 0; i < n_mapping; i += 1) { - if (pixel_value < brightness_map[i]) { - break; - } - brightness_index = i; - } - - // find the corresponding ASCII character - uint8_t ascii = ascii_map[brightness_index]; - printf("%c", ascii); - } - printf("\n"); - } -} diff --git a/example/utility/termimg.h b/example/utility/termimg.h new file mode 100644 index 0000000..046e02b --- /dev/null +++ b/example/utility/termimg.h @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include + +#include "nn_tensor.h" + + +const static size_t NUM_MAPPING = 92; + +const static 
uint8_t ASCII_MAP[] = {0X20, 0X60, 0X2E, 0X2D, 0X27, 0X3A, 0X5F, 0X2C, 0X5E, 0X3D, 0X3B, 0X3E, 0X3C, 0X2B, 0X21, 0X72, 0X63, 0X2A, 0X2F, 0X7A, 0X3F, 0X73, 0X4C, 0X54, 0X76, 0X29, 0X4A, 0X37, 0X28, 0X7C, 0X46, 0X69, 0X7B, 0X43, 0X7D, 0X66, 0X49, 0X33, 0X31, 0X74, 0X6C, 0X75, 0X5B, 0X6E, 0X65, 0X6F, 0X5A, 0X35, 0X59, 0X78, 0X6A, 0X79, 0X61, 0X5D, 0X32, 0X45, 0X53, 0X77, 0X71, 0X6B, 0X50, 0X36, 0X68, 0X39, 0X64, 0X34, 0X56, 0X70, 0X4F, 0X47, 0X62, 0X55, 0X41, 0X4B, 0X58, 0X48, 0X6D, 0X38, 0X52, 0X44, 0X23, 0X24, 0X42, 0X67, 0X30, 0X4D, 0X4E, 0X57, 0X51, 0X25, 0X26, 0X40}; + +const static float BRIGHTNESS_MAP[] = {0, 0.0751, 0.0829, 0.0848, 0.1227, 0.1403, 0.1559, 0.185, 0.2183, 0.2417, 0.2571, 0.2852, 0.2902, 0.2919, 0.3099, 0.3192, 0.3232, 0.3294, 0.3384, 0.3609, 0.3619, 0.3667, 0.3737, 0.3747, 0.3838, 0.3921, 0.396, 0.3984, 0.3993, 0.4075, 0.4091, 0.4101, 0.42, 0.423, 0.4247, 0.4274, 0.4293, 0.4328, 0.4382, 0.4385, 0.442, 0.4473, 0.4477, 0.4503, 0.4562, 0.458, 0.461, 0.4638, 0.4667, 0.4686, 0.4693, 0.4703, 0.4833, 0.4881, 0.4944, 0.4953, 0.4992, 0.5509, 0.5567, 0.5569, 0.5591, 0.5602, 0.5602, 0.565, 0.5776, 0.5777, 0.5818, 0.587, 0.5972, 0.5999, 0.6043, 0.6049, 0.6093, 0.6099, 0.6465, 0.6561, 0.6595, 0.6631, 0.6714, 0.6759, 0.6809, 0.6816, 0.6925, 0.7039, 0.7086, 0.7235, 0.7302, 0.7332, 0.7602, 0.7834, 0.8037, 0.9999}; + +/** + * Display the image in ASCII format + * + * @param tensor: input tensor + */ +void show_ASCII_image(Tensor *tensor, float min, float max) { + assert(tensor->ndim == 4); + assert(tensor->shape[0] == 1); + assert(tensor->shape[3] == 1); + + size_t width; + size_t height; + + if (tensor->ndim == 4) { + height = tensor->shape[1]; + width = tensor->shape[2]; + } else if (tensor->ndim == 3) { + height = tensor->shape[1]; + width = tensor->shape[2]; + } else if (tensor->ndim == 2) { + height = tensor->shape[0]; + width = tensor->shape[1]; + } else { + printf("[ERROR] Invalid tensor shape\n"); + } + + // find the min and max pixel value + if (max == 0 && min == 0) { + min = 1000; + max = -1000; + + for (size_t h = 0; h < height; h += 1) { + for (size_t w = 0; w < width; w += 1) { + float pixel_value = ((float *)tensor->data)[h * width + w]; + + if (pixel_value < min) { + min = pixel_value; + } + if (pixel_value > max) { + max = pixel_value; + } + } + } + } + + for (size_t h = 0; h < height; h += 1) { + for (size_t w = 0; w < width; w += 1) { + float pixel_value; + if (tensor->dtype == DTYPE_F32) { + pixel_value = ((float *)tensor->data)[h * width + w]; + } + else if (tensor->dtype == DTYPE_U8) { + pixel_value = (float)((uint8_t *)tensor->data)[h * width + w]; + } + + // normalize the pixel value to the range [0, 1] + pixel_value = (pixel_value - min) / (max - min); + + // find the closest brightness value in the brightness_map + size_t brightness_index = 0; + for (size_t i = 0; i < NUM_MAPPING; i += 1) { + if (pixel_value < BRIGHTNESS_MAP[i]) { + break; + } + brightness_index = i; + } + + // print the corresponding ASCII character + printf("%c", ASCII_MAP[brightness_index]); + } + printf("\n"); + } +} diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt index 55ab703..65d334f 100644 --- a/nn/CMakeLists.txt +++ b/nn/CMakeLists.txt @@ -2,40 +2,34 @@ option(AVX "Use AVX implementation" OFF) option(RVV "Use RISCV vector extension implementation" OFF) - -add_library(nn - src/nn_tensor.c - src/nn_print.c - src/nn_abs.c - src/nn_add.c - src/nn_batchnorm2d.c - src/nn_conv2d.c - src/nn_clip.c - src/nn_copy.c - src/nn_div.c - src/nn_elu.c - src/nn_fill.c - src/nn_interpolate.c - 
src/nn_layernorm.c - src/nn_linear.c - src/nn_matmul.c - src/nn_matrixnorm.c - src/nn_max.c - src/nn_maximum.c - src/nn_maxpool2d.c - src/nn_min.c - src/nn_minimum.c - src/nn_mul.c - src/nn_neg.c - src/nn_relu.c - src/nn_relu6.c - src/nn_softmax.c - src/nn_sub.c - src/nn_sum.c - src/nn_transpose.c - src/nn_unfold.c +set(cpu_impl + ./impl/cpu/abs.c + ./impl/cpu/acc.c + ./impl/cpu/acc1.c + ./impl/cpu/add.c + ./impl/cpu/add1.c + ./impl/cpu/div.c + ./impl/cpu/dot.c + ./impl/cpu/fill.c + ./impl/cpu/max.c + ./impl/cpu/maximum.c + ./impl/cpu/maximum1.c + ./impl/cpu/min.c + ./impl/cpu/minimum.c + ./impl/cpu/minimum1.c + ./impl/cpu/mul.c + ./impl/cpu/mul1.c + ./impl/cpu/neg.c + ./impl/cpu/norm.c + ./impl/cpu/rmsnorm.c + ./impl/cpu/sgn.c + ./impl/cpu/softmax.c + ./impl/cpu/sqr.c + ./impl/cpu/sqrt.c + ./impl/cpu/sub.c + ./impl/cpu/sum.c + ./impl/cpu/transpose.c ) -target_include_directories(nn PUBLIC inc) if (AVX) @@ -46,13 +40,82 @@ endif () if (RVV) message(STATUS "Using RVV implementation") add_compile_definitions(RVV) + + set(rvv_impl + ./impl/rvv/abs.c + ./impl/rvv/acc.c + ./impl/rvv/acc1.c + ./impl/rvv/add.c + ./impl/rvv/add1.c + ./impl/rvv/div.c + ./impl/rvv/dot.c + ./impl/rvv/max.c + ./impl/rvv/maximum.c + ./impl/rvv/maximum1.c + ./impl/rvv/min.c + ./impl/rvv/minimum.c + ./impl/rvv/minimum1.c + ./impl/rvv/mul.c + ./impl/rvv/mul1.c + ./impl/rvv/neg.c + ./impl/rvv/sub.c + ./impl/rvv/transpose.c + ) endif () if (GEMMINI) message(STATUS "Using Gemmini implementation") add_compile_definitions(GEMMINI) + + set(gemmini_impl + impl/gemmini/mm.c + ) endif () + +add_library(nn + ./functional/nn_tensor_creation.c + ./functional/nn_print.c + ./functional/nn_abs.c + ./functional/nn_add.c + ./functional/nn_batch_norm2d.c + ./functional/nn_conv2d.c + ./functional/nn_clip.c + ./functional/nn_copy.c + ./functional/nn_div.c + ./functional/nn_elu.c + ./functional/nn_fill.c + ./functional/nn_interpolate.c + ./functional/nn_layer_norm.c + ./functional/nn_linear.c + ./functional/nn_matmul.c + ./functional/nn_mm.c + ./functional/nn_norm.c + ./functional/nn_max.c + ./functional/nn_maximum.c + ./functional/nn_max_pool2d.c + ./functional/nn_min.c + ./functional/nn_minimum.c + ./functional/nn_mul.c + ./functional/nn_neg.c + ./functional/nn_relu.c + ./functional/nn_relu6.c + ./functional/nn_softmax.c + ./functional/nn_sub.c + ./functional/nn_sum.c + ./functional/nn_transpose.c + ./functional/nn_unfold.c + + ${rvv_impl} + ${gemmini_impl} + ${cpu_impl} +) + +target_include_directories(nn PUBLIC + ./ + ./functional + ./impl) + if (X86) message("NN: Building for x86") target_link_libraries(nn target-x86) diff --git a/nn/src/nn_abs.c b/nn/functional/nn_abs.c similarity index 50% rename from nn/src/nn_abs.c rename to nn/functional/nn_abs.c index 20898b1..94ed2f2 100644 --- a/nn/src/nn_abs.c +++ b/nn/functional/nn_abs.c @@ -9,19 +9,19 @@ void NN_abs(Tensor *out, Tensor *in) { switch (out->dtype) { case DTYPE_I8: - NN__abs_I8(out->size, (int8_t *)out->data, (int8_t *)in->data); + NN__abs_i8(out->size, (int8_t *)out->data, 1, (int8_t *)in->data, 1); return; case DTYPE_I16: - NN__abs_I16(out->size, (int16_t *)out->data, (int16_t *)in->data); + NN__abs_i16(out->size, (int16_t *)out->data, 1, (int16_t *)in->data, 1); return; case DTYPE_I32: - NN__abs_I32(out->size, (int32_t *)out->data, (int32_t *)in->data); + NN__abs_i32(out->size, (int32_t *)out->data, 1, (int32_t *)in->data, 1); return; case DTYPE_F16: - NN__abs_F16(out->size, (float16_t *)out->data, (float16_t *)in->data); + NN__abs_f16(out->size, (float16_t *)out->data, 1, 
(float16_t *)in->data, 1); return; case DTYPE_F32: - NN__abs_F32(out->size, (float *)out->data, (float *)in->data); + NN__abs_f32(out->size, (float *)out->data, 1, (float *)in->data, 1); return; default: @@ -29,10 +29,10 @@ void NN_abs(Tensor *out, Tensor *in) { } printf("[ERROR] Unsupported operation of tensor with dtype %s = |%s|\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(in->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(in->dtype) ); } -void NN_absInplace(Tensor *x) { +void NN_abs_inplace(Tensor *x) { NN_abs(x, x); } diff --git a/nn/inc/nn_abs.h b/nn/functional/nn_abs.h similarity index 85% rename from nn/inc/nn_abs.h rename to nn/functional/nn_abs.h index ee4e87b..d88b0bd 100644 --- a/nn/inc/nn_abs.h +++ b/nn/functional/nn_abs.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/abs.h" +#include "abs.h" /** @@ -17,7 +17,7 @@ */ void NN_abs(Tensor *out, Tensor *input); -void NN_absInplace(Tensor *x); +void NN_abs_inplace(Tensor *x); #endif // __NN_ABS_H diff --git a/nn/src/nn_add.c b/nn/functional/nn_add.c similarity index 73% rename from nn/src/nn_add.c rename to nn/functional/nn_add.c index 6ffc207..e78be1c 100644 --- a/nn/src/nn_add.c +++ b/nn/functional/nn_add.c @@ -12,7 +12,7 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) { && b->shape[1] == a->shape[1] && b->shape[2] == a->shape[2] && b->shape[3] == a->shape[3]) { - NN__add_F16(out->size, (float16_t *)out->data, (float16_t *)a->data, (float16_t *)b->data); + NN__add_f16(out->size, (float16_t *)out->data, 1, (float16_t *)a->data, 1, (float16_t *)b->data, 1); return; } for (size_t i = 0; i < out->shape[0]; i += 1) { @@ -23,17 +23,18 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) { size_t b_i = i < b->shape[0] ? i : 0; size_t b_j = j < b->shape[1] ? 
j : 0; - ((float *)out->data)[i * out->shape[1] + j] - = ((float *)a->data)[a_i * a->shape[1] + a_j] - + ((float *)b->data)[b_i * b->shape[1] + b_j]; + ((float16_t *)out->data)[i * out->shape[1] + j] = NN_float_to_half( + NN_half_to_float(((float16_t *)a->data)[a_i * a->shape[1] + a_j]) + + NN_half_to_float(((float16_t *)b->data)[b_i * b->shape[1] + b_j]) + ); } } return; } printf("[ERROR] Unsupported operation between tensor with shape "); - NN_printShape(a); + NN_print_shape(a); printf(" + "); - NN_printShape(b); + NN_print_shape(b); printf("\n"); return; @@ -43,7 +44,7 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) { && b->shape[1] == a->shape[1] && b->shape[2] == a->shape[2] && b->shape[3] == a->shape[3]) { - NN__add_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__add_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; } for (size_t i = 0; i < out->shape[0]; i += 1) { @@ -80,9 +81,9 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with shape "); - NN_printShape(a); + NN_print_shape(a); printf(" + "); - NN_printShape(b); + NN_print_shape(b); printf("\n"); return; @@ -91,7 +92,7 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = %s + %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } @@ -102,47 +103,47 @@ void NN_add1(Tensor *out, Tensor *a, float b) { switch (out->dtype) { case DTYPE_F32: - NN__add1_F32(out->size, (float *)out->data, (float *)a->data, b); + NN__add1_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, b); return; default: break; } printf("[ERROR] Unsupported operation between tensor with dtype %s += %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype) ); } -void NN_addInplace(Tensor *b, Tensor *a) { +void NN_add_inplace(Tensor *b, Tensor *a) { assert(b->ndim == a->ndim); assert(b->dtype == a->dtype); switch (b->dtype) { case DTYPE_F32: - NN__acc_F32(b->size, (float *)b->data, (float *)a->data); + NN__acc_f32(b->size, (float *)b->data, 1, (float *)a->data, 1); return; case DTYPE_I8: - NN__acc_I8(b->size, (int8_t *)b->data, (int8_t *)a->data); + NN__acc_i8(b->size, (int8_t *)b->data, 1, (int8_t *)a->data, 1); return; default: break; } printf("[ERROR] Unsupported operation between tensor with dtype %s += %s\n", - NN_getDataTypeName(b->dtype), NN_getDataTypeName(a->dtype) + NN_get_datatype_name(b->dtype), NN_get_datatype_name(a->dtype) ); } -void NN_addInplace1(Tensor *b, float scalar) { +void NN_add_inplace1(Tensor *b, float scalar) { switch (b->dtype) { case DTYPE_F32: - NN__acc1_F32(b->size, (float *)b->data, scalar); + NN__acc1_f32(b->size, (float *)b->data, 1, scalar); return; default: break; } printf("[ERROR] Unsupported operation between tensor with dtype %s += float\n", - NN_getDataTypeName(b->dtype) + NN_get_datatype_name(b->dtype) ); } diff --git a/nn/inc/nn_add.h b/nn/functional/nn_add.h similarity index 72% rename from nn/inc/nn_add.h rename to nn/functional/nn_add.h index f90bb7a..9dd22b0 100644 --- a/nn/inc/nn_add.h +++ b/nn/functional/nn_add.h @@ -6,10 +6,10 @@ #include "nn_tensor.h" #include "nn_print.h" -#include "ops/acc.h" -#include "ops/acc1.h" -#include "ops/add.h" -#include "ops/add1.h" +#include "acc.h" +#include 
"acc1.h" +#include "add.h" +#include "add1.h" /** @@ -44,7 +44,7 @@ void NN_add1(Tensor *out, Tensor *in, float scalar); * @param b: the target tensor * @param a: the input tensor */ -void NN_addInplace(Tensor *b, Tensor *a); +void NN_add_inplace(Tensor *b, Tensor *a); /** * Returns the element-wise addition of the input tensor with a scalar. @@ -52,15 +52,15 @@ void NN_addInplace(Tensor *b, Tensor *a); * @param b: the target tensor * @param scalar: scalar value */ -void NN_addInplace1(Tensor *b, float scalar); +void NN_add_inplace1(Tensor *b, float scalar); -void NN_add_1D(Tensor *out, Tensor *a, Tensor *b); +void NN_add_1d(Tensor *out, Tensor *a, Tensor *b); -void NN_add_2D(Tensor *out, Tensor *a, Tensor *b); +void NN_add_2d(Tensor *out, Tensor *a, Tensor *b); -void NN_add_3D(Tensor *out, Tensor *a, Tensor *b); +void NN_add_3d(Tensor *out, Tensor *a, Tensor *b); -void NN_add_4D(Tensor *out, Tensor *a, Tensor *b); +void NN_add_4d(Tensor *out, Tensor *a, Tensor *b); #endif // __NN_ADD_H diff --git a/nn/src/nn_attention.c b/nn/functional/nn_attention.c similarity index 95% rename from nn/src/nn_attention.c rename to nn/functional/nn_attention.c index ced2e2e..c79ec8f 100644 --- a/nn/src/nn_attention.c +++ b/nn/functional/nn_attention.c @@ -17,9 +17,9 @@ void NN_attention( s->v = s->value_cache + loff + pos * kv_dim; // qkv matmuls for this position - NN_matmulT(q, s->xb, w->wq + l*dim*dim, dim, dim); - NN_matmulT(k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim); - NN_matmulT(v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim); + NN_matmul_t(q, s->xb, w->wq + l*dim*dim, dim, dim); + NN_matmul_t(k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim); + NN_matmul_t(v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim); // RoPE relative positional encoding: complex-valued rotate q and k in each head for (int i = 0; i < dim; i+=2) { diff --git a/nn/src/nn_batchnorm2d.c b/nn/functional/nn_batch_norm2d.c similarity index 98% rename from nn/src/nn_batchnorm2d.c rename to nn/functional/nn_batch_norm2d.c index 7787021..aeb6a5c 100644 --- a/nn/src/nn_batchnorm2d.c +++ b/nn/functional/nn_batch_norm2d.c @@ -1,7 +1,7 @@ -#include "nn_batchnorm2d.h" +#include "nn_batch_norm2d.h" -void NN_BatchNorm2d( +void NN_batch_norm2d( Tensor *out, Tensor *in, Tensor *weight, Tensor *bias, float eps, Tensor *running_mean, Tensor *running_var) { diff --git a/nn/inc/nn_batchnorm2d.h b/nn/functional/nn_batch_norm2d.h similarity index 89% rename from nn/inc/nn_batchnorm2d.h rename to nn/functional/nn_batch_norm2d.h index de3f5c3..f38f387 100644 --- a/nn/inc/nn_batchnorm2d.h +++ b/nn/functional/nn_batch_norm2d.h @@ -1,5 +1,5 @@ -#ifndef __NN_BATCHNORM2D_H -#define __NN_BATCHNORM2D_H +#ifndef __NN_BATCH_NORM2D_H +#define __NN_BATCH_NORM2D_H #include #include @@ -18,11 +18,11 @@ * @param running_mean: the running mean of the module of shape (channels), or NULL if no running mean is applied * @param running_var: the running variance of the module of shape (channels), or NULL if no running variance is applied */ -void NN_BatchNorm2d( +void NN_batch_norm2d( Tensor *out, Tensor *in, Tensor *weight, Tensor *bias, float eps, Tensor *running_mean, Tensor *running_va ); -#endif // __NN_BATCHNORM2D_H +#endif // __NN_BATCH_NORM2D_H diff --git a/nn/src/nn_clip.c b/nn/functional/nn_clip.c similarity index 58% rename from nn/src/nn_clip.c rename to nn/functional/nn_clip.c index 6ebd1a3..7b72ec5 100644 --- a/nn/src/nn_clip.c +++ b/nn/functional/nn_clip.c @@ -9,8 +9,8 @@ void NN_clip(Tensor *y, Tensor *x, float min, float max) { switch (y->dtype) { case DTYPE_F32: - 
NN__maximum1_F32(y->size, (float *)y->data, (float *)x->data, min); - NN__minimum1_F32(y->size, (float *)y->data, (float *)y->data, max); + NN__maximum1_f32(y->size, (float *)y->data, 1, (float *)x->data, 1, min); + NN__minimum1_f32(y->size, (float *)y->data, 1, (float *)y->data, 1, max); return; default: @@ -18,10 +18,10 @@ void NN_clip(Tensor *y, Tensor *x, float min, float max) { } printf("[ERROR] Unsupported operation for tensor with dtype %s = clip(%s, float, float)\n", - NN_getDataTypeName(y->dtype), NN_getDataTypeName(x->dtype) + NN_get_datatype_name(y->dtype), NN_get_datatype_name(x->dtype) ); } -void NN_clipInplace(Tensor *x, float min, float max) { +void NN_clip_inplace(Tensor *x, float min, float max) { NN_clip(x, x, min, max); } diff --git a/nn/inc/nn_clip.h b/nn/functional/nn_clip.h similarity index 83% rename from nn/inc/nn_clip.h rename to nn/functional/nn_clip.h index 353cf0c..45986ec 100644 --- a/nn/inc/nn_clip.h +++ b/nn/functional/nn_clip.h @@ -4,8 +4,8 @@ #include #include "nn_tensor.h" -#include "ops/maximum1.h" -#include "ops/minimum1.h" +#include "maximum1.h" +#include "minimum1.h" /** @@ -21,7 +21,7 @@ */ void NN_clip(Tensor *y, Tensor *x, float min, float max); -void NN_clipInplace(Tensor *x, float min, float max); +void NN_clip_inplace(Tensor *x, float min, float max); #endif // __NN_CLIP_H diff --git a/nn/src/nn_conv2d.c b/nn/functional/nn_conv2d.c similarity index 97% rename from nn/src/nn_conv2d.c rename to nn/functional/nn_conv2d.c index 4bfbe4b..c383b07 100644 --- a/nn/src/nn_conv2d.c +++ b/nn/functional/nn_conv2d.c @@ -6,7 +6,7 @@ #endif -void NN_NCHWToNHWC(Tensor *out, Tensor *in) { +void NN_nchw_to_nhwc(Tensor *out, Tensor *in) { assert(in->ndim == 4); assert(out->ndim == 4); assert(in->dtype == DTYPE_F32); @@ -34,7 +34,7 @@ void NN_NCHWToNHWC(Tensor *out, Tensor *in) { } } -void NN_NHWCToNCHW(Tensor *out, Tensor *in) { +void NN_nhwc_to_nchw(Tensor *out, Tensor *in) { assert(in->ndim == 4); assert(out->ndim == 4); assert(in->dtype == DTYPE_F32); @@ -62,7 +62,7 @@ void NN_NHWCToNCHW(Tensor *out, Tensor *in) { } } -void NN_Conv2d( +void NN_conv2d( Tensor *out, Tensor *in, Tensor *weight, Tensor *bias, const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups) { @@ -135,8 +135,8 @@ void NN_Conv2d( Tensor *weight_1hwc = NN_tensor(4, (size_t[]){1, kernel_height, kernel_width, out_channels}, DTYPE_F32, weight->data); Tensor *weight_1chw = NN_tensor(4, (size_t[]){1, out_channels, kernel_height, kernel_width}, DTYPE_F32, NULL); - NN_NHWCToNCHW(in_nchw, in); - NN_NHWCToNCHW(weight_1chw, weight_1hwc); + NN_nhwc_to_nchw(in_nchw, in); + NN_nhwc_to_nchw(weight_1chw, weight_1hwc); for (size_t g = 0; g < groups; g += 1) { tiled_conv_auto( @@ -153,7 +153,7 @@ void NN_Conv2d( WS); } - NN_NCHWToNHWC(out, out_nchw); + NN_nchw_to_nhwc(out, out_nchw); } else { diff --git a/nn/inc/nn_conv2d.h b/nn/functional/nn_conv2d.h similarity index 89% rename from nn/inc/nn_conv2d.h rename to nn/functional/nn_conv2d.h index 9a7495b..88b3e40 100644 --- a/nn/inc/nn_conv2d.h +++ b/nn/functional/nn_conv2d.h @@ -1,5 +1,5 @@ -#ifndef __NN_CONV2D_H -#define __NN_CONV2D_H +#ifndef __NN_conv2d_H +#define __NN_conv2d_H #include #include @@ -12,7 +12,7 @@ * @param out: the output tensor of shape (batch_size, height, width, channels) * @param in: the input tensor of shape (batch_size, channels, height, width) */ -void NN_NCHWToNHWC(Tensor *out, Tensor *in); +void NN_nchw_to_nhwc(Tensor *out, Tensor *in); /** * Converts a tensor from NHWC (Channel-last) to NCHW (PyTorch) 
format. @@ -20,7 +20,7 @@ void NN_NCHWToNHWC(Tensor *out, Tensor *in); * @param out: the output tensor of shape (batch_size, channels, height, width) * @param in: the input tensor of shape (batch_size, height, width, channels) */ -void NN_NHWCToNCHW(Tensor *out, Tensor *in); +void NN_nhwc_to_nchw(Tensor *out, Tensor *in); /** * Applies a 2D convolution over an input signal composed of several input planes. @@ -34,11 +34,11 @@ void NN_NHWCToNCHW(Tensor *out, Tensor *in); * @param dilation: the spacing between kernel elements * @param groups: number of blocked connections from input channels to output channels */ -void NN_Conv2d( +void NN_conv2d( Tensor *out, Tensor *in, Tensor *weight, Tensor *bias, const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups ); -#endif // __NN_CONV2D_H +#endif // __NN_conv2d_H diff --git a/nn/functional/nn_copy.c b/nn/functional/nn_copy.c new file mode 100644 index 0000000..13873e5 --- /dev/null +++ b/nn/functional/nn_copy.c @@ -0,0 +1,241 @@ + +#include "nn_copy.h" + +void NN_copy(Tensor *dst, Tensor *src) { + assert(dst->ndim == src->ndim); + assert(dst->size == src->size); + + if (dst->dtype == src->dtype) { + memcpy(dst->data, src->data, dst->size * NN_sizeof(dst->dtype)); + return; + } + + switch (src->dtype) { + case DTYPE_U8: + switch (dst->dtype) { + case DTYPE_U16: + for (size_t i = 0; i < src->size; i += 1) { + ((uint16_t *)dst->data)[i] = (uint16_t)((uint8_t *)src->data)[i]; + } + return; + case DTYPE_I16: + for (size_t i = 0; i < src->size; i += 1) { + ((int16_t *)dst->data)[i] = (int16_t)((uint8_t *)src->data)[i]; + } + return; + case DTYPE_U32: + for (size_t i = 0; i < src->size; i += 1) { + ((uint32_t *)dst->data)[i] = (uint32_t)((uint8_t *)src->data)[i]; + } + return; + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((uint8_t *)src->data)[i]; + } + return; + case DTYPE_F16: + for (size_t i = 0; i < src->size; i += 1) { + ((float16_t *)dst->data)[i] = NN_float_to_half((float)((uint8_t *)src->data)[i]); + } + return; + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = (float)((uint8_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + case DTYPE_I8: + switch (dst->dtype) { + case DTYPE_I16: + for (size_t i = 0; i < src->size; i += 1) { + ((int16_t *)dst->data)[i] = (int16_t)((int8_t *)src->data)[i]; + } + return; + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((int8_t *)src->data)[i]; + } + return; + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = (float)((int8_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + case DTYPE_U16: + switch (dst->dtype) { + case DTYPE_U8: + for (size_t i = 0; i < src->size; i += 1) { + ((uint8_t *)dst->data)[i] = (uint8_t)((uint16_t *)src->data)[i]; + } + return; + case DTYPE_I16: + for (size_t i = 0; i < src->size; i += 1) { + ((int16_t *)dst->data)[i] = (int16_t)((uint16_t *)src->data)[i]; + } + return; + case DTYPE_U32: + for (size_t i = 0; i < src->size; i += 1) { + ((uint32_t *)dst->data)[i] = (uint32_t)((uint16_t *)src->data)[i]; + } + return; + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((uint16_t *)src->data)[i]; + } + return; + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = (float)((uint16_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + 
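/*
 * Clarifying aside (not part of the original file): each case in this switch widens or
 * narrows element-by-element, so NN_copy doubles as a dtype cast whenever the two
 * tensors have the same ndim and size. A minimal usage sketch, with hypothetical shapes:
 *
 *   Tensor *u8_img  = NN_zeros(2, (const size_t[]){8, 8}, DTYPE_U8);
 *   Tensor *f32_img = NN_zeros(2, (const size_t[]){8, 8}, DTYPE_F32);
 *   NN_copy(f32_img, u8_img);   // each uint8_t element is converted to float
 */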
case DTYPE_I16: + switch (dst->dtype) { + case DTYPE_U8: + for (size_t i = 0; i < src->size; i += 1) { + ((uint8_t *)dst->data)[i] = (uint8_t)((int16_t *)src->data)[i]; + } + return; + case DTYPE_I8: + for (size_t i = 0; i < src->size; i += 1) { + ((int8_t *)dst->data)[i] = (int8_t)((int16_t *)src->data)[i]; + } + return; + case DTYPE_U16: + for (size_t i = 0; i < src->size; i += 1) { + ((uint16_t *)dst->data)[i] = (uint16_t)((int16_t *)src->data)[i]; + } + return; + case DTYPE_U32: + for (size_t i = 0; i < src->size; i += 1) { + ((uint32_t *)dst->data)[i] = (uint32_t)((int16_t *)src->data)[i]; + } + return; + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((int16_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + case DTYPE_U32: + switch (dst->dtype) { + case DTYPE_U8: + for (size_t i = 0; i < src->size; i += 1) { + ((uint8_t *)dst->data)[i] = (uint8_t)((uint32_t *)src->data)[i]; + } + return; + case DTYPE_I8: + for (size_t i = 0; i < src->size; i += 1) { + ((int8_t *)dst->data)[i] = (int8_t)((uint32_t *)src->data)[i]; + } + return; + case DTYPE_U16: + for (size_t i = 0; i < src->size; i += 1) { + ((uint16_t *)dst->data)[i] = (uint16_t)((uint32_t *)src->data)[i]; + } + return; + case DTYPE_I16: + for (size_t i = 0; i < src->size; i += 1) { + ((int16_t *)dst->data)[i] = (int16_t)((uint32_t *)src->data)[i]; + } + return; + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((uint32_t *)src->data)[i]; + } + return; + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = (float)((uint32_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + case DTYPE_I32: + switch (dst->dtype) { + case DTYPE_U8: + for (size_t i = 0; i < src->size; i += 1) { + ((uint8_t *)dst->data)[i] = (uint8_t)((int32_t *)src->data)[i]; + } + return; + case DTYPE_I8: + for (size_t i = 0; i < src->size; i += 1) { + ((int8_t *)dst->data)[i] = (int8_t)((int32_t *)src->data)[i]; + } + return; + case DTYPE_U16: + for (size_t i = 0; i < src->size; i += 1) { + ((uint16_t *)dst->data)[i] = (uint16_t)((int32_t *)src->data)[i]; + } + return; + case DTYPE_I16: + for (size_t i = 0; i < src->size; i += 1) { + ((int16_t *)dst->data)[i] = (int16_t)((int32_t *)src->data)[i]; + } + return; + case DTYPE_U32: + for (size_t i = 0; i < src->size; i += 1) { + ((uint32_t *)dst->data)[i] = (uint32_t)((int32_t *)src->data)[i]; + } + return; + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = (float)((int32_t *)src->data)[i]; + } + return; + default: + break; + } + break; + + case DTYPE_F16: + switch (dst->dtype) { + case DTYPE_F32: + for (size_t i = 0; i < src->size; i += 1) { + ((float *)dst->data)[i] = NN_half_to_float(((float16_t *)src->data)[i]); + } + return; + default: + break; + } + break; + + case DTYPE_F32: + switch (dst->dtype) { + case DTYPE_I32: + for (size_t i = 0; i < src->size; i += 1) { + ((int32_t *)dst->data)[i] = (int32_t)((float *)src->data)[i]; + } + return; + case DTYPE_F16: + for (size_t i = 0; i < src->size; i += 1) { + ((float16_t *)dst->data)[i] = NN_float_to_half(((float *)src->data)[i]); + } + return; + default: + break; + } + break; + } + printf("[ERROR] Cannot copy tensor from type %s to %s\n", NN_get_datatype_name(src->dtype), NN_get_datatype_name(dst->dtype)); +} diff --git a/nn/inc/nn_copy.h b/nn/functional/nn_copy.h similarity index 72% rename from nn/inc/nn_copy.h rename to nn/functional/nn_copy.h index 
07d6997..2bcab46 100644 --- a/nn/inc/nn_copy.h +++ b/nn/functional/nn_copy.h @@ -10,6 +10,8 @@ /** * Copies values from one tensor to another * + * If the data types of the two tensors are different, the values are casted to the destination data type + * * @param dst: destination tensor * @param src: source tensor */ diff --git a/nn/src/nn_div.c b/nn/functional/nn_div.c similarity index 69% rename from nn/src/nn_div.c rename to nn/functional/nn_div.c index 3e0f581..6aaeeb8 100644 --- a/nn/src/nn_div.c +++ b/nn/functional/nn_div.c @@ -11,7 +11,7 @@ void NN_div(Tensor *out, Tensor *a, Tensor *b) { switch (out->dtype) { case DTYPE_F32: - NN__div_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__div_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; default: @@ -19,6 +19,6 @@ void NN_div(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation of tensor with dtype %s = %s / %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } diff --git a/nn/inc/nn_div.h b/nn/functional/nn_div.h similarity index 94% rename from nn/inc/nn_div.h rename to nn/functional/nn_div.h index ae4554d..41b1579 100644 --- a/nn/inc/nn_div.h +++ b/nn/functional/nn_div.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "ops/div.h" +#include "div.h" /** diff --git a/nn/src/nn_elu.c b/nn/functional/nn_elu.c similarity index 78% rename from nn/src/nn_elu.c rename to nn/functional/nn_elu.c index 49abfa2..b078bde 100644 --- a/nn/src/nn_elu.c +++ b/nn/functional/nn_elu.c @@ -2,7 +2,7 @@ #include "nn_elu.h" -void NN_ELU(Tensor *y, Tensor *x, float alpha) { +void NN_elu(Tensor *y, Tensor *x, float alpha) { assert(y->ndim == x->ndim); assert(y->dtype == x->dtype); assert(y->size == x->size); @@ -25,10 +25,10 @@ void NN_ELU(Tensor *y, Tensor *x, float alpha) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = ELU(%s)\n", - NN_getDataTypeName(y->dtype), NN_getDataTypeName(x->dtype) + NN_get_datatype_name(y->dtype), NN_get_datatype_name(x->dtype) ); } -void NN_ELUInplace(Tensor *x, float alpha) { - NN_ELU(x, x, alpha); +void NN_elu_inplace(Tensor *x, float alpha) { + NN_elu(x, x, alpha); } diff --git a/nn/inc/nn_elu.h b/nn/functional/nn_elu.h similarity index 82% rename from nn/inc/nn_elu.h rename to nn/functional/nn_elu.h index 29db679..273cc64 100644 --- a/nn/inc/nn_elu.h +++ b/nn/functional/nn_elu.h @@ -19,9 +19,9 @@ * @param x: input tensor * @param alpha: the alpha value for the ELU formulation */ -void NN_ELU(Tensor *y, Tensor *x, float alpha); +void NN_elu(Tensor *y, Tensor *x, float alpha); -void NN_ELUInplace(Tensor *x, float alpha); +void NN_elu_inplace(Tensor *x, float alpha); #endif // __NN_ELU_H diff --git a/nn/functional/nn_fill.c b/nn/functional/nn_fill.c new file mode 100644 index 0000000..5d703f8 --- /dev/null +++ b/nn/functional/nn_fill.c @@ -0,0 +1,25 @@ + +#include "nn_fill.h" + + +void NN_fill(Tensor *tensor, float value) { + switch (tensor->dtype) { + case DTYPE_U8: + NN__fill_u8(tensor->size, (uint8_t *)tensor->data, 1, (uint8_t)value); + return; + case DTYPE_I8: + NN__fill_i8(tensor->size, (int8_t *)tensor->data, 1, (int8_t)value); + return; + case DTYPE_I32: + NN__fill_i32(tensor->size, (int32_t *)tensor->data, 1, (int32_t)value); + return; + case DTYPE_F16: + NN__fill_f16(tensor->size, (float16_t *)tensor->data, 1, NN_float_to_half(value)); + return; 
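/*
 * Clarifying aside (not part of the original file): NN_fill takes a float and converts
 * it to the tensor's dtype case by case, so the same call covers integer and
 * half-precision tensors. A minimal sketch with a hypothetical shape:
 *
 *   Tensor *bias = NN_zeros(2, (const size_t[]){1, 16}, DTYPE_F16);
 *   NN_fill(bias, 0.1f);   // stored as float16_t via NN_float_to_half
 */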
+ case DTYPE_F32: + NN__fill_f32(tensor->size, (float *)tensor->data, 1, value); + return; + default: + printf("[ERROR] Unsupported operation fill to tensor with dtype: %d\n", tensor->dtype); + } +} diff --git a/nn/inc/nn_fill.h b/nn/functional/nn_fill.h similarity index 56% rename from nn/inc/nn_fill.h rename to nn/functional/nn_fill.h index ab0f763..48638f4 100644 --- a/nn/inc/nn_fill.h +++ b/nn/functional/nn_fill.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "ops/fill.h" +#include "fill.h" /** * Fills the tensor with the specified value. @@ -15,12 +15,5 @@ */ void NN_fill(Tensor *tensor, float value); -Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype); - -Tensor *NN_ones(size_t ndim, const size_t *shape, DataType dtype); - -Tensor *NN_rand(size_t ndim, const size_t *shape, DataType dtype); - - #endif // __NN_FILL_H diff --git a/nn/src/nn_interpolate.c b/nn/functional/nn_interpolate.c similarity index 95% rename from nn/src/nn_interpolate.c rename to nn/functional/nn_interpolate.c index 93665d7..4c57ab3 100644 --- a/nn/src/nn_interpolate.c +++ b/nn/functional/nn_interpolate.c @@ -46,7 +46,7 @@ void NN_interpolate(Tensor *out, Tensor *in, const float *scale_factor/*const ch default: printf("[ERROR] Unsupported operation of tensor with dtype %s = |%s|\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(in->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(in->dtype) ); break; } diff --git a/nn/inc/nn_interpolate.h b/nn/functional/nn_interpolate.h similarity index 100% rename from nn/inc/nn_interpolate.h rename to nn/functional/nn_interpolate.h diff --git a/nn/functional/nn_layer_norm.c b/nn/functional/nn_layer_norm.c new file mode 100644 index 0000000..7c5f934 --- /dev/null +++ b/nn/functional/nn_layer_norm.c @@ -0,0 +1,44 @@ + +#include "nn_layer_norm.h" + +void NN_layer_norm( + Tensor *out, Tensor *in, + size_t normalized_dims, + Tensor *weight, Tensor *bias, + const float eps) { + assert(out->dtype == in->dtype && in->dtype == DTYPE_F32); + assert(out->ndim == in->ndim); + + // currently only support 1D normalization + assert(normalized_dims == 1); + + size_t n = in->shape[1]; + for (size_t i = 0; i < in->shape[0]; i += 1) { + float *out_ptr = (float *)out->data + i * n; + float *in_ptr = (float *)in->data + i * n; + + float mean = 0; + NN__sum_f32(n, &mean, in_ptr, 1); + mean /= n; + + float variance = 0; + // use y as temporary buffer + // y = x - E[x] + NN__add1_f32(n, out_ptr, 1, in_ptr, 1, -mean); + // y = y * y + NN__sqr_f32(n, out_ptr, 1, out_ptr, 1); + + NN__sum_f32(n, &variance, out_ptr, 1); + variance /= n; + + // y = x - E[x] + NN__add1_f32(n, out_ptr, 1, in_ptr, 1, -mean); + + // y = y / sqrt(Var[x] + eps) + NN__mul1_f32(n, out_ptr, 1, out_ptr, 1, 1.f / sqrtf(variance + eps)); + + // y = y * weight + bias + NN__mul_f32(n, out_ptr, 1, (float *)weight->data, 1, out_ptr, 1); + NN__add_f32(n, out_ptr, 1, (float *)bias->data, 1, out_ptr, 1); + } +} diff --git a/nn/functional/nn_layer_norm.h b/nn/functional/nn_layer_norm.h new file mode 100644 index 0000000..b630389 --- /dev/null +++ b/nn/functional/nn_layer_norm.h @@ -0,0 +1,23 @@ +#ifndef __NN_LAYER_NORM_H +#define __NN_LAYER_NORM_H + +#include +#include + +#include "nn_tensor.h" +#include "sum.h" +#include "add.h" +#include "add1.h" +#include "mul.h" +#include "mul1.h" +#include "sqr.h" + + +void NN_layer_norm( + Tensor *out, Tensor *in, + size_t normalized_dims, + Tensor *weight, Tensor *bias, + const float eps); + + +#endif // __NN_LAYER_NORM_H diff --git 
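For reference, NN_layer_norm above computes, per row x of length n:

    mean = (1/n) * sum_i x[i]
    var  = (1/n) * sum_i (x[i] - mean)^2
    y[i] = (x[i] - mean) / sqrt(var + eps) * weight[i] + bias[i]

The output buffer doubles as scratch: it first holds (x - mean)^2 while the variance is summed, and is then recomputed as (x - mean) before the final scale by 1/sqrt(var + eps), multiply by weight, and add of bias.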
a/nn/functional/nn_linear.c b/nn/functional/nn_linear.c new file mode 100644 index 0000000..08f8122 --- /dev/null +++ b/nn/functional/nn_linear.c @@ -0,0 +1,11 @@ + +#include "nn_linear.h" + + +void NN_linear(Tensor *y, Tensor *x, Tensor *w, Tensor *b) { + NN_mm_t(y, x, w); + + if (b != NULL) { + NN_add(y, y, b); + } +} diff --git a/nn/inc/nn_linear.h b/nn/functional/nn_linear.h similarity index 85% rename from nn/inc/nn_linear.h rename to nn/functional/nn_linear.h index aa3d9a9..1c98f8d 100644 --- a/nn/inc/nn_linear.h +++ b/nn/functional/nn_linear.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "nn_matmul.h" +#include "nn_mm.h" #include "nn_add.h" @@ -18,6 +18,6 @@ * @param w: weight tensor of shape (out_features, in_features) * @param b: bias tensor of shape (1, out_features), or NULL if no bias is applied */ -void NN_Linear(Tensor *y, Tensor *x, Tensor *w, Tensor *b); +void NN_linear(Tensor *y, Tensor *x, Tensor *w, Tensor *b); #endif // __NN_Linear_H diff --git a/nn/inc/nn_math.h b/nn/functional/nn_math.h similarity index 80% rename from nn/inc/nn_math.h rename to nn/functional/nn_math.h index 955113f..b11ab5e 100644 --- a/nn/inc/nn_math.h +++ b/nn/functional/nn_math.h @@ -14,158 +14,6 @@ - - - -// static void NN__dot_bf16(int n, float *s, bfloat16_t *x, bfloat16_t *y) { -// int i = 0; -// float sumf = 0; - -// #if defined(__AVX512BF16__) -// __m512 c1 = _mm512_setzero_ps(); -// __m512 c2 = _mm512_setzero_ps(); -// for (; i + 64 <= n; i += 64) { -// c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), -// m512bh(_mm512_loadu_si512((y + i)))); -// c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), -// m512bh(_mm512_loadu_si512((y + i + 32)))); -// } -// sumf += (float)_mm512_reduce_add_ps(c1); -// sumf += (float)_mm512_reduce_add_ps(c2); - -// #elif defined(__AVX512F__) -// #define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) -// __m512 c1 = _mm512_setzero_ps(); -// __m512 c2 = _mm512_setzero_ps(); -// for (; i + 32 <= n; i += 32) { -// c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); -// c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); -// } -// sumf += (float)_mm512_reduce_add_ps(c1); -// sumf += (float)_mm512_reduce_add_ps(c2); - -// #undef LOAD -// #elif defined(__AVX2__) -// #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) -// __m256 c1 = _mm256_setzero_ps(); -// __m256 c2 = _mm256_setzero_ps(); -// __m256 c3 = _mm256_setzero_ps(); -// __m256 c4 = _mm256_setzero_ps(); -// for (; i + 32 <= n; i += 32) { -// c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); -// c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); -// c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); -// c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); -// } -// __m128 g; -// c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), -// _mm256_add_ps(c2, c4)); -// g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), -// _mm256_castps256_ps128(c1)); -// g = _mm_add_ps(g, _mm_movehl_ps(g, g)); -// g = _mm_add_ss(g, _mm_movehdup_ps(g)); -// sumf += (float)_mm_cvtss_F32(g); - -// #undef LOAD -// #endif - -// for (; i < n; i += 1) { -// sumf += (float)(GGML_BF16_TO_FP32(x[i]) * -// GGML_BF16_TO_FP32(y[i])); -// } -// *s = sumf; -// } - -// static void NN__dot_f16(int n, float *s, float16_t *x, float16_t *y) { -// float sumf = 0.0; - -// #if 
defined(GGML_SIMD) -// const int np = (n & ~(GGML_F16_STEP - 1)); - -// GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; - -// GGML_F16_VEC ax[GGML_F16_ARR]; -// GGML_F16_VEC ay[GGML_F16_ARR]; - -// for (int i = 0; i < np; i += GGML_F16_STEP) { -// for (int j = 0; j < GGML_F16_ARR; j++) { -// ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); -// ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - -// sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); -// } -// } - -// // reduce sum0..sum3 to sum0 -// GGML_F16_VEC_REDUCE(sumf, sum); - -// // leftovers -// for (int i = np; i < n; i += 1) { -// sumf += (float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); -// } -// #else -// for (int i = 0; i < n; i += 1) { -// sumf += (float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); -// } -// #endif -// *s = sumf; -// } - -// // compute GGML_VEC_DOT_UNROLL dot products at once -// // xs - x row stride in bytes -// inline static void NN__dot_f16_unroll(const int n, const int xs, float *restrict s, void * restrict xv, float16_t * restrict y) { -// float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - -// float16_t * restrict x[GGML_VEC_DOT_UNROLL]; - -// for (int i = 0; i < GGML_VEC_DOT_UNROLL; i += 1) { -// x[i] = (float16_t *) ((char *) xv + i*xs); -// } - -// #if defined(GGML_SIMD) -// const int np = (n & ~(GGML_F16_STEP - 1)); - -// GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - -// GGML_F16_VEC ax[GGML_F16_ARR]; -// GGML_F16_VEC ay[GGML_F16_ARR]; - -// for (int i = 0; i < np; i += GGML_F16_STEP) { -// for (int j = 0; j < GGML_F16_ARR; j++) { -// ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - -// for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { -// ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); - -// sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); -// } -// } -// } - -// // reduce sum0..sum3 to sum0 -// for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { -// GGML_F16_VEC_REDUCE(sumf[k], sum[k]); -// } - -// // leftovers -// for (int i = np; i < n; i += 1) { -// for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { -// sumf[j] += (float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); -// } -// } -// #else -// for (int i = 0; i < n; i += 1) { -// for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { -// sumf[j] += (float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); -// } -// } -// #endif - -// for (int i = 0; i < GGML_VEC_DOT_UNROLL; i += 1) { -// s[i] = sumf[i]; -// } -// } - // inline static void NN__mad_F32(const int n, float *y, const float *x, const float v) { // #if defined(GGML_SIMD) // const int np = (n & ~(GGML_F32_STEP - 1)); @@ -657,7 +505,7 @@ // } -// inline static void NN__sum_F32_ggf(const int n, float *s, const float *x) { +// inline static void NN__sum_f32_ggf(const int n, float *s, const float *x) { // float sum = 0.0; // for (int i = 0; i < n; i += 1) { // sum += (float)x[i]; diff --git a/nn/functional/nn_matmul.c b/nn/functional/nn_matmul.c new file mode 100644 index 0000000..ee11356 --- /dev/null +++ b/nn/functional/nn_matmul.c @@ -0,0 +1,24 @@ + +#include "nn_matmul.h" + + +void NN_matmul(Tensor *out, Tensor *a, Tensor *b) { + if (a->ndim == 2 && b->ndim == 2) { + NN_mm(out, a, b); + return; + } + printf("Unsupported operation: %s = %s @ %s\n", + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) + ); +} + +void NN_matmul_t(Tensor *out, Tensor *a, Tensor *b) { + if (a->ndim == 2 && b->ndim == 2) { + NN_mm_t(out, a, b); + return; + } + printf("Unsupported operation: 
%s = %s @ %s\n", + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) + ); +} + diff --git a/nn/inc/nn_matmul.h b/nn/functional/nn_matmul.h similarity index 88% rename from nn/inc/nn_matmul.h rename to nn/functional/nn_matmul.h index 0081abe..082b708 100644 --- a/nn/inc/nn_matmul.h +++ b/nn/functional/nn_matmul.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/dot.h" +#include "nn_mm.h" /** @@ -27,7 +27,7 @@ void NN_matmul(Tensor *out, Tensor *a, Tensor *b); * @param a: the input tensor of shape (m, k) * @param b: the input tensor of shape (n, k) */ -void NN_matmulT(Tensor *out, Tensor *a, Tensor *b); +void NN_matmul_t(Tensor *out, Tensor *a, Tensor *b); #endif // __NN_MATMUL_H diff --git a/nn/src/nn_max.c b/nn/functional/nn_max.c similarity index 69% rename from nn/src/nn_max.c rename to nn/functional/nn_max.c index e03d194..e80a78a 100644 --- a/nn/src/nn_max.c +++ b/nn/functional/nn_max.c @@ -7,12 +7,12 @@ void NN_max(Tensor *scalar, Tensor *tensor) { switch (tensor->dtype) { case DTYPE_F32: - NN__max_F32(tensor->size, (float *)scalar->data, (float *)tensor->data); + NN__max_f32(tensor->size, (float *)scalar->data, (float *)tensor->data, 1); break; default: printf("[ERROR] Unsupported operation of tensor with dtype max(%s)\n", - NN_getDataTypeName(tensor->dtype) + NN_get_datatype_name(tensor->dtype) ); } } diff --git a/nn/inc/nn_max.h b/nn/functional/nn_max.h similarity index 93% rename from nn/inc/nn_max.h rename to nn/functional/nn_max.h index 9d3380a..5fb3181 100644 --- a/nn/inc/nn_max.h +++ b/nn/functional/nn_max.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "ops/max.h" +#include "max.h" /** diff --git a/nn/src/nn_maxpool2d.c b/nn/functional/nn_max_pool2d.c similarity index 78% rename from nn/src/nn_maxpool2d.c rename to nn/functional/nn_max_pool2d.c index add74f8..869fadb 100644 --- a/nn/src/nn_maxpool2d.c +++ b/nn/functional/nn_max_pool2d.c @@ -1,8 +1,8 @@ -#include "nn_maxpool2d.h" +#include "nn_max_pool2d.h" -void NN_MaxPool2d_F32(Tensor *out, Tensor *in, const size_t *kernel_size) { +void NN_max_pool2d(Tensor *out, Tensor *in, const size_t *kernel_size) { assert(in->ndim == 4); assert(out->ndim == 4); assert(in->dtype == DTYPE_F32); @@ -28,8 +28,6 @@ void NN_MaxPool2d_F32(Tensor *out, Tensor *in, const size_t *kernel_size) { size_t output_height = (input_height - kernel_height) + 1; size_t output_width = (input_width - kernel_width) + 1; - size_t stride = kernel_size[0]; - for (size_t b = 0; b < batch_size; b += 1) { for (size_t c = 0; c < channels; c += 1) { for (size_t h = 0; h < output_height; h += 1) { @@ -40,10 +38,7 @@ void NN_MaxPool2d_F32(Tensor *out, Tensor *in, const size_t *kernel_size) { + h * in->shape[3] + w; - // Create a tensor for the current pooling window - Tensor *window = NN_tensor(2, (size_t[]){kernel_height, kernel_width}, DTYPE_F32, ((float *)in->data) + window_offset); - - NN_max((((float *)out->data) + window_offset), window); + NN__max_f32(kernel_height * kernel_width, ((float *)out->data) + window_offset, ((float *)out->data) + window_offset, 1); } } } diff --git a/nn/inc/nn_maxpool2d.h b/nn/functional/nn_max_pool2d.h similarity index 71% rename from nn/inc/nn_maxpool2d.h rename to nn/functional/nn_max_pool2d.h index 88c226b..26a018c 100644 --- a/nn/inc/nn_maxpool2d.h +++ b/nn/functional/nn_max_pool2d.h @@ -1,5 +1,5 @@ -#ifndef __NN_MAXPOOL2D_H -#define __NN_MAXPOOL2D_H +#ifndef __NN_MAX_POOL2D_H +#define __NN_MAX_POOL2D_H #include @@ -14,7 +14,7 @@ * @param in: the input 
tensor of shape (batch_size, channels, pooled_height, pooled_width) * @param kernel_size: size of the pooling window */ -void NN_MaxPool2d_F32(Tensor *out, Tensor *in, const size_t *kernel_size); +void NN_max_pool2d(Tensor *out, Tensor *in, const size_t *kernel_size); -#endif // __NN_MAXPOOL2D_H +#endif // __NN_MAX_POOL2D_H diff --git a/nn/src/nn_maximum.c b/nn/functional/nn_maximum.c similarity index 69% rename from nn/src/nn_maximum.c rename to nn/functional/nn_maximum.c index e9b759c..66ce752 100644 --- a/nn/src/nn_maximum.c +++ b/nn/functional/nn_maximum.c @@ -12,13 +12,13 @@ void NN_maximum(Tensor *out, Tensor *a, Tensor *b) { switch (out->dtype) { case DTYPE_F32: - NN__maximum_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__maximum_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; default: break; } printf("[ERROR] Unsupported operation between tensor with dtype %s = max(%s, %s)\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } diff --git a/nn/inc/nn_maximum.h b/nn/functional/nn_maximum.h similarity index 93% rename from nn/inc/nn_maximum.h rename to nn/functional/nn_maximum.h index b4a50cf..10abefd 100644 --- a/nn/inc/nn_maximum.h +++ b/nn/functional/nn_maximum.h @@ -7,7 +7,7 @@ #endif #include "nn_tensor.h" -#include "ops/maximum.h" +#include "maximum.h" /** diff --git a/nn/src/nn_min.c b/nn/functional/nn_min.c similarity index 69% rename from nn/src/nn_min.c rename to nn/functional/nn_min.c index 64267b9..5ef0410 100644 --- a/nn/src/nn_min.c +++ b/nn/functional/nn_min.c @@ -7,12 +7,12 @@ void NN_min(Tensor *scalar, Tensor *tensor) { switch (tensor->dtype) { case DTYPE_F32: - NN__min_F32(tensor->size, (float *)scalar->data, (float *)tensor->data); + NN__min_f32(tensor->size, (float *)scalar->data, (float *)tensor->data, 1); break; default: printf("[ERROR] Unsupported operation of tensor with dtype min(%s)\n", - NN_getDataTypeName(tensor->dtype) + NN_get_datatype_name(tensor->dtype) ); } } diff --git a/nn/inc/nn_min.h b/nn/functional/nn_min.h similarity index 93% rename from nn/inc/nn_min.h rename to nn/functional/nn_min.h index 17d1b88..f37f480 100644 --- a/nn/inc/nn_min.h +++ b/nn/functional/nn_min.h @@ -5,7 +5,7 @@ #include #include "nn_tensor.h" -#include "ops/min.h" +#include "min.h" /** diff --git a/nn/src/nn_minimum.c b/nn/functional/nn_minimum.c similarity index 69% rename from nn/src/nn_minimum.c rename to nn/functional/nn_minimum.c index ab41f64..363fb41 100644 --- a/nn/src/nn_minimum.c +++ b/nn/functional/nn_minimum.c @@ -12,7 +12,7 @@ void NN_minimum(Tensor *out, Tensor *a, Tensor *b) { switch (out->dtype) { case DTYPE_F32: - NN__minimum_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__minimum_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; default: @@ -20,6 +20,6 @@ void NN_minimum(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = max(%s, %s)\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } diff --git a/nn/inc/nn_minimum.h b/nn/functional/nn_minimum.h similarity index 93% rename from nn/inc/nn_minimum.h rename to nn/functional/nn_minimum.h index 48a67db..019aafe 100644 --- 
a/nn/inc/nn_minimum.h +++ b/nn/functional/nn_minimum.h @@ -7,7 +7,7 @@ #endif #include "nn_tensor.h" -#include "ops/minimum.h" +#include "minimum.h" /** diff --git a/nn/functional/nn_mm.c b/nn/functional/nn_mm.c new file mode 100644 index 0000000..7fa4e50 --- /dev/null +++ b/nn/functional/nn_mm.c @@ -0,0 +1,81 @@ + +#include "nn_mm.h" + + +void NN_mm(Tensor *out, Tensor *a, Tensor *b) { + assert(a->ndim == 2); + assert(b->ndim == 2); + assert(a->shape[1] == b->shape[0]); + assert(out->shape[0] == a->shape[0]); + assert(out->shape[1] == b->shape[1]); + + #ifdef GEMMINI + NN__mm_f32(out->shape[0], out->shape[1], (float *)out->data, (float *)a->data, (float *)b->data); + return; + #endif + + if (a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32 && out->dtype == DTYPE_F32) { + for (size_t i = 0; i < out->shape[0]; i += 1) { + for (size_t j = 0; j < out->shape[1]; j += 1) { + NN__dot_f32(a->shape[1], + (float *)out->data + i * out->shape[1] + j, + (float *)a->data + i * a->shape[1], 1, + (float *)b->data + j, b->shape[1] + ); + } + } + return; + } + if (a->dtype == DTYPE_F16 && b->dtype == DTYPE_F16 && out->dtype == DTYPE_F16) { + for (size_t i = 0; i < out->shape[0]; i += 1) { + for (size_t j = 0; j < out->shape[1]; j += 1) { + NN__dot_f16(a->shape[1], + (float16_t *)out->data + i * out->shape[1] + j, + (float16_t *)a->data + i * a->shape[1], 1, + (float16_t *)b->data + j, b->shape[1] + ); + } + } + return; + } + printf("Unsupported operation: %s = %s @ %s\n", + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) + ); +} + +void NN_mm_t(Tensor *out, Tensor *a, Tensor *b) { + assert(a->ndim == 2); + assert(b->ndim == 2); + assert(a->shape[1] == b->shape[1]); + assert(out->shape[0] == a->shape[0]); + assert(out->shape[1] == b->shape[0]); + + if (a->dtype == DTYPE_F16 && b->dtype == DTYPE_F16 && out->dtype == DTYPE_F16) { + for (size_t i = 0; i < out->shape[0]; i += 1) { + for (size_t j = 0; j < out->shape[1]; j += 1) { + NN__dot_f16(a->shape[1], + (float16_t *)out->data + i * out->shape[1] + j, + (float16_t *)a->data + i * a->shape[1], 1, + (float16_t *)b->data + j * b->shape[1], 1 + ); + } + } + return; + } + if (a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32 && out->dtype == DTYPE_F32) { + for (size_t i = 0; i < out->shape[0]; i += 1) { + for (size_t j = 0; j < out->shape[1]; j += 1) { + NN__dot_f32(a->shape[1], + (float *)out->data + i * out->shape[1] + j, + (float *)a->data + i * a->shape[1], 1, + (float *)b->data + j * b->shape[1], 1 + ); + } + } + return; + } + printf("Unsupported operation: %s = %s @ %s\n", + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) + ); +} + diff --git a/nn/functional/nn_mm.h b/nn/functional/nn_mm.h new file mode 100644 index 0000000..ff84cd3 --- /dev/null +++ b/nn/functional/nn_mm.h @@ -0,0 +1,37 @@ +#ifndef __NN_MM_H +#define __NN_MM_H + +#include + +#include "nn_tensor.h" +#include "dot.h" + + +/** + * Performs a matrix multiplication. + * + * If input is a (m×k) tensor, mat2 is a (k×n) tensor, out will be a (m×n) tensor. + * + * C = A @ B + * + * @param out: the output tensor of shape (m, n) + * @param a: the input tensor of shape (m, k) + * @param b: the input tensor of shape (k, n) + */ +void NN_mm(Tensor *out, Tensor *a, Tensor *b); + +/** + * Performs a matrix multiplication with transposed B. + * + * If input is a (m×k) tensor, mat2 is a (n×k) tensor, out will be a (m×n) tensor. 
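To make the new NN_mm entry point above concrete: each output element is one strided dot product, a row of A with unit stride against a column of B with stride b->shape[1]. A small usage sketch, assuming the constructors from nn_tensor_creation.c and the include names introduced in this patch; the 2x3 and 3x2 values are illustrative only:

    #include "nn_tensor_creation.h"
    #include "nn_print.h"
    #include "nn_mm.h"

    static void mm_example(void) {
      float a_data[6] = {1, 2, 3, 4, 5, 6};   /* 2x3 */
      float b_data[6] = {1, 0, 0, 1, 1, 1};   /* 3x2 */

      Tensor *a = NN_tensor(2, (size_t[]){2, 3}, DTYPE_F32, a_data);
      Tensor *b = NN_tensor(2, (size_t[]){3, 2}, DTYPE_F32, b_data);
      Tensor *c = NN_tensor(2, (size_t[]){2, 2}, DTYPE_F32, NULL);  /* allocated */

      NN_mm(c, a, b);   /* c[i][j] = dot(row i of a, column j of b) */
      NN_printf(c);     /* expected values: 4, 5, 10, 11 */
    }
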
+ * + * C = A @ B.T + * + * @param out: the output tensor of shape (m, n) + * @param a: the input tensor of shape (m, k) + * @param b: the input tensor of shape (n, k) + */ +void NN_mm_t(Tensor *out, Tensor *a, Tensor *b); + + +#endif // __NN_MM_H diff --git a/nn/src/nn_mul.c b/nn/functional/nn_mul.c similarity index 70% rename from nn/src/nn_mul.c rename to nn/functional/nn_mul.c index cdba6f6..cfc13dd 100644 --- a/nn/src/nn_mul.c +++ b/nn/functional/nn_mul.c @@ -11,7 +11,7 @@ void NN_mul(Tensor *out, Tensor *a, Tensor *b) { switch (out->dtype) { case DTYPE_F32: - NN__mul_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__mul_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; default: @@ -19,7 +19,7 @@ void NN_mul(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation of tensor with dtype %s = %s * %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } @@ -31,7 +31,7 @@ void NN_mul1(Tensor *out, Tensor *in, float scalar) { switch (out->dtype) { case DTYPE_F32: - NN__mul1_F32(out->size, (float *)out->data, (float *)in->data, scalar); + NN__mul1_f32(out->size, (float *)out->data, 1, (float *)in->data, 1, scalar); return; default: @@ -39,7 +39,7 @@ void NN_mul1(Tensor *out, Tensor *in, float scalar) { } printf("[ERROR] Unsupported operation of tensor with dtype %s = %s * scalar\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(in->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(in->dtype) ); } diff --git a/nn/inc/nn_mul.h b/nn/functional/nn_mul.h similarity index 93% rename from nn/inc/nn_mul.h rename to nn/functional/nn_mul.h index 572df79..802649e 100644 --- a/nn/inc/nn_mul.h +++ b/nn/functional/nn_mul.h @@ -5,8 +5,8 @@ #include #include "nn_tensor.h" -#include "ops/mul.h" -#include "ops/mul1.h" +#include "mul.h" +#include "mul1.h" /** diff --git a/nn/src/nn_neg.c b/nn/functional/nn_neg.c similarity index 68% rename from nn/src/nn_neg.c rename to nn/functional/nn_neg.c index 5501893..0acfab2 100644 --- a/nn/src/nn_neg.c +++ b/nn/functional/nn_neg.c @@ -10,7 +10,7 @@ void NN_neg(Tensor *out, Tensor *in) { switch (out->dtype) { case DTYPE_F32: - NN__neg_F32(out->size, (float *)out->data, (float *)in->data); + NN__neg_f32(out->size, (float *)out->data, 1, (float *)in->data, 1); return; default: @@ -18,10 +18,10 @@ void NN_neg(Tensor *out, Tensor *in) { } printf("[ERROR] Unsupported operation of tensor with dtype %s = -%s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(in->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(in->dtype) ); } -void NN_negInplace(Tensor *tensor) { +void NN_neg_inplace(Tensor *tensor) { NN_neg(tensor, tensor); } diff --git a/nn/inc/nn_neg.h b/nn/functional/nn_neg.h similarity index 84% rename from nn/inc/nn_neg.h rename to nn/functional/nn_neg.h index a433a53..4e8e20f 100644 --- a/nn/inc/nn_neg.h +++ b/nn/functional/nn_neg.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/neg.h" +#include "neg.h" /** @@ -17,7 +17,7 @@ */ void NN_neg(Tensor *out, Tensor *input); -void NN_negInplace(Tensor *tensor); +void NN_neg_inplace(Tensor *tensor); #endif // __NN_NEG_H diff --git a/nn/src/nn_matrixnorm.c b/nn/functional/nn_norm.c similarity index 88% rename from nn/src/nn_matrixnorm.c rename to nn/functional/nn_norm.c index 16a07e6..722c5f6 100644 --- a/nn/src/nn_matrixnorm.c +++ 
b/nn/functional/nn_norm.c @@ -1,13 +1,13 @@ -#include "nn_matrixnorm.h" +#include "nn_norm.h" #ifdef RVV #include #endif -void NN_matrixNorm(Tensor *scalar, Tensor *x) { +void NN_norm(Tensor *scalar, Tensor *x) { assert(x->ndim == 2); - assert(NN_isScalar(scalar)); + assert(NN_is_scalar(scalar)); assert(scalar->dtype == x->dtype); switch (x->dtype) { @@ -20,7 +20,7 @@ void NN_matrixNorm(Tensor *scalar, Tensor *x) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = ||%s||\n", - NN_getDataTypeName(scalar->dtype), NN_getDataTypeName(x->dtype) + NN_get_datatype_name(scalar->dtype), NN_get_datatype_name(x->dtype) ); } diff --git a/nn/functional/nn_norm.h b/nn/functional/nn_norm.h new file mode 100644 index 0000000..4dd6ca4 --- /dev/null +++ b/nn/functional/nn_norm.h @@ -0,0 +1,18 @@ +#ifndef __NN_NORM_H +#define __NN_NORM_H + +#include +#include + +#include "nn_tensor.h" + + +/** + * Computes the Frobenius norm of a matrix. + * + * @param tensor: the input tensor of shape (m, n) + */ +void NN_norm(Tensor *scalar, Tensor *x); + + +#endif // __NN_NORM_H diff --git a/nn/src/nn_print.c b/nn/functional/nn_print.c similarity index 53% rename from nn/src/nn_print.c rename to nn/functional/nn_print.c index c3b454e..1d7ed95 100644 --- a/nn/src/nn_print.c +++ b/nn/functional/nn_print.c @@ -2,7 +2,11 @@ #include "nn_print.h" -void NN_printFloat(float v, int16_t num_digits) { +void NN_print_f16(float16_t v, int16_t num_digits) { + NN_print_f32(NN_half_to_float(v), num_digits); +} + +void NN_print_f32(float v, int16_t num_digits) { if (isinf(v)) { if (signbit(v)) { printf("-inf"); @@ -38,7 +42,7 @@ void NN_printFloat(float v, int16_t num_digits) { } } -void NN_printShape(Tensor *tensor) { +void NN_print_shape(Tensor *tensor) { printf("("); for (size_t i = 0; i < tensor->ndim; i += 1) { printf("%d", (int)tensor->shape[i]); @@ -60,19 +64,28 @@ void NN_printf(Tensor *tensor) { for (size_t i=0; ishape[0]; i+=1) { switch (tensor->dtype) { case DTYPE_U8: - printf("%d", *((uint8_t *)tensor->data + i)); + NN_print_u8(*((uint8_t *)tensor->data + i)); break; case DTYPE_I8: - printf("%d", *((int8_t *)tensor->data + i)); + NN_print_i8(*((int8_t *)tensor->data + i)); + break; + case DTYPE_U16: + NN_print_u16(*((uint16_t *)tensor->data + i)); + break; + case DTYPE_I16: + NN_print_i16(*((int16_t *)tensor->data + i)); + break; + case DTYPE_U32: + NN_print_u32(*((uint32_t *)tensor->data + i)); break; case DTYPE_I32: - printf("%ld", (size_t)(*((int32_t *)tensor->data + i))); + NN_print_i32(*((int32_t *)tensor->data + i)); break; case DTYPE_F16: - NN_printFloat(NN_halfToFloat(*((float16_t *)tensor->data + i)), 3); + NN_print_f16(*((float16_t *)tensor->data + i), 3); break; case DTYPE_F32: - NN_printFloat(*((float *)tensor->data + i), 4); + NN_print_f32(*((float *)tensor->data + i), 4); break; } if (i < tensor->shape[0]-1) { @@ -90,19 +103,28 @@ void NN_printf(Tensor *tensor) { for (size_t j=0; jshape[1]; j+=1) { switch (tensor->dtype) { case DTYPE_U8: - printf("%d", *((uint8_t *)tensor->data + i*tensor->shape[1] + j)); + NN_print_u8(*((uint8_t *)tensor->data + i*tensor->shape[1] + j)); break; case DTYPE_I8: - printf("%d", *((int8_t *)tensor->data + i*tensor->shape[1] + j)); + NN_print_i8(*((int8_t *)tensor->data + i*tensor->shape[1] + j)); + break; + case DTYPE_U16: + NN_print_u16(*((uint16_t *)tensor->data + i*tensor->shape[1] + j)); + break; + case DTYPE_I16: + NN_print_i16(*((int16_t *)tensor->data + i*tensor->shape[1] + j)); + break; + case DTYPE_U32: + NN_print_u32(*((uint32_t *)tensor->data + 
i*tensor->shape[1] + j)); break; case DTYPE_I32: - printf("%ld", (size_t)(*((int32_t *)tensor->data + i*tensor->shape[1] + j))); + NN_print_i32(*((int32_t *)tensor->data + i*tensor->shape[1] + j)); break; case DTYPE_F16: - NN_printFloat(NN_halfToFloat(*((float16_t *)tensor->data + i*tensor->shape[1] + j)), 3); + NN_print_f16(*((float16_t *)tensor->data + i*tensor->shape[1] + j), 3); break; case DTYPE_F32: - NN_printFloat(*((float *)tensor->data + i*tensor->shape[1] + j), 4); + NN_print_f32(*((float *)tensor->data + i*tensor->shape[1] + j), 4); break; } if (j < tensor->shape[1]-1) { @@ -130,19 +152,28 @@ void NN_printf(Tensor *tensor) { for (size_t k=0; kshape[2]; k+=1) { switch (tensor->dtype) { case DTYPE_U8: - printf("%d", *((uint8_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); + NN_print_u8(*((uint8_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); break; case DTYPE_I8: - printf("%d", *((int8_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); + NN_print_i8(*((int8_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); + break; + case DTYPE_U16: + NN_print_u16(*((uint16_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); + break; + case DTYPE_I16: + NN_print_i16(*((int16_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); + break; + case DTYPE_U32: + NN_print_u32(*((uint32_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); break; case DTYPE_I32: - printf("%ld", (size_t)(*((int32_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k))); + NN_print_i32(*((int32_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)); break; case DTYPE_F16: - NN_printFloat(NN_halfToFloat(*((float16_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k)), 3); + NN_print_f16(*((float16_t *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k), 3); break; case DTYPE_F32: - NN_printFloat(*((float *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k), 4); + NN_print_f32(*((float *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k), 4); break; } if (k < tensor->shape[2]-1) { @@ -174,19 +205,28 @@ void NN_printf(Tensor *tensor) { for (size_t w = 0; w < tensor->shape[3]; w += 1) { switch (tensor->dtype) { case DTYPE_U8: - printf("%d", *((uint8_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); + NN_print_u8(*((uint8_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); break; case DTYPE_I8: - printf("%d", *((int8_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); + NN_print_i8(*((int8_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); + break; + case DTYPE_U16: + NN_print_u16(*((uint16_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); + break; + case DTYPE_I16: + NN_print_i16(*((int16_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); + 
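The NN_printf rework above swaps the raw printf calls for per-dtype NN_print_* helpers declared in the new nn_print.h, so supporting another integer width only touches that header. Typical use, sketched with illustrative shapes:

    #include "nn_tensor_creation.h"
    #include "nn_print.h"

    static void print_example(void) {
      Tensor *t = NN_zeros(2, (size_t[]){2, 3}, DTYPE_F32);
      NN_print_shape(t);  /* prints the shape, e.g. (2, 3) */
      NN_printf(t);       /* prints the elements; f32 uses 4 digits, f16 uses 3 */
    }
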
break; + case DTYPE_U32: + NN_print_u32(*((uint32_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); break; case DTYPE_I32: - printf("%ld", (size_t)(*((int32_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w))); + NN_print_i32(*((int32_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)); break; case DTYPE_F16: - NN_printFloat(NN_halfToFloat(*((float16_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w)), 3); + NN_print_f16(*((float16_t *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w), 3); break; case DTYPE_F32: - NN_printFloat(*((float *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w), 4); + NN_print_f32(*((float *)tensor->data + n*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + c*tensor->shape[2]*tensor->shape[3] + h*tensor->shape[3] + w), 4); break; } if (w < tensor->shape[3]-1) { diff --git a/nn/functional/nn_print.h b/nn/functional/nn_print.h new file mode 100644 index 0000000..aeae419 --- /dev/null +++ b/nn/functional/nn_print.h @@ -0,0 +1,41 @@ +#ifndef __NN_PRINT_H +#define __NN_PRINT_H + +#include + +#include "nn_tensor.h" + +static inline void NN_print_u8(uint8_t v) { + printf("%d", v); +} + +static inline void NN_print_i8(int8_t v) { + printf("%d", v); +} + +static inline void NN_print_u16(uint16_t v) { + printf("%d", v); +} + +static inline void NN_print_i16(int16_t v) { + printf("%d", v); +} + +static inline void NN_print_u32(uint32_t v) { + printf("%ld", (size_t)v); +} + +static inline void NN_print_i32(int32_t v) { + printf("%ld", (size_t)v); +} + +void NN_print_f16(float16_t v, int16_t num_digits); + +void NN_print_f32(float v, int16_t num_digits); + +void NN_print_shape(Tensor *t); + +void NN_printf(Tensor *t); + + +#endif // __NN_PRINT_H diff --git a/nn/functional/nn_relu.c b/nn/functional/nn_relu.c new file mode 100644 index 0000000..a7c016d --- /dev/null +++ b/nn/functional/nn_relu.c @@ -0,0 +1,29 @@ + +#include "nn_relu.h" + + +void NN_relu(Tensor *y, Tensor *x) { + assert(y->ndim == x->ndim); + assert(y->dtype == x->dtype); + assert(y->size == x->size); + + switch (y->dtype) { + case DTYPE_F16: + NN__maximum1_f16(y->size, (float *)y->data, 1, (float *)x->data, 1, 0.0f); + return; + case DTYPE_F32: + NN__maximum1_f32(y->size, (float *)y->data, 1, (float *)x->data, 1, 0.0f); + return; + + default: + break; + } + + printf("[ERROR] Unsupported operation between tensor with dtype %s = ReLU(%s)\n", + NN_get_datatype_name(y->dtype), NN_get_datatype_name(x->dtype) + ); +} + +void NN_relu_inplace(Tensor *x) { + NN_relu(x, x); +} diff --git a/nn/inc/nn_relu.h b/nn/functional/nn_relu.h similarity index 74% rename from nn/inc/nn_relu.h rename to nn/functional/nn_relu.h index 420b81c..39d3d37 100644 --- a/nn/inc/nn_relu.h +++ b/nn/functional/nn_relu.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/maximum1.h" +#include "maximum1.h" /** @@ -15,8 +15,8 @@ * @param y: the output tensor * @param x: the input tensor */ -void NN_ReLU(Tensor *y, Tensor *x); +void NN_relu(Tensor *y, Tensor *x); -void NN_ReLUInplace(Tensor *x); +void NN_relu_inplace(Tensor *x); #endif 
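Since NN_relu above is just an element-wise maximum against the scalar 0 (via NN__maximum1_*), calling it with the same tensor as input and output is safe, which is all NN_relu_inplace does. A minimal sketch with illustrative values:

    #include "nn_tensor_creation.h"
    #include "nn_relu.h"

    static void relu_example(void) {
      float x_data[4] = {-1.0f, 0.5f, -2.0f, 3.0f};
      Tensor *x = NN_tensor(1, (size_t[]){4}, DTYPE_F32, x_data);

      NN_relu_inplace(x);   /* same as NN_relu(x, x) */
      /* x_data is now {0.0f, 0.5f, 0.0f, 3.0f} */
    }
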
// __NN_RELU_H diff --git a/nn/functional/nn_relu6.c b/nn/functional/nn_relu6.c new file mode 100644 index 0000000..c1c237d --- /dev/null +++ b/nn/functional/nn_relu6.c @@ -0,0 +1,27 @@ + +#include "nn_relu6.h" + + +void NN_relu6(Tensor *y, Tensor *x) { + assert(y->ndim == x->ndim); + assert(y->dtype == x->dtype); + assert(y->size == x->size); + + switch (y->dtype) { + case DTYPE_F32: + NN__maximum1_f32(y->size, (float *)y->data, 1, (float *)x->data, 1, 0.0f); + NN__minimum1_f32(y->size, (float *)y->data, 1, (float *)y->data, 1, 6.0f); + return; + + default: + break; + } + + printf("[ERROR] Unsupported operation between tensor with dtype %s = ReLU(%s)\n", + NN_get_datatype_name(y->dtype), NN_get_datatype_name(x->dtype) + ); +} + +void NN_relu6_inplace(Tensor *x) { + NN_relu6(x, x); +} diff --git a/nn/inc/nn_relu6.h b/nn/functional/nn_relu6.h similarity index 69% rename from nn/inc/nn_relu6.h rename to nn/functional/nn_relu6.h index 460fe5c..60568e9 100644 --- a/nn/inc/nn_relu6.h +++ b/nn/functional/nn_relu6.h @@ -4,8 +4,8 @@ #include #include "nn_tensor.h" -#include "ops/maximum1.h" -#include "ops/minimum1.h" +#include "maximum1.h" +#include "minimum1.h" /** @@ -16,9 +16,9 @@ * @param y: the output tensor * @param x: the input tensor */ -void NN_ReLU6(Tensor *y, Tensor *x); +void NN_relu6(Tensor *y, Tensor *x); -void NN_ReLU6Inplace(Tensor *x); +void NN_relu6_inplace(Tensor *x); #endif // __NN_RELU6_H diff --git a/nn/src/nn_softmax.c b/nn/functional/nn_softmax.c similarity index 67% rename from nn/src/nn_softmax.c rename to nn/functional/nn_softmax.c index f021b6c..54ac03b 100644 --- a/nn/src/nn_softmax.c +++ b/nn/functional/nn_softmax.c @@ -2,34 +2,36 @@ #include "nn_softmax.h" -void NN_Softmax(Tensor *out, Tensor *tensor, size_t dim) { +void NN_softmax(Tensor *out, Tensor *tensor, int dim) { assert(out->dtype == tensor->dtype); + if (dim < 0) { + dim = out->ndim + dim; + } + switch (tensor->dtype) { case DTYPE_F32: if (dim == 0) { - for (size_t i = 0; i < tensor->shape[0]; i += 1) { - float *x = (float *)tensor->data + i * tensor->shape[1]; - float *y = (float *)out->data + i * out->shape[1]; - NN__softmax_F32(tensor->shape[1], y, x, 1); + for (size_t i = 0; i < tensor->shape[1]; i += 1) { + float *x = (float *)tensor->data + i; + float *y = (float *)out->data + i; + NN__softmax_f32(tensor->shape[0], y, tensor->shape[1], x, tensor->shape[1]); } return; } if (dim == 1) { - for (size_t i = 0; i < tensor->shape[1]; i += 1) { - float *x = (float *)tensor->data + i; - float *y = (float *)out->data + i; - NN__softmax_F32(tensor->shape[0], y, x, tensor->shape[1]); + for (size_t i = 0; i < tensor->shape[0]; i += 1) { + float *x = (float *)tensor->data + i * tensor->shape[1]; + float *y = (float *)out->data + i * out->shape[1]; + NN__softmax_f32(tensor->shape[1], y, 1, x, 1); } return; } - - // NN__softmax_F32(tensor->size, (float *)out->data, (float *)tensor->data); break; default: printf("[ERROR] Unsupported operation of tensor with dtype %s = softmax(%s)\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(tensor->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(tensor->dtype) ); } } diff --git a/nn/inc/nn_softmax.h b/nn/functional/nn_softmax.h similarity index 69% rename from nn/inc/nn_softmax.h rename to nn/functional/nn_softmax.h index 7185afb..a4acfac 100644 --- a/nn/inc/nn_softmax.h +++ b/nn/functional/nn_softmax.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/softmax.h" +#include "softmax.h" /** @@ -12,8 +12,9 @@ * * @param out: the output scalar 
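The reworked NN_softmax above follows the usual dim convention for the 2-D f32 case: a negative dim wraps around (dim = ndim + dim), dim == 1 normalizes each row with unit stride, and dim == 0 normalizes each column by striding with shape[1]. A sketch of the expected call pattern, with illustrative values:

    #include "nn_tensor_creation.h"
    #include "nn_softmax.h"

    static void softmax_example(void) {
      float x_data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      Tensor *x   = NN_tensor(2, (size_t[]){2, 2}, DTYPE_F32, x_data);
      Tensor *out = NN_tensor(2, (size_t[]){2, 2}, DTYPE_F32, NULL);

      NN_softmax(out, x, -1);  /* same as dim == 1: each row of out sums to 1 */
      NN_softmax(out, x, 0);   /* each column of out sums to 1 */
    }
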
tensor * @param tensor: the input tensor + * @param dim: the dimension to reduce */ -void NN_Softmax(Tensor *out, Tensor *tensor, size_t dim); +void NN_softmax(Tensor *out, Tensor *tensor, int dim); #endif // __NN_SOFTMAX_H diff --git a/nn/src/nn_sub.c b/nn/functional/nn_sub.c similarity index 77% rename from nn/src/nn_sub.c rename to nn/functional/nn_sub.c index 5cc8d04..bc90c44 100644 --- a/nn/src/nn_sub.c +++ b/nn/functional/nn_sub.c @@ -8,19 +8,19 @@ void NN_sub(Tensor *out, Tensor *a, Tensor *b) { switch (out->ndim) { case 1: - NN_sub_1D(out, a, b); + NN_sub_1d(out, a, b); return; case 2: - NN_sub_2D(out, a, b); + NN_sub_2d(out, a, b); return; case 3: - NN_sub_3D(out, a, b); + NN_sub_3d(out, a, b); return; case 4: - NN_sub_4D(out, a, b); + NN_sub_4d(out, a, b); return; default: @@ -29,47 +29,47 @@ void NN_sub(Tensor *out, Tensor *a, Tensor *b) { } } -void NN_sub_1D(Tensor *out, Tensor *a, Tensor *b) { +void NN_sub_1d(Tensor *out, Tensor *a, Tensor *b) { assert(out->shape[0] == a->shape[0] || out->shape[0] == b->shape[0]); if (out->dtype == DTYPE_F32 && a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32) { - NN__sub_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__sub_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; } if (out->dtype == DTYPE_I8 && a->dtype == DTYPE_I8 && b->dtype == DTYPE_I8) { - NN__sub_I8(out->size, (int8_t *)out->data, (int8_t *)a->data, (int8_t *)b->data); + NN__sub_i8(out->size, (int8_t *)out->data, 1, (int8_t *)a->data, 1, (int8_t *)b->data, 1); return; } printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } -void NN_sub_2D(Tensor *out, Tensor *a, Tensor *b) { +void NN_sub_2d(Tensor *out, Tensor *a, Tensor *b) { assert(out->shape[0] == a->shape[0] || out->shape[0] == b->shape[0]); assert(out->shape[1] == a->shape[1] || out->shape[1] == b->shape[1]); if (a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1]) { if (out->dtype == DTYPE_F32 && a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32) { - NN__sub_F32(out->size, (float *)out->data, (float *)a->data, (float *)b->data); + NN__sub_f32(out->size, (float *)out->data, 1, (float *)a->data, 1, (float *)b->data, 1); return; } if (out->dtype == DTYPE_I8 && a->dtype == DTYPE_I8 && b->dtype == DTYPE_I8) { - NN__sub_I8(out->size, (int8_t *)out->data, (int8_t *)a->data, (int8_t *)b->data); + NN__sub_i8(out->size, (int8_t *)out->data, 1, (int8_t *)a->data, 1, (int8_t *)b->data, 1); return; } if (out->dtype == DTYPE_I16 && a->dtype == DTYPE_I16 && b->dtype == DTYPE_I16) { - NN__sub_I16(out->size, (int16_t *)out->data, (int16_t *)a->data, (int16_t *)b->data); + NN__sub_i16(out->size, (int16_t *)out->data, 1, (int16_t *)a->data, 1, (int16_t *)b->data, 1); return; } if (out->dtype == DTYPE_I32 && a->dtype == DTYPE_I32 && b->dtype == DTYPE_I32) { - NN__sub_I32(out->size, (int32_t *)out->data, (int32_t *)a->data, (int32_t *)b->data); + NN__sub_i32(out->size, (int32_t *)out->data, 1, (int32_t *)a->data, 1, (int32_t *)b->data, 1); return; } if (out->dtype == DTYPE_U8 && a->dtype == DTYPE_U8 && b->dtype == DTYPE_U8) { - NN__sub_U8(out->size, (uint8_t *)out->data, (uint8_t *)a->data, (uint8_t *)b->data); + NN__sub_u8(out->size, (uint8_t *)out->data, 1, (uint8_t *)a->data, 1, (uint8_t *)b->data, 1); return; } } @@ -92,11 +92,11 @@ void 
NN_sub_2D(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } -void NN_sub_3D(Tensor *out, Tensor *a, Tensor *b) { +void NN_sub_3d(Tensor *out, Tensor *a, Tensor *b) { assert(out->shape[0] == a->shape[0] || out->shape[0] == b->shape[0]); assert(out->shape[1] == a->shape[1] || out->shape[1] == b->shape[1]); assert(out->shape[2] == a->shape[2] || out->shape[2] == b->shape[2]); @@ -122,11 +122,11 @@ void NN_sub_3D(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } -void NN_sub_4D(Tensor *out, Tensor *a, Tensor *b) { +void NN_sub_4d(Tensor *out, Tensor *a, Tensor *b) { assert(out->shape[0] == a->shape[0] || out->shape[0] == b->shape[0]); assert(out->shape[1] == a->shape[1] || out->shape[1] == b->shape[1]); assert(out->shape[2] == a->shape[2] || out->shape[2] == b->shape[2]); @@ -157,7 +157,7 @@ void NN_sub_4D(Tensor *out, Tensor *a, Tensor *b) { } printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype), NN_get_datatype_name(b->dtype) ); } diff --git a/nn/inc/nn_sub.h b/nn/functional/nn_sub.h similarity index 65% rename from nn/inc/nn_sub.h rename to nn/functional/nn_sub.h index 20a7a4e..687e5dd 100644 --- a/nn/inc/nn_sub.h +++ b/nn/functional/nn_sub.h @@ -6,7 +6,7 @@ #include "nn_tensor.h" #include "nn_print.h" -#include "ops/sub.h" +#include "sub.h" /** @@ -22,13 +22,13 @@ */ void NN_sub(Tensor *out, Tensor *a, Tensor *b); -void NN_sub_1D(Tensor *out, Tensor *a, Tensor *b); +void NN_sub_1d(Tensor *out, Tensor *a, Tensor *b); -void NN_sub_2D(Tensor *out, Tensor *a, Tensor *b); +void NN_sub_2d(Tensor *out, Tensor *a, Tensor *b); -void NN_sub_3D(Tensor *out, Tensor *a, Tensor *b); +void NN_sub_3d(Tensor *out, Tensor *a, Tensor *b); -void NN_sub_4D(Tensor *out, Tensor *a, Tensor *b); +void NN_sub_4d(Tensor *out, Tensor *a, Tensor *b); #endif // __NN_SUB_H diff --git a/nn/functional/nn_sum.c b/nn/functional/nn_sum.c new file mode 100644 index 0000000..305c52e --- /dev/null +++ b/nn/functional/nn_sum.c @@ -0,0 +1,65 @@ + +#include "nn_sum.h" + + +void NN_sum(Tensor *out, Tensor *tensor) { + int32_t result_i32; + switch (tensor->dtype) { + case DTYPE_U8: + NN__sum_u8_to_i32(tensor->size, &result_i32, (uint8_t *)tensor->data, 1); + switch (out->dtype) { + case DTYPE_U16: + *(uint16_t *)out->data = (uint16_t)result_i32; + return; + case DTYPE_U32: + *(uint32_t *)out->data = (uint32_t)result_i32; + return; + case DTYPE_I32: + *(int32_t *)out->data = result_i32; + return; + default: + break; + } + break; + + case DTYPE_I16: + NN__sum_i16_to_i32(tensor->size, &result_i32, (int16_t *)tensor->data, 1); + switch (out->dtype) { + case DTYPE_I16: + *(int16_t *)out->data = (int16_t)result_i32; + return; + case DTYPE_I32: + *(int32_t *)out->data = result_i32; + return; + default: + break; + } + break; + + case DTYPE_I32: + switch (out->dtype) { + case DTYPE_I32: + NN__sum_i32(tensor->size, (int32_t 
*)out->data, (int32_t *)tensor->data, 1); + return; + default: + break; + } + break; + + case DTYPE_F32: + switch (out->dtype) { + case DTYPE_F32: + NN__sum_f32(tensor->size, (float *)out->data, (float *)tensor->data, 1); + return; + default: + break; + } + + default: + break; + } + + printf("[ERROR] Unsupported operation of tensor with dtype %s = sum(%s)\n", + NN_get_datatype_name(out->dtype), NN_get_datatype_name(tensor->dtype) + ); +} diff --git a/nn/inc/nn_sum.h b/nn/functional/nn_sum.h similarity index 93% rename from nn/inc/nn_sum.h rename to nn/functional/nn_sum.h index 67d76df..f9cbf0b 100644 --- a/nn/inc/nn_sum.h +++ b/nn/functional/nn_sum.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/sum.h" +#include "sum.h" /** diff --git a/nn/src/nn_fill.c b/nn/functional/nn_tensor_creation.c similarity index 57% rename from nn/src/nn_fill.c rename to nn/functional/nn_tensor_creation.c index 115a768..0edf2db 100644 --- a/nn/src/nn_fill.c +++ b/nn/functional/nn_tensor_creation.c @@ -1,27 +1,39 @@ -#include "nn_fill.h" +#include "nn_tensor_creation.h" -void NN_fill(Tensor *tensor, float value) { - switch (tensor->dtype) { - case DTYPE_U8: - NN__fill_U8(tensor->size, (uint8_t *)tensor->data, (uint8_t)value); - return; - case DTYPE_I8: - NN__fill_I8(tensor->size, (int8_t *)tensor->data, (int8_t)value); - return; - case DTYPE_I32: - NN__fill_I32(tensor->size, (int32_t *)tensor->data, (int32_t)value); - return; - case DTYPE_F16: - NN__fill_F16(tensor->size, (float16_t *)tensor->data, NN_floatToHalf(value)); - return; - case DTYPE_F32: - NN__fill_F32(tensor->size, (float *)tensor->data, value); - return; - default: - printf("[ERROR] Unsupported operation fill to tensor with dtype: %d\n", tensor->dtype); +void NN_init_tensor(Tensor *tensor, const size_t ndim, const size_t *shape, DataType dtype, void *data) { + tensor->dtype = dtype; + tensor->ndim = ndim; + + // set shape + memcpy(tensor->shape, shape, ndim * sizeof(size_t)); + memset(tensor->shape + ndim, 0, (MAX_DIMS - ndim) * sizeof(size_t)); + + // calculate size (number of elements) + tensor->size = 1; + for (size_t i = 0; i < ndim; i += 1) { + tensor->size *= shape[i]; } + + if (data != NULL) { + tensor->data = data; + return; + } + + // if this is a scalar tensor + if (tensor->ndim == 0) { + tensor->data = malloc(NN_sizeof(dtype)); + return; + } + + tensor->data = malloc(NN_sizeof(dtype) * tensor->size); +} + +Tensor *NN_tensor(size_t ndim, const size_t *shape, DataType dtype, void *data) { + Tensor *t = (Tensor *)malloc(sizeof(Tensor)); + NN_init_tensor(t, ndim, shape, dtype, data); + return t; } Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype) { diff --git a/nn/functional/nn_tensor_creation.h b/nn/functional/nn_tensor_creation.h new file mode 100644 index 0000000..daa9c7a --- /dev/null +++ b/nn/functional/nn_tensor_creation.h @@ -0,0 +1,84 @@ +#ifndef __NN_TENSOR_CREATION +#define __NN_TENSOR_CREATION + +#include +#include +#include +#include +#include +#include + +#include "nn_tensor.h" +#include "nn_fill.h" + + +/** + * Initialize a given tensor + * + * The memory is initialized in C order, i.e., the last dimension is contiguous. 
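In the new NN_init_tensor above, the shape is copied into the fixed-size shape array, size is the product of the dimensions, and memory is only malloc'd when data is NULL; NN_tensor is the heap-allocating wrapper around it. A short sketch under those assumptions, with illustrative shapes:

    #include "nn_tensor_creation.h"

    static void creation_example(void) {
      /* Wrap caller-owned storage: no allocation for the data pointer. */
      float buf[6];
      Tensor *view = NN_tensor(2, (size_t[]){2, 3}, DTYPE_F32, buf);

      /* data == NULL: NN_init_tensor mallocs NN_sizeof(dtype) * size bytes. */
      Tensor *owned = NN_tensor(2, (size_t[]){2, 3}, DTYPE_F32, NULL);

      /* Convenience constructor that also zero-fills. */
      Tensor *zeros = NN_zeros(2, (size_t[]){2, 3}, DTYPE_F32);

      (void)view; (void)owned; (void)zeros;
    }
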
+ * + * @param ndim: number of dimensions + * @param shape: shape of tensor + * @param dtype: data type + * @param data: pointer to data, if NULL, the data will be allocated + */ +void NN_init_tensor(Tensor *tensor, const size_t ndim, const size_t *shape, DataType dtype, void *data); + +/** + * Create a new tensor + * + * @param ndim: number of dimensions + * @param shape: shape of tensor + * @param dtype: data type + * @param data: pointer to data, if NULL, the data will be allocated + * @return Tensor +*/ +Tensor *NN_tensor(size_t ndim, const size_t *shape, DataType dtype, void *data); + +/** + * Returns a tensor filled with the scalar value 0. + * + * @param ndim: number of dimensions + * @param shape: shape of tensor + * @param dtype: data type + * @return Tensor + */ +Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype); + +/** + * Returns a tensor filled with the scalar value 1. + * + * @param ndim: number of dimensions + * @param shape: shape of tensor + * @param dtype: data type + * @return Tensor + */ +Tensor *NN_ones(size_t ndim, const size_t *shape, DataType dtype); + +/** + * Returns a tensor filled with random numbers from a uniform distribution. + * + * The range of the random number is dependent on the data type: + * - For Float32, the range is [0, 1] + * - For Int8, the range is [0, 255] + * - For Int32, the range is [0, RAND_MAX] + * + * @param ndim: number of dimensions + * @param shape: shape of tensor + * @param dtype: data type + * @return Tensor + */ +Tensor *NN_rand(size_t ndim, const size_t *shape, DataType dtype); + +/** + * Returns this tensor cast to the type of the given tensor. + * + * This is equivalent to NN_copy() if the data types are the same. + * + * @param out: the output tensor + * @param in: the input tensor + */ +void NN_as_type(Tensor *out, Tensor *in); + + +#endif // __NN_TENSOR_CREATION \ No newline at end of file diff --git a/nn/src/nn_transpose.c b/nn/functional/nn_transpose.c similarity index 82% rename from nn/src/nn_transpose.c rename to nn/functional/nn_transpose.c index b8e22f7..d47dae1 100644 --- a/nn/src/nn_transpose.c +++ b/nn/functional/nn_transpose.c @@ -18,12 +18,12 @@ void NN_transpose(Tensor *out, Tensor *a) { assert(out->shape[1] == a->shape[0]); if (a->dtype == DTYPE_F32) { - NN__transpose_F32(a->shape[0], a->shape[1], (float *)out->data, (float *)a->data); + NN__transpose_f32(a->shape[0], a->shape[1], (float *)out->data, (float *)a->data); return; } printf("[ERROR] Unsupported operation of tensor with dtype %s = %s.T\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype) + NN_get_datatype_name(out->dtype), NN_get_datatype_name(a->dtype) ); } diff --git a/nn/inc/nn_transpose.h b/nn/functional/nn_transpose.h similarity index 93% rename from nn/inc/nn_transpose.h rename to nn/functional/nn_transpose.h index bf820c2..6019209 100644 --- a/nn/inc/nn_transpose.h +++ b/nn/functional/nn_transpose.h @@ -4,7 +4,7 @@ #include #include "nn_tensor.h" -#include "ops/transpose.h" +#include "transpose.h" /** * Transpose a 2D tensor diff --git a/nn/src/nn_unfold.c b/nn/functional/nn_unfold.c similarity index 93% rename from nn/src/nn_unfold.c rename to nn/functional/nn_unfold.c index fe5a8e0..a1777e7 100644 --- a/nn/src/nn_unfold.c +++ b/nn/functional/nn_unfold.c @@ -11,24 +11,24 @@ // // const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; // // const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; // // const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; -// // const bool is_2D = ((const int32_t 
*)(dst->op_params))[6] == 1; +// // const bool is_2d = ((const int32_t *)(dst->op_params))[6] == 1; // // const int ith = params->ith; // // const int nth = params->nth; -// // const int64_t N = is_2D ? ne13 : ne12; -// // const int64_t IC = is_2D ? ne12 : ne11; -// // const int64_t IH = is_2D ? ne11 : 1; +// // const int64_t N = is_2d ? ne13 : ne12; +// // const int64_t IC = is_2d ? ne12 : ne11; +// // const int64_t IH = is_2d ? ne11 : 1; // // const int64_t IW = ne10; -// // const int64_t KH = is_2D ? ne01 : 1; +// // const int64_t KH = is_2d ? ne01 : 1; // // const int64_t KW = ne00; -// // const int64_t OH = is_2D ? ne2 : 1; +// // const int64_t OH = is_2d ? ne2 : 1; // // const int64_t OW = ne1; -// // int ofs0 = is_2D ? nb13 : nb12; -// // int ofs1 = is_2D ? nb12 : nb11; +// // int ofs0 = is_2d ? nb13 : nb12; +// // int ofs1 = is_2d ? nb12 : nb11; // // GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // // GGML_ASSERT(nb10 == sizeof(float)); diff --git a/nn/inc/nn_unfold.h b/nn/functional/nn_unfold.h similarity index 100% rename from nn/inc/nn_unfold.h rename to nn/functional/nn_unfold.h diff --git a/nn/impl/abs.h b/nn/impl/abs.h new file mode 100644 index 0000000..f707096 --- /dev/null +++ b/nn/impl/abs.h @@ -0,0 +1,37 @@ +#ifndef __NN__ABS_H +#define __NN__ABS_H + +#include +#include +#include + +#include "nn_float16.h" + + +void NN__abs_i8(size_t n, + int8_t *y, size_t incy, + int8_t *x, size_t incx + ); + +void NN__abs_i16(size_t n, + int16_t *y, size_t incy, + int16_t *x, size_t incx + ); + +void NN__abs_i32(size_t n, + int32_t *y, size_t incy, + int32_t *x, size_t incx + ); + +void NN__abs_f16(size_t n, + float16_t *y, size_t incy, + float16_t *x, size_t incx + ); + +void NN__abs_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__ABS_H diff --git a/nn/impl/acc.h b/nn/impl/acc.h new file mode 100644 index 0000000..14bc5ee --- /dev/null +++ b/nn/impl/acc.h @@ -0,0 +1,19 @@ +#ifndef __NN__ACC_H +#define __NN__ACC_H + +#include +#include + + +void NN__acc_i8(size_t n, + int8_t *y, size_t incy, + int8_t *x, size_t incx + ); + +void NN__acc_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__ACC_H diff --git a/nn/impl/acc1.h b/nn/impl/acc1.h new file mode 100644 index 0000000..f3c52af --- /dev/null +++ b/nn/impl/acc1.h @@ -0,0 +1,13 @@ +#ifndef __NN__ACC1_H +#define __NN__ACC1_H + +#include + + +void NN__acc1_f32(size_t n, + float *result, size_t incx, + float scalar + ); + + +#endif // __NN__ADD1_H diff --git a/nn/impl/add.h b/nn/impl/add.h new file mode 100644 index 0000000..b7ecf44 --- /dev/null +++ b/nn/impl/add.h @@ -0,0 +1,29 @@ +#ifndef __NN__ADD_H +#define __NN__ADD_H + +#include +#include + +#include "nn_float16.h" + + +void NN__add_i8(size_t n, + int8_t *z, size_t incz, + int8_t *x, size_t incx, + int8_t *y, size_t incy + ); + +void NN__add_f16(size_t n, + float16_t *z, size_t incz, + float16_t *x, size_t incx, + float16_t *y, size_t incy + ); + +void NN__add_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__ADD_H diff --git a/nn/impl/add1.h b/nn/impl/add1.h new file mode 100644 index 0000000..1726b76 --- /dev/null +++ b/nn/impl/add1.h @@ -0,0 +1,14 @@ +#ifndef __NN__ADD1_H +#define __NN__ADD1_H + +#include + + +void NN__add1_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float scalar + ); + + +#endif // __NN__ADD1_H diff --git a/nn/impl/avx/abs.c b/nn/impl/avx/abs.c new file mode 100644 index 0000000..704c709 --- /dev/null 
+++ b/nn/impl/avx/abs.c @@ -0,0 +1,28 @@ + +#include + +#include "abs.h" + +#ifdef AVX + +// void NN__abs_f32(size_t n, float *result, float *x, size_t incx) { +// // Mask to clear the sign bit +// __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); + +// size_t vl = 8; + +// while (n > 0) { +// size_t count = n < vl ? n : vl; +// // Load input values into an AVX register +// __m256 vec_x = _mm256_loadu_ps(x); +// // Compute the absolute values +// __m256 vec_y = _mm256_and_ps(vec_x, mask); +// // Store the result +// _mm256_storeu_ps(y, vec_y); +// x += count; +// y += count; +// n -= count; +// } +// } + +#endif \ No newline at end of file diff --git a/nn/impl/cpu/abs.c b/nn/impl/cpu/abs.c new file mode 100644 index 0000000..a2e6938 --- /dev/null +++ b/nn/impl/cpu/abs.c @@ -0,0 +1,32 @@ +#include "abs.h" + + +__attribute__((weak)) void NN__abs_i8(size_t n, int8_t *y, size_t incy, int8_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < 0 ? -x[i * incx] : x[i * incx]; + } +} + +__attribute__((weak)) void NN__abs_i16(size_t n, int16_t *y, size_t incy, int16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < 0 ? -x[i * incx] : x[i * incx]; + } +} + +__attribute__((weak)) void NN__abs_i32(size_t n, int32_t *y, size_t incy, int32_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] < 0 ? -x[i * incx] : x[i * incx]; + } +} + +__attribute__((weak)) void NN__abs_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx) { + for (size_t i = 0; i < n; i += incx) { + y[i * incy] = NN_float_to_half(fabsf(NN_half_to_float(x[i * incx]))); + } +} + +__attribute__((weak)) void NN__abs_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = fabsf(x[i * incx]); + } +} diff --git a/nn/impl/cpu/acc.c b/nn/impl/cpu/acc.c new file mode 100644 index 0000000..8366aca --- /dev/null +++ b/nn/impl/cpu/acc.c @@ -0,0 +1,14 @@ +#include "acc.h" + + +__attribute__((weak)) void NN__acc_i8(size_t n, int8_t *y, size_t incy, int8_t *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] += x[i * incx]; + } +} + +__attribute__((weak)) void NN__acc_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] += x[i * incx]; + } +} diff --git a/nn/impl/cpu/acc1.c b/nn/impl/cpu/acc1.c new file mode 100644 index 0000000..3b925a2 --- /dev/null +++ b/nn/impl/cpu/acc1.c @@ -0,0 +1,8 @@ +#include "acc1.h" + + +__attribute__((weak)) void NN__acc1_f32(size_t n, float *result, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += incx) { + result[i] += scalar; + } +} \ No newline at end of file diff --git a/nn/impl/cpu/add.c b/nn/impl/cpu/add.c new file mode 100644 index 0000000..66c16c6 --- /dev/null +++ b/nn/impl/cpu/add.c @@ -0,0 +1,21 @@ +#include "add.h" + + +__attribute__((weak)) void NN__add_i8(size_t n, int8_t *z, size_t incz, int8_t *x, size_t incx, int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] + y[i * incy]; + } +} + +__attribute__((weak)) void NN__add_f16(size_t n, float16_t *z, size_t incz, float16_t *x, size_t incx, float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = NN_float_to_half(NN_half_to_float(x[i * incx]) + NN_half_to_float(y[i * incy])); + } +} + +__attribute__((weak)) void NN__add_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for 
(size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] + y[i * incy]; + } +} + diff --git a/nn/impl/cpu/add1.c b/nn/impl/cpu/add1.c new file mode 100644 index 0000000..331ec24 --- /dev/null +++ b/nn/impl/cpu/add1.c @@ -0,0 +1,8 @@ +#include "add1.h" + + +__attribute__((weak)) void NN__add1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] + scalar; + } +} diff --git a/nn/impl/cpu/div.c b/nn/impl/cpu/div.c new file mode 100644 index 0000000..f52ea6d --- /dev/null +++ b/nn/impl/cpu/div.c @@ -0,0 +1,8 @@ +#include "div.h" + + +__attribute__((weak)) void NN__div_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] / y[i * incy]; + } +} diff --git a/nn/impl/cpu/dot.c b/nn/impl/cpu/dot.c new file mode 100644 index 0000000..33702b3 --- /dev/null +++ b/nn/impl/cpu/dot.c @@ -0,0 +1,18 @@ +#include "dot.h" + + +__attribute__((weak)) void NN__dot_f16(size_t n, float16_t *result, float16_t *x, size_t incx, float16_t *y, size_t incy) { + float sum_f32 = 0; + for (size_t i = 0; i < n; i += 1) { + sum_f32 += NN_half_to_float(x[i * incx]) * NN_half_to_float(y[i * incy]); + } + *result = NN_float_to_half(sum_f32); +} + +__attribute__((weak)) void NN__dot_f32(size_t n, float *result, float *x, size_t incx, float *y, size_t incy) { + float sum = 0.0; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx] * y[i * incy]; + } + *result = sum; +} diff --git a/nn/impl/cpu/fill.c b/nn/impl/cpu/fill.c new file mode 100644 index 0000000..b76dec1 --- /dev/null +++ b/nn/impl/cpu/fill.c @@ -0,0 +1,37 @@ +#include "fill.h" + +__attribute__((weak)) void NN__fill_u8(size_t n, uint8_t *x, size_t incx, uint8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} + +__attribute__((weak)) void NN__fill_i8(size_t n, int8_t *x, size_t incx, int8_t scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} + +__attribute__((weak)) void NN__fill_i16(size_t n, int16_t *x, size_t incx, int16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} + +__attribute__((weak)) void NN__fill_i32(size_t n, int32_t *x, size_t incx, int32_t scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} + +__attribute__((weak)) void NN__fill_f16(size_t n, float16_t *x, size_t incx, float16_t scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} + +__attribute__((weak)) void NN__fill_f32(size_t n, float *x, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += 1) { + x[i * incx] = scalar; + } +} diff --git a/nn/impl/cpu/log.c b/nn/impl/cpu/log.c new file mode 100644 index 0000000..1b9f775 --- /dev/null +++ b/nn/impl/cpu/log.c @@ -0,0 +1,9 @@ +#include "log.h" + + +__attribute__((weak)) void NN__log_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = logf(x[i * incx]); + } +} + diff --git a/nn/impl/cpu/max.c b/nn/impl/cpu/max.c new file mode 100644 index 0000000..77a07a2 --- /dev/null +++ b/nn/impl/cpu/max.c @@ -0,0 +1,12 @@ +#include "max.h" + + +__attribute__((weak)) void NN__max_f32(size_t n, float *result, float *x, size_t incx) { + float max = -FLT_MAX; + for (size_t i = 0; i < n; i += 1) { + float val = x[i * incx]; + max = val > max ? 
val : max; + } + *result = max; +} + diff --git a/nn/impl/cpu/maximum.c b/nn/impl/cpu/maximum.c new file mode 100644 index 0000000..62b7e95 --- /dev/null +++ b/nn/impl/cpu/maximum.c @@ -0,0 +1,11 @@ +#include "maximum.h" + + +__attribute__((weak)) void NN__maximum_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + float x_val = x[i * incx]; + float y_val = y[i * incy]; + z[i * incz] = x_val > y_val ? x_val : y_val; + } +} + diff --git a/nn/impl/cpu/maximum1.c b/nn/impl/cpu/maximum1.c new file mode 100644 index 0000000..c78d7f7 --- /dev/null +++ b/nn/impl/cpu/maximum1.c @@ -0,0 +1,10 @@ +#include "maximum1.h" + + +__attribute__((weak)) void NN__maximum1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += 1) { + float x_val = x[i * incx]; + y[i * incy] = x_val > scalar ? x_val : scalar; + } +} + diff --git a/nn/impl/cpu/min.c b/nn/impl/cpu/min.c new file mode 100644 index 0000000..dc18838 --- /dev/null +++ b/nn/impl/cpu/min.c @@ -0,0 +1,12 @@ +#include "min.h" + + +__attribute__((weak)) void NN__min_f32(size_t n, float *result, float *x, size_t incx) { + float min = FLT_MAX; + for (size_t i = 0; i < n; i += 1) { + float val = x[i * incx]; + min = val < min ? val : min; + } + *result = min; +} + diff --git a/nn/impl/cpu/minimum.c b/nn/impl/cpu/minimum.c new file mode 100644 index 0000000..ccb4789 --- /dev/null +++ b/nn/impl/cpu/minimum.c @@ -0,0 +1,11 @@ +#include "minimum.h" + + +__attribute__((weak)) void NN__minimum_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + float x_val = x[i * incx]; + float y_val = y[i * incy]; + z[i * incz] = x_val < y_val ? x_val : y_val; + } +} + diff --git a/nn/impl/cpu/minimum1.c b/nn/impl/cpu/minimum1.c new file mode 100644 index 0000000..f9974a9 --- /dev/null +++ b/nn/impl/cpu/minimum1.c @@ -0,0 +1,10 @@ +#include "minimum1.h" + + +__attribute__((weak)) void NN__minimum1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += 1) { + float x_val = x[i * incx]; + y[i * incy] = x_val < scalar ? 
x_val : scalar; + } +} + diff --git a/nn/impl/cpu/mul.c b/nn/impl/cpu/mul.c new file mode 100644 index 0000000..d17d1a8 --- /dev/null +++ b/nn/impl/cpu/mul.c @@ -0,0 +1,8 @@ +#include "mul.h" + + +__attribute__((weak)) void NN__mul_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] * y[i * incy]; + } +} diff --git a/nn/impl/cpu/mul1.c b/nn/impl/cpu/mul1.c new file mode 100644 index 0000000..41a364a --- /dev/null +++ b/nn/impl/cpu/mul1.c @@ -0,0 +1,8 @@ +#include "mul1.h" + + +__attribute__((weak)) void NN__mul1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * scalar; + } +} diff --git a/nn/impl/cpu/neg.c b/nn/impl/cpu/neg.c new file mode 100644 index 0000000..7fca209 --- /dev/null +++ b/nn/impl/cpu/neg.c @@ -0,0 +1,8 @@ +#include "neg.h" + + +__attribute__((weak)) void NN__neg_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = -x[i * incx]; + } +} diff --git a/nn/impl/cpu/norm.c b/nn/impl/cpu/norm.c new file mode 100644 index 0000000..b6a83fb --- /dev/null +++ b/nn/impl/cpu/norm.c @@ -0,0 +1,12 @@ +#include "norm.h" + + +__attribute__((weak)) void NN__norm_f32(size_t n, float *result, float *x, size_t incx) { + NN__dot_f32(n, result, x, incx, x, incx); + *result = sqrtf(*result); +} + +__attribute__((weak)) void NN__norm_inv_f32(size_t n, float *result, float *x, size_t incx) { + NN__norm_f32(n, result, x, incx); + *result = 1.f/(*result); +} diff --git a/nn/impl/cpu/rmsnorm.c b/nn/impl/cpu/rmsnorm.c new file mode 100644 index 0000000..efd60fa --- /dev/null +++ b/nn/impl/cpu/rmsnorm.c @@ -0,0 +1,18 @@ +#include "rmsnorm.h" + + +__attribute__((weak)) void NN__rmsnorm_f32(size_t n, float* y, size_t incy, float* x, size_t incx, float* w, size_t incw) { + // calculate sum of squares + float ss = 0.0f; + for (size_t i = 0; i < n; i += 1) { + ss += x[i * incx] * x[i * incx]; + } + ss /= n; + ss += 1e-5f; + ss = 1.0f / sqrtf(ss); + // normalize and scale + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = w[i * incw] * (ss * x[i * incx]); + } +} + diff --git a/nn/impl/cpu/sgn.c b/nn/impl/cpu/sgn.c new file mode 100644 index 0000000..e6af037 --- /dev/null +++ b/nn/impl/cpu/sgn.c @@ -0,0 +1,9 @@ +#include "sgn.h" + + +__attribute__((weak)) void NN__sgn_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = (x[i * incx] > 0.f) ? 1.f : ((x[i * incx] < 0.f) ? 
-1.f : 0.f); + } +} + diff --git a/nn/impl/cpu/softmax.c b/nn/impl/cpu/softmax.c new file mode 100644 index 0000000..84c4322 --- /dev/null +++ b/nn/impl/cpu/softmax.c @@ -0,0 +1,15 @@ +#include "softmax.h" + + +__attribute__((weak)) void NN__softmax_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + // exp and sum + float sum = 0.0f; + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = expf(x[i * incx]); + sum += y[i * incy]; + } + // normalize + for (size_t i = 0; i < n; i += 1) { + y[i * incy] /= sum; + } +} diff --git a/nn/impl/cpu/sqr.c b/nn/impl/cpu/sqr.c new file mode 100644 index 0000000..9432168 --- /dev/null +++ b/nn/impl/cpu/sqr.c @@ -0,0 +1,9 @@ +#include "sqr.h" + + +__attribute__((weak)) void NN__sqr_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = x[i * incx] * x[i * incx]; + } +} + diff --git a/nn/impl/cpu/sqrt.c b/nn/impl/cpu/sqrt.c new file mode 100644 index 0000000..98d97d5 --- /dev/null +++ b/nn/impl/cpu/sqrt.c @@ -0,0 +1,9 @@ +#include "sqrt.h" + + +__attribute__((weak)) void NN__sqrt_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + for (size_t i = 0; i < n; i += 1) { + y[i * incy] = sqrtf(x[i * incx]); + } +} + diff --git a/nn/impl/cpu/sub.c b/nn/impl/cpu/sub.c new file mode 100644 index 0000000..a5f8558 --- /dev/null +++ b/nn/impl/cpu/sub.c @@ -0,0 +1,39 @@ +#include "sub.h" + + +__attribute__((weak)) void NN__sub_u8(size_t n, uint8_t *z, size_t incz, uint8_t *x, size_t incx, uint8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] - y[i * incy]; + } +} + +__attribute__((weak)) void NN__sub_i8(size_t n, int8_t *z, size_t incz, int8_t *x, size_t incx, int8_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] - y[i * incy]; + } +} + +__attribute__((weak)) void NN__sub_i16(size_t n, int16_t *z, size_t incz, int16_t *x, size_t incx, int16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] - y[i * incy]; + } +} + +__attribute__((weak)) void NN__sub_i32(size_t n, int32_t *z, size_t incz, int32_t *x, size_t incx, int32_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] - y[i * incy]; + } +} + +__attribute__((weak)) void NN__sub_f16(size_t n, float16_t *z, size_t incz, float16_t *x, size_t incx, float16_t *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = NN_float_to_half(NN_half_to_float(x[i * incx]) - NN_half_to_float(y[i * incy])); + } +} + +__attribute__((weak)) void NN__sub_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + for (size_t i = 0; i < n; i += 1) { + z[i * incz] = x[i * incx] - y[i * incy]; + } +} + diff --git a/nn/impl/cpu/sum.c b/nn/impl/cpu/sum.c new file mode 100644 index 0000000..8181b2b --- /dev/null +++ b/nn/impl/cpu/sum.c @@ -0,0 +1,34 @@ +#include "sum.h" + + +__attribute__((weak)) void NN__sum_u8_to_i32(size_t n, int32_t *result, uint8_t *x, size_t incx) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += (int32_t)x[i * incx]; + } + *result = sum; +} + +__attribute__((weak)) void NN__sum_i16_to_i32(size_t n, int32_t *result, int16_t *x, size_t incx) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += (int32_t)x[i * incx]; + } + *result = sum; +} + +__attribute__((weak)) void NN__sum_i32(size_t n, int32_t *result, int32_t *x, size_t incx) { + int32_t sum = 0; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx]; + } + 
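All of these scalar CPU kernels are declared __attribute__((weak)), so a platform backend (for example the avx/ directory added in this patch, or an RVV build) can ship a strong definition of the same symbol and the linker will prefer it, with no dispatch code in the callers. A hypothetical override, with the plain loop standing in for a vectorized body:

    #include <stddef.h>

    /* Strong (non-weak) definition: replaces the weak CPU fallback at link time. */
    void NN__mul_f32(size_t n, float *z, size_t incz,
                     float *x, size_t incx, float *y, size_t incy) {
      /* A real backend would vectorize this; the scalar loop keeps the
         sketch self-contained. */
      for (size_t i = 0; i < n; i += 1) {
        z[i * incz] = x[i * incx] * y[i * incy];
      }
    }
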
*result = sum; +} + +__attribute__((weak)) void NN__sum_f32(size_t n, float *result, float *x, size_t incx) { + float sum = 0.f; + for (size_t i = 0; i < n; i += 1) { + sum += x[i * incx]; + } + *result = sum; +} diff --git a/nn/impl/cpu/transpose.c b/nn/impl/cpu/transpose.c new file mode 100644 index 0000000..3339ba0 --- /dev/null +++ b/nn/impl/cpu/transpose.c @@ -0,0 +1,10 @@ +#include "transpose.h" + + +__attribute__((weak)) void NN__transpose_f32(size_t m, size_t n, float *y, float *x) { + for (size_t i = 0; i < m; i += 1) { + for (size_t j = 0; j < n; j += 1) { + y[j * m + i] = x[i * n + j]; + } + } +}; diff --git a/nn/impl/div.h b/nn/impl/div.h new file mode 100644 index 0000000..6d4bbd5 --- /dev/null +++ b/nn/impl/div.h @@ -0,0 +1,15 @@ +#ifndef __NN__DIV_H +#define __NN__DIV_H + +#include +#include + + +void NN__div_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__DIV_H diff --git a/nn/impl/dot.h b/nn/impl/dot.h new file mode 100644 index 0000000..c7b2f14 --- /dev/null +++ b/nn/impl/dot.h @@ -0,0 +1,21 @@ +#ifndef __NN__DOT_H +#define __NN__DOT_H + +#include + +#include "nn_float16.h" + + +void NN__dot_f16(size_t n, + float16_t *result, + float16_t *x, size_t incx, + float16_t *y, size_t incy + ); + +void NN__dot_f32(size_t n, + float *result, + float *x, size_t incx, + float *y, size_t incy + ); + +#endif // __NN__DOT_H diff --git a/nn/impl/fill.h b/nn/impl/fill.h new file mode 100644 index 0000000..73f550f --- /dev/null +++ b/nn/impl/fill.h @@ -0,0 +1,41 @@ +#ifndef __NN__FILL_H +#define __NN__FILL_H + +#include +#include + +#include "nn_float16.h" + + +void NN__fill_u8(size_t n, + uint8_t *x, size_t incx, + uint8_t scalar + ); + +void NN__fill_i8(size_t n, + int8_t *x, size_t incx, + int8_t scalar + ); + +void NN__fill_i16(size_t n, + int16_t *x, size_t incx, + int16_t scalar + ); + +void NN__fill_i32(size_t n, + int32_t *x, size_t incx, + int32_t scalar + ); + +void NN__fill_f16(size_t n, + float16_t *x, size_t incx, + float16_t scalar + ); + +void NN__fill_f32(size_t n, + float *x, size_t incx, + float scalar + ); + + +#endif // __NN__FILL_H diff --git a/nn/inc/gemmini/gemmini.h b/nn/impl/gemmini/gemmini.h similarity index 100% rename from nn/inc/gemmini/gemmini.h rename to nn/impl/gemmini/gemmini.h diff --git a/nn/inc/gemmini/gemmini_counter.h b/nn/impl/gemmini/gemmini_counter.h similarity index 100% rename from nn/inc/gemmini/gemmini_counter.h rename to nn/impl/gemmini/gemmini_counter.h diff --git a/nn/inc/gemmini/gemmini_params.h b/nn/impl/gemmini/gemmini_params.h similarity index 100% rename from nn/inc/gemmini/gemmini_params.h rename to nn/impl/gemmini/gemmini_params.h diff --git a/nn/impl/gemmini/mm.c b/nn/impl/gemmini/mm.c new file mode 100644 index 0000000..4b7d113 --- /dev/null +++ b/nn/impl/gemmini/mm.c @@ -0,0 +1,23 @@ +#include "mm.h" + + +void NN__mm_f32(size_t m, size_t n, float16_t *z, float16_t *x, float16_t *y) { + size_t dim_I = m; + size_t dim_J = n; + size_t dim_K = n; + + size_t stride_A = dim_K; + size_t stride_B = dim_J; + size_t stride_D = dim_J; + size_t stride_C = dim_J; + + tiled_matmul_auto(dim_I, dim_J, dim_K, + x, y, + NULL, z, + stride_A, stride_B, stride_D, stride_C, + MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, + 0, 0, 0, 0, 0, 0, WS); + + return; +}; diff --git a/nn/impl/gemmini/mm.h b/nn/impl/gemmini/mm.h new file mode 100644 index 0000000..505149c --- /dev/null +++ b/nn/impl/gemmini/mm.h @@ -0,0 +1,19 @@ +#ifndef 
__NN__MM_H +#define __NN__MM_H + +#include +#include + +#include "nn_float16.h" + +#include "gemmini.h" + + +void NN__mm_f32(size_t m, size_t n, + float16_t *z, + float16_t *x, + float16_t *y + ); + + +#endif // __NN__MM_H diff --git a/nn/inc/gemmini/xcustom.h b/nn/impl/gemmini/xcustom.h similarity index 100% rename from nn/inc/gemmini/xcustom.h rename to nn/impl/gemmini/xcustom.h diff --git a/nn/impl/log.h b/nn/impl/log.h new file mode 100644 index 0000000..d9e2fb4 --- /dev/null +++ b/nn/impl/log.h @@ -0,0 +1,15 @@ +#ifndef __NN__LOG_H +#define __NN__LOG_H + +#include +#include +#include + + +void NN__log_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__LOG_H diff --git a/nn/impl/max.h b/nn/impl/max.h new file mode 100644 index 0000000..6d2b17d --- /dev/null +++ b/nn/impl/max.h @@ -0,0 +1,15 @@ +#ifndef __NN__MAX_H +#define __NN__MAX_H + +#include +#include +#include + + +void NN__max_f32(size_t n, + float *result, + float *x, size_t incx + ); + + +#endif // __NN__MAX_H diff --git a/nn/impl/maximum.h b/nn/impl/maximum.h new file mode 100644 index 0000000..c34493e --- /dev/null +++ b/nn/impl/maximum.h @@ -0,0 +1,15 @@ +#ifndef __NN__MAXIMUM_H +#define __NN__MAXIMUM_H + +#include +#include + + +void NN__maximum_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__MAXIMUM_H diff --git a/nn/impl/maximum1.h b/nn/impl/maximum1.h new file mode 100644 index 0000000..50a8a48 --- /dev/null +++ b/nn/impl/maximum1.h @@ -0,0 +1,23 @@ +#ifndef __NN__MAXIMUM1_H +#define __NN__MAXIMUM1_H + +#include +#include + +#include "nn_float16.h" + + +void NN__maximum1_f16(size_t n, + float16_t *y, size_t incy, + float16_t *x, size_t incx, + float16_t scalar + ); + +void NN__maximum1_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx, + float scalar + ); + + +#endif // __NN__MAXIMUM1_H diff --git a/nn/impl/min.h b/nn/impl/min.h new file mode 100644 index 0000000..4b40630 --- /dev/null +++ b/nn/impl/min.h @@ -0,0 +1,15 @@ +#ifndef __NN__MIN_H +#define __NN__MIN_H + +#include +#include +#include + + +void NN__min_f32(size_t n, + float *result, + float *x, size_t incx + ); + + +#endif // __NN__MIN_H diff --git a/nn/impl/minimum.h b/nn/impl/minimum.h new file mode 100644 index 0000000..0adde53 --- /dev/null +++ b/nn/impl/minimum.h @@ -0,0 +1,15 @@ +#ifndef __NN__MINIMUM_H +#define __NN__MINIMUM_H + +#include +#include + + +void NN__minimum_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__MINIMUM_H diff --git a/nn/impl/minimum1.h b/nn/impl/minimum1.h new file mode 100644 index 0000000..b867ec1 --- /dev/null +++ b/nn/impl/minimum1.h @@ -0,0 +1,15 @@ +#ifndef __NN__MINIMUM1_H +#define __NN__MINIMUM1_H + +#include +#include + + +void NN__minimum1_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx, + float scalar + ); + + +#endif // __NN__MINIMUM1_H diff --git a/nn/impl/mul.h b/nn/impl/mul.h new file mode 100644 index 0000000..b67896d --- /dev/null +++ b/nn/impl/mul.h @@ -0,0 +1,15 @@ +#ifndef __NN__MUL_H +#define __NN__MUL_H + +#include +#include + + +void NN__mul_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__MUL_H diff --git a/nn/impl/mul1.h b/nn/impl/mul1.h new file mode 100644 index 0000000..7a84a25 --- /dev/null +++ b/nn/impl/mul1.h @@ -0,0 +1,15 @@ +#ifndef __NN__MUL1_H +#define __NN__MUL1_H + +#include +#include + + +void NN__mul1_f32(size_t n, + float *y, size_t 
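The cpu/ implementations earlier in this patch are declared __attribute__((weak)), while the rvv/ and gemmini/ files define the same symbols without the attribute. When a platform-specific object file is linked in, its strong definition silently replaces the portable fallback; otherwise the scalar version is used. A minimal two-translation-unit sketch of the pattern (names hypothetical):

    #include <stddef.h>

    /* generic.c: portable fallback, weak */
    __attribute__((weak)) void op_f32(size_t n, float *y, float *x) {
      for (size_t i = 0; i < n; i += 1) y[i] = x[i];
    }

    /* accel.c: only compiled for the target platform; its strong definition wins at link time */
    void op_f32(size_t n, float *y, float *x) {
      /* platform-specific implementation */
    }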
incy, + float *x, size_t incx, + float scalar + ); + + +#endif // __NN__MUL1_H diff --git a/nn/impl/neg.h b/nn/impl/neg.h new file mode 100644 index 0000000..277f87d --- /dev/null +++ b/nn/impl/neg.h @@ -0,0 +1,14 @@ +#ifndef __NN__NEG_H +#define __NN__NEG_H + +#include +#include + + +void NN__neg_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__NEG_H diff --git a/nn/impl/norm.h b/nn/impl/norm.h new file mode 100644 index 0000000..66580ae --- /dev/null +++ b/nn/impl/norm.h @@ -0,0 +1,21 @@ +#ifndef __NN__NORM_H +#define __NN__NORM_H + +#include +#include + +#include "dot.h" + + +void NN__norm_f32(size_t n, + float *result, + float *x, size_t incx + ); + +void NN__norm_inv_f32(size_t n, + float *result, + float *x, size_t incx + ); + + +#endif // __NN__NORM_H diff --git a/nn/impl/rmsnorm.h b/nn/impl/rmsnorm.h new file mode 100644 index 0000000..f52c045 --- /dev/null +++ b/nn/impl/rmsnorm.h @@ -0,0 +1,15 @@ +#ifndef __NN__RMSNORM_H +#define __NN__RMSNORM_H + +#include +#include + + +void NN__rmsnorm_f32(size_t n, + float* y, size_t incy, + float* x, size_t incx, + float* w, size_t incw + ); + + +#endif // __NN__RMSNORM_H diff --git a/nn/impl/rvv/abs.c b/nn/impl/rvv/abs.c new file mode 100644 index 0000000..4dee14a --- /dev/null +++ b/nn/impl/rvv/abs.c @@ -0,0 +1,72 @@ +#include +#include "abs.h" + +#ifdef RVV + +void NN__abs_i8(size_t n, int8_t *y, size_t incy, int8_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e8m1(n); + vint8m1_t vec_x = __riscv_vlse8_v_i8m1(x, sizeof(int8_t) * incx, vl); + vint8m1_t vec_neg_x = __riscv_vneg_v_i8m1(vec_x, vl); + vbool8_t mask = __riscv_vmslt_vx_i8m1_b8(vec_x, 0, vl); + vint8m1_t vec_abs_x = __riscv_vmerge_vvm_i8m1(vec_x, vec_neg_x, mask, vl); + __riscv_vsse8_v_i8m1(y, sizeof(int8_t) * incy, vec_abs_x, vl); + x += vl; + y += vl; + n -= vl; + } +} + +void NN__abs_i16(size_t n, int16_t *y, size_t incy, int16_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vint16m1_t vec_x = __riscv_vlse16_v_i16m1(x, sizeof(int16_t) * incx, vl); + vint16m1_t vec_neg_x = __riscv_vneg_v_i16m1(vec_x, vl); + vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(vec_x, 0, vl); + vint16m1_t vec_abs_x = __riscv_vmerge_vvm_i16m1(vec_x, vec_neg_x, mask, vl); + __riscv_vsse16_v_i16m1(y, sizeof(int16_t) * incy, vec_abs_x, vl); + x += vl; + y += vl; + n -= vl; + } +} + +void NN__abs_i32(size_t n, int32_t *y, size_t incy, int32_t *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vint32m1_t vec_x = __riscv_vlse32_v_i32m1(x, sizeof(int32_t) * incx, vl); + vint32m1_t vec_neg_x = __riscv_vneg_v_i32m1(vec_x, vl); + vbool32_t mask = __riscv_vmslt_vx_i32m1_b32(vec_x, 0, vl); + vint32m1_t vec_abs_x = __riscv_vmerge_vvm_i32m1(vec_x, vec_neg_x, mask, vl); + __riscv_vsse32_v_i32m1(y, sizeof(int32_t) * incy, vec_abs_x, vl); + x += vl; + y += vl; + n -= vl; + } +} + +// void NN__abs_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx) { +// while (n > 0) { +// size_t vl = __riscv_vsetvl_e16m1(n); +// vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); +// vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl); +// __riscv_vse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); +// x += vl; +// y += vl; +// n -= vl; +// } +// } + +void NN__abs_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y 
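The f16 variant of the RVV abs kernel is left commented out, and the f16 kernels elsewhere in this patch are written as inline assembly mirroring commented-out intrinsics, presumably because half-precision (zvfh) vector intrinsics are not yet usable with the targeted toolchain. If they become available, the intrinsic form could be guarded on the standard feature macro; a sketch, assuming float16_t maps to _Float16 as in the new nn_float16.h:

    /* Sketch; requires <riscv_vector.h> with zvfh intrinsic support. */
    #if defined(RVV) && defined(__riscv_zvfh)
    void NN__abs_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx) {
      while (n > 0) {
        size_t vl = __riscv_vsetvl_e16m1(n);
        vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl);
        vfloat16m1_t vec_y = __riscv_vfabs_v_f16m1(vec_x, vl);
        __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl);
        x += vl;
        y += vl;
        n -= vl;
      }
    }
    #endif

Note also that the commented-out version stores with __riscv_vse16_v_f16m1 while passing a byte stride; the strided store intrinsic is vsse16, as in the sketch.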
= __riscv_vfabs_v_f32m1(vec_x, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + +#endif \ No newline at end of file diff --git a/nn/impl/rvv/acc.c b/nn/impl/rvv/acc.c new file mode 100644 index 0000000..7eadf04 --- /dev/null +++ b/nn/impl/rvv/acc.c @@ -0,0 +1,20 @@ +#include +#include "acc.h" + +#ifdef RVV + + +void NN__acc_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vec_y = __riscv_vfadd_vv_f32m1(vec_y, vec_x, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + +#endif diff --git a/nn/impl/rvv/acc1.c b/nn/impl/rvv/acc1.c new file mode 100644 index 0000000..9370d4c --- /dev/null +++ b/nn/impl/rvv/acc1.c @@ -0,0 +1,18 @@ +#include +#include "acc1.h" + +#ifdef RVV + +void NN__acc1_f32(size_t n, float *result, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_r = __riscv_vlse32_v_f32m1(result, sizeof(float) * incx, vl); + vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(scalar, vl); + vec_r = __riscv_vfadd_vv_f32m1(vec_r, vec_s, vl); + __riscv_vse32_v_f32m1(result, vec_r, vl); + result += vl; + n -= vl; + } +} + +#endif \ No newline at end of file diff --git a/nn/impl/rvv/add.c b/nn/impl/rvv/add.c new file mode 100644 index 0000000..e642bcc --- /dev/null +++ b/nn/impl/rvv/add.c @@ -0,0 +1,61 @@ +#include +#include "add.h" + +#ifdef RVV + + +void NN__add_i8(size_t n, int8_t *z, size_t incz, int8_t *x, size_t incx, int8_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e8m1(n); + vint8m1_t vec_x = __riscv_vlse8_v_i8m1(x, sizeof(int8_t) * incx, vl); + vint8m1_t vec_y = __riscv_vlse8_v_i8m1(y, sizeof(int8_t) * incy, vl); + vint8m1_t vec_z = __riscv_vadd_vv_i8m1(vec_x, vec_y, vl); + __riscv_vsse8_v_i8m1(z, sizeof(int8_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__add_f16(size_t n, float16_t *z, size_t incz, float16_t *x, size_t incx, float16_t *y, size_t incy) { + while (n > 0) { + size_t vl; + + // size_t vl = __riscv_vsetvl_e16m1(n); + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); + + // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + asm volatile("vlse16.v v24, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); + + // vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + asm volatile("vlse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); + + // // vfloat16m1_t vec_z = __riscv_vfadd_vv_f16m1(vec_x, vec_y, vl); + asm volatile("vfadd.vv v24, v24, v25"); + + // __riscv_vsse16_v_f16m1(z, sizeof(float16_t) * incz, vec_z, vl); + asm volatile("vsse16.v v24, (%0), %1" : : "r"(z), "r"(sizeof(float16_t) * incz)); + + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__add_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfadd_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +#endif diff --git 
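One detail worth double-checking across these RVV kernels: the loads and stores honour the element stride (sizeof(type) * incx), but the pointers then advance by x += vl, which only matches when the stride is 1. If strides greater than 1 are meant to be supported, the advance presumably needs to scale as well; a sketch of the accumulate kernel with scaled advances:

    /* Sketch of NN__acc_f32 with stride-aware pointer advances
       (assumes incx/incy > 1 is an intended use case). */
    void NN__acc_f32(size_t n, float *y, size_t incy, float *x, size_t incx) {
      while (n > 0) {
        size_t vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl);
        vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl);
        vec_y = __riscv_vfadd_vv_f32m1(vec_y, vec_x, vl);
        __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl);
        x += vl * incx;   /* advance by vl elements of the strided view */
        y += vl * incy;
        n -= vl;
      }
    }

Relatedly, NN__acc1_f32 above loads result with a stride but writes it back with the unit-stride __riscv_vse32_v_f32m1; if incx can exceed 1 there, the strided vsse32 store is probably what is wanted.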
a/nn/impl/rvv/add1.c b/nn/impl/rvv/add1.c new file mode 100644 index 0000000..361799a --- /dev/null +++ b/nn/impl/rvv/add1.c @@ -0,0 +1,20 @@ +#include +#include "add1.h" + +#ifdef RVV + + +void NN__add1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(scalar, vl); + vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x, vec_s, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + +#endif diff --git a/nn/impl/rvv/div.c b/nn/impl/rvv/div.c new file mode 100644 index 0000000..92fb7c8 --- /dev/null +++ b/nn/impl/rvv/div.c @@ -0,0 +1,21 @@ +#include +#include "div.h" + +#ifdef RVV + + +void NN__div_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfdiv_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +#endif diff --git a/nn/impl/rvv/dot.c b/nn/impl/rvv/dot.c new file mode 100644 index 0000000..b39eb19 --- /dev/null +++ b/nn/impl/rvv/dot.c @@ -0,0 +1,69 @@ +#include +#include "dot.h" + +#ifdef RVV + + +void NN__dot_f16(size_t n, float16_t *result, float16_t *x, size_t incx, float16_t *y, size_t incy) { + size_t vlmax; + // size_t vlmax = __riscv_vsetvlmax_e16m1(); + asm volatile("vsetvli %0, zero, e16, m1, ta, ma" : "=r"(vlmax) : "r"(n)); + + // vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax); + asm volatile("vmv.v.i v27, 0"); + + // vfloat16m1_t vec_r = __riscv_vfmv_v_f_f16m1(0, vlmax); + asm volatile("vmv1r.v v24, v27"); + + while (n > 0) { + size_t vl; + // size_t vl = __riscv_vsetvl_e16m1(n); + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); + + // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + asm volatile("vlse16.v v26, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); + + // vfloat16m1_t vec_y = __riscv_vlse16_v_f16m1(y, sizeof(float16_t) * incy, vl); + asm volatile("vlse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); + + // vec_r = __riscv_vfmacc_vv_f16m1(vec_r, vec_x, vec_y, vl); + asm volatile("vfmacc.vv v24, v26, v25"); + + x += vl; + y += vl; + n -= vl; + } + + // vec_r = __riscv_vfredusum_vs_f16m1_f16m1(vec_r, vec_zero, vlmax); + asm volatile("vsetvli %0, zero, e16, m1, ta, ma" : "=r"(vlmax) : "r"(n)); + asm volatile("vfredusum.vs v24, v24, v27"); + + // *result = __riscv_vfmv_f_s_f16m1_f16(vec_r); + float16_t r; + asm volatile("vmv.x.s %0, v24" : "=r"(r)); + *result = r; + +} + +void NN__dot_f32(size_t n, float *result, float *x, size_t incx, float *y, size_t incy) { + size_t vlmax = __riscv_vsetvlmax_e32m1(); + + vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax); + vfloat32m1_t vec_r = __riscv_vfmv_v_f_f32m1(0, vlmax); + + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vec_r = __riscv_vfmacc_vv_f32m1(vec_r, vec_x, vec_y, vl); + + x += vl; + y += vl; + n -= vl; + } + vec_r = __riscv_vfredusum_vs_f32m1_f32m1(vec_r, 
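The f32 dot kernel keeps the inner loop free of horizontal operations: it accumulates element-wise products into a vector register with vfmacc across strips and performs a single vfredusum reduction at the end (which reorders the floating-point additions relative to a scalar loop, but is otherwise equivalent). The scalar meaning, for reference:

    #include <stddef.h>

    /* Scalar equivalent of what NN__dot_f32 computes (strided dot product). */
    float dot_f32_ref(size_t n, const float *x, size_t incx, const float *y, size_t incy) {
      float sum = 0.0f;
      for (size_t i = 0; i < n; i += 1) {
        sum += x[i * incx] * y[i * incy];
      }
      return sum;
    }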
vec_zero, vlmax); + *result = __riscv_vfmv_f_s_f32m1_f32(vec_r); +} + + +#endif diff --git a/nn/impl/rvv/fill.c b/nn/impl/rvv/fill.c new file mode 100644 index 0000000..24dd763 --- /dev/null +++ b/nn/impl/rvv/fill.c @@ -0,0 +1,18 @@ +#include +#include "fill.h" + +#ifdef RVV + + +void NN__fill_f32(size_t n, float *x, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vfmv_v_f_f32m1(scalar, vl); + __riscv_vsse32_v_f32m1(x, sizeof(float) * incx, vec_x, vl); + x += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/max.c b/nn/impl/rvv/max.c new file mode 100644 index 0000000..27dc452 --- /dev/null +++ b/nn/impl/rvv/max.c @@ -0,0 +1,20 @@ +#include +#include "max.h" + +#ifdef RVV + + +void NN__max_f32(size_t n, float *result, float *x, size_t incx) { + vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1); + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl); + x += vl; + n -= vl; + } + *result = __riscv_vfmv_f_s_f32m1_f32(vec_max); +} + + +#endif diff --git a/nn/impl/rvv/maximum.c b/nn/impl/rvv/maximum.c new file mode 100644 index 0000000..68784b6 --- /dev/null +++ b/nn/impl/rvv/maximum.c @@ -0,0 +1,22 @@ +#include +#include "maximum.h" + +#ifdef RVV + + +void NN__maximum_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfmax_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/maximum1.c b/nn/impl/rvv/maximum1.c new file mode 100644 index 0000000..ab908b7 --- /dev/null +++ b/nn/impl/rvv/maximum1.c @@ -0,0 +1,45 @@ +#include +#include "maximum1.h" + +#ifdef RVV + + +void NN__maximum1_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl; + // size_t vl = __riscv_vsetvl_e16m1(n); + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); + + // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + asm volatile("vlse16.v v26, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); + + // vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + asm volatile("vmv.v.x v25, %0" : : "r"(scalar)); + + // vfloat16m1_t vec_y = __riscv_vfmax_vv_f16m1(vec_x, vec_s, vl); + asm volatile("vfmax.vv v25, v26, v25"); + + // __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + asm volatile("vsse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); + + x += vl; + y += vl; + n -= vl; + } +} + +void NN__maximum1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(scalar, vl); + vfloat32m1_t vec_y = __riscv_vfmax_vv_f32m1(vec_x, vec_s, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/min.c b/nn/impl/rvv/min.c new file mode 100644 index 0000000..f10486e --- /dev/null +++ 
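NN__max_f32 seeds a single-element reduction register with -FLT_MAX and folds each strip into it with vfredmax, so partial strips compose correctly; NN__min_f32 below does the same with +FLT_MAX. A hypothetical call site, just to show the contract (the result is written through the pointer, the input is untouched):

    #include <stdio.h>

    /* Assumes the NN__max_f32 prototype from nn/impl/max.h is in scope. */
    void max_example(void) {
      float x[5] = {3.f, -7.f, 2.5f, 9.f, 0.f};
      float max_val;
      NN__max_f32(5, &max_val, x, 1);   /* contiguous input: stride 1 */
      printf("max = %f\n", max_val);    /* prints 9.000000 */
    }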
b/nn/impl/rvv/min.c @@ -0,0 +1,20 @@ +#include +#include "min.h" + +#ifdef RVV + + +void NN__min_f32(size_t n, float *result, float *x, size_t incx) { + vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1); + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl); + x += vl; + n -= vl; + } + *result = __riscv_vfmv_f_s_f32m1_f32(vec_min); +} + + +#endif diff --git a/nn/impl/rvv/minimum.c b/nn/impl/rvv/minimum.c new file mode 100644 index 0000000..0b4f1a5 --- /dev/null +++ b/nn/impl/rvv/minimum.c @@ -0,0 +1,22 @@ +#include +#include "minimum.h" + +#ifdef RVV + + +void NN__minimum_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfmin_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/minimum1.c b/nn/impl/rvv/minimum1.c new file mode 100644 index 0000000..040cbda --- /dev/null +++ b/nn/impl/rvv/minimum1.c @@ -0,0 +1,45 @@ +#include +#include "minimum1.h" + +#ifdef RVV + + +void NN__minimum1_f16(size_t n, float16_t *y, size_t incy, float16_t *x, size_t incx, float16_t scalar) { + while (n > 0) { + size_t vl; + // size_t vl = __riscv_vsetvl_e16m1(n); + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vl) : "r"(n)); + + // vfloat16m1_t vec_x = __riscv_vlse16_v_f16m1(x, sizeof(float16_t) * incx, vl); + asm volatile("vlse16.v v26, (%0), %1" : : "r"(x), "r"(sizeof(float16_t) * incx)); + + // vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(scalar, vl); + asm volatile("vmv.v.x v25, %0" : : "r"(scalar)); + + // vfloat16m1_t vec_y = __riscv_vfmin_vv_f16m1(vec_x, vec_s, vl); + asm volatile("vfmin.vv v25, v26, v25"); + + // __riscv_vsse16_v_f16m1(y, sizeof(float16_t) * incy, vec_y, vl); + asm volatile("vsse16.v v25, (%0), %1" : : "r"(y), "r"(sizeof(float16_t) * incy)); + + x += vl; + y += vl; + n -= vl; + } +} + +void NN__minimum1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(scalar, vl); + vfloat32m1_t vec_y = __riscv_vfmin_vv_f32m1(vec_x, vec_s, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/mul.c b/nn/impl/rvv/mul.c new file mode 100644 index 0000000..d2435a5 --- /dev/null +++ b/nn/impl/rvv/mul.c @@ -0,0 +1,22 @@ +#include +#include "mul.h" + +#ifdef RVV + + +void NN__mul_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfmul_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/mul1.c b/nn/impl/rvv/mul1.c new file mode 100644 index 0000000..f982074 --- /dev/null +++ 
b/nn/impl/rvv/mul1.c @@ -0,0 +1,21 @@ +#include +#include "mul1.h" + +#ifdef RVV + + +void NN__mul1_f32(size_t n, float *y, size_t incy, float *x, size_t incx, float scalar) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(scalar, vl); + vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x, vec_s, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/impl/rvv/neg.c b/nn/impl/rvv/neg.c new file mode 100644 index 0000000..00735f6 --- /dev/null +++ b/nn/impl/rvv/neg.c @@ -0,0 +1,20 @@ +#include +#include "neg.h" + +#ifdef RVV + + +void NN__neg_f32(size_t n, float *y, size_t incy, float *x, size_t incx) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vfneg_v_f32m1(vec_x, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * incy, vec_y, vl); + x += vl; + y += vl; + n -= vl; + } +} + + +#endif diff --git a/nn/inc/riscv_vector.h b/nn/impl/rvv/riscv_vector.h similarity index 100% rename from nn/inc/riscv_vector.h rename to nn/impl/rvv/riscv_vector.h diff --git a/nn/impl/rvv/sub.c b/nn/impl/rvv/sub.c new file mode 100644 index 0000000..af5e3d2 --- /dev/null +++ b/nn/impl/rvv/sub.c @@ -0,0 +1,77 @@ +#include +#include "sub.h" + +#ifdef RVV + + +void NN__sub_u8(size_t n, uint8_t *z, size_t incz, uint8_t *x, size_t incx, uint8_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e8m1(n); + vuint8m1_t vec_x = __riscv_vlse8_v_u8m1(x, sizeof(uint8_t) * incx, vl); + vuint8m1_t vec_y = __riscv_vlse8_v_u8m1(y, sizeof(uint8_t) * incy, vl); + vuint8m1_t vec_z = __riscv_vsub_vv_u8m1(vec_x, vec_y, vl); + __riscv_vsse8_v_u8m1(z, sizeof(uint8_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__sub_i8(size_t n, int8_t *z, size_t incz, int8_t *x, size_t incx, int8_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e8m1(n); + vint8m1_t vec_x = __riscv_vlse8_v_i8m1(x, sizeof(int8_t) * incx, vl); + vint8m1_t vec_y = __riscv_vlse8_v_i8m1(y, sizeof(int8_t) * incy, vl); + vint8m1_t vec_z = __riscv_vsub_vv_i8m1(vec_x, vec_y, vl); + __riscv_vsse8_v_i8m1(z, sizeof(int8_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__sub_i16(size_t n, int16_t *z, size_t incz, int16_t *x, size_t incx, int16_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e16m1(n); + vint16m1_t vec_x = __riscv_vlse16_v_i16m1(x, sizeof(int16_t) * incx, vl); + vint16m1_t vec_y = __riscv_vlse16_v_i16m1(y, sizeof(int16_t) * incy, vl); + vint16m1_t vec_z = __riscv_vsub_vv_i16m1(vec_x, vec_y, vl); + __riscv_vsse16_v_i16m1(z, sizeof(int16_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__sub_i32(size_t n, int32_t *z, size_t incz, int32_t *x, size_t incx, int32_t *y, size_t incy) { + while (n > 0) { + size_t vl = __riscv_vsetvl_e32m1(n); + vint32m1_t vec_x = __riscv_vlse32_v_i32m1(x, sizeof(int32_t) * incx, vl); + vint32m1_t vec_y = __riscv_vlse32_v_i32m1(y, sizeof(int32_t) * incy, vl); + vint32m1_t vec_z = __riscv_vsub_vv_i32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_i32m1(z, sizeof(int32_t) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +void NN__sub_f32(size_t n, float *z, size_t incz, float *x, size_t incx, float *y, size_t incy) { + while (n > 0) { + size_t vl = 
__riscv_vsetvl_e32m1(n); + vfloat32m1_t vec_x = __riscv_vlse32_v_f32m1(x, sizeof(float) * incx, vl); + vfloat32m1_t vec_y = __riscv_vlse32_v_f32m1(y, sizeof(float) * incy, vl); + vfloat32m1_t vec_z = __riscv_vfsub_vv_f32m1(vec_x, vec_y, vl); + __riscv_vsse32_v_f32m1(z, sizeof(float) * incz, vec_z, vl); + x += vl; + y += vl; + z += vl; + n -= vl; + } +} + +#endif diff --git a/nn/impl/rvv/transpose.c b/nn/impl/rvv/transpose.c new file mode 100644 index 0000000..a8ae1c1 --- /dev/null +++ b/nn/impl/rvv/transpose.c @@ -0,0 +1,22 @@ +#include +#include "neg.h" + +#ifdef RVV + + +void NN__transpose_f32(size_t m, size_t n, float *y, float *x) { + for (size_t i = 0; i < m; i += 1) { + size_t k = n; + while (k > 0) { + size_t vl = __riscv_vsetvl_e32m1(k); + vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); + __riscv_vsse32_v_f32m1(y, sizeof(float) * m, vec_x, vl); + x += vl; + y += vl * m; + k -= vl; + } + } +} + + +#endif diff --git a/nn/impl/sgn.h b/nn/impl/sgn.h new file mode 100644 index 0000000..52580bc --- /dev/null +++ b/nn/impl/sgn.h @@ -0,0 +1,15 @@ +#ifndef __NN__SGN_H +#define __NN__SGN_H + +#include +#include +#include + + +void NN__sgn_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__SGN_H diff --git a/nn/impl/softmax.h b/nn/impl/softmax.h new file mode 100644 index 0000000..503e46d --- /dev/null +++ b/nn/impl/softmax.h @@ -0,0 +1,14 @@ +#ifndef __NN__SOFTMAX_H +#define __NN__SOFTMAX_H + +#include +#include + + +void NN__softmax_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__SOFTMAX_H diff --git a/nn/impl/sqr.h b/nn/impl/sqr.h new file mode 100644 index 0000000..1867149 --- /dev/null +++ b/nn/impl/sqr.h @@ -0,0 +1,15 @@ +#ifndef __NN__SQR_H +#define __NN__SQR_H + +#include +#include +#include + + +void NN__sqr_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__SQR_H diff --git a/nn/impl/sqrt.h b/nn/impl/sqrt.h new file mode 100644 index 0000000..06b7b2a --- /dev/null +++ b/nn/impl/sqrt.h @@ -0,0 +1,15 @@ +#ifndef __NN__SQRT_H +#define __NN__SQRT_H + +#include +#include +#include + + +void NN__sqrt_f32(size_t n, + float *y, size_t incy, + float *x, size_t incx + ); + + +#endif // __NN__SQRT_H diff --git a/nn/impl/sub.h b/nn/impl/sub.h new file mode 100644 index 0000000..870efbb --- /dev/null +++ b/nn/impl/sub.h @@ -0,0 +1,47 @@ +#ifndef __NN__SUB_H +#define __NN__SUB_H + +#include +#include + +#include "nn_float16.h" + + +void NN__sub_u8(size_t n, + uint8_t *z, size_t incz, + uint8_t *x, size_t incx, + uint8_t *y, size_t incy + ); + +void NN__sub_i8(size_t n, + int8_t *z, size_t incz, + int8_t *x, size_t incx, + int8_t *y, size_t incy + ); + +void NN__sub_i16(size_t n, + int16_t *z, size_t incz, + int16_t *x, size_t incx, + int16_t *y, size_t incy + ); + +void NN__sub_i32(size_t n, + int32_t *z, size_t incz, + int32_t *x, size_t incx, + int32_t *y, size_t incy + ); + +void NN__sub_f16(size_t n, + float16_t *z, size_t incz, + float16_t *x, size_t incx, + float16_t *y, size_t incy + ); + +void NN__sub_f32(size_t n, + float *z, size_t incz, + float *x, size_t incx, + float *y, size_t incy + ); + + +#endif // __NN__SUB_H diff --git a/nn/impl/sum.h b/nn/impl/sum.h new file mode 100644 index 0000000..ee97b42 --- /dev/null +++ b/nn/impl/sum.h @@ -0,0 +1,28 @@ +#ifndef __NN__SUM_H +#define __NN__SUM_H + +#include +#include +#include + + +void NN__sum_u8_to_i32(size_t n, + int32_t *result, + uint8_t *x, size_t incx + ); +void NN__sum_i16_to_i32(size_t n, + int32_t *result, + 
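Two small things to flag in rvv/transpose.c above. First, it includes "neg.h" rather than "transpose.h", which looks like a copy-paste slip. Second, the destination pointer is advanced by vl * m inside the row loop but never offset per row, so for m > 1 the writes after the first row do not land where the scalar version (y[j * m + i] = x[i * n + j]) puts them. A sketch of the per-row offset, assuming the scalar semantics are the intended ones:

    /* Sketch: row i of x is scattered into column i of y with a stride of m elements. */
    void NN__transpose_f32(size_t m, size_t n, float *y, float *x) {
      for (size_t i = 0; i < m; i += 1) {
        float *y_col = y + i;                /* start of column i in the n x m output */
        size_t k = n;
        while (k > 0) {
          size_t vl = __riscv_vsetvl_e32m1(k);
          vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl);
          __riscv_vsse32_v_f32m1(y_col, sizeof(float) * m, vec_x, vl);
          x += vl;
          y_col += vl * m;
          k -= vl;
        }
      }
    }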
int16_t *x, size_t incx + ); + +void NN__sum_i32(size_t n, + int32_t *result, + int32_t *x, size_t incx + ); + +void NN__sum_f32(size_t n, + float *result, + float *x, size_t incx); + + +#endif // __NN__SUM_H diff --git a/nn/impl/transpose.h b/nn/impl/transpose.h new file mode 100644 index 0000000..d11f875 --- /dev/null +++ b/nn/impl/transpose.h @@ -0,0 +1,12 @@ +#ifndef __NN__TRANSPOSE_H +#define __NN__TRANSPOSE_H + +#include + + +void NN__transpose_f32(size_t m, size_t n, + float *y, float *x + ); + + +#endif // __NN__TRANSPOSE_H diff --git a/nn/inc/nn_float16.h b/nn/inc/nn_float16.h deleted file mode 100644 index 64c509f..0000000 --- a/nn/inc/nn_float16.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef __NN_FLOAT16 -#define __NN_FLOAT16 - -#include -#include -#include - -#ifdef X86 - #include -#endif - - -typedef uint16_t float16_t; - -typedef union { - uint32_t i; - float f; -} float_uint32_union_t; - - -// from https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h - -static inline float NN_halfToFloat(float16_t h) { - // #if defined(__F16C__) - // // NB: The intel implementation does seem to treat NaN slightly - // // different than the original toFloat table does (i.e. where the - // // 1 bits are, meaning the signalling or not bits). This seems - // // benign, given that the original library didn't really deal with - // // signalling vs non-signalling NaNs - // #ifdef _MSC_VER - // /* msvc does not seem to have cvtsh_ss :( */ - // return _mm_cvtss_f32(_mm_cvtph_ps (_mm_set1_epi16 (h))); - // #else - // return _cvtsh_ss(h); - // #endif - // #else - float_uint32_union_t v; - // this code would be clearer, although it does appear to be faster - // (1.06 vs 1.08 ns/call) to avoid the constants and just do 4 - // shifts. - // - uint32_t hexpmant = ((uint32_t) (h) << 17) >> 4; - v.i = ((uint32_t) (h >> 15)) << 31; - - // the likely really does help if most of your numbers are "normal" half numbers - if ((hexpmant >= 0x00800000)) { - v.i |= hexpmant; - // either we are a normal number, in which case add in the bias difference - // otherwise make sure all exponent bits are set - if ((hexpmant < 0x0f800000)) { - v.i += 0x38000000; - } - else { - v.i |= 0x7f800000; - } - } - else if (hexpmant != 0) { - // exponent is 0 because we're denormal, don't have to extract - // the mantissa, can just use as is - // - // - // other compilers may provide count-leading-zeros primitives, - // but we need the community to inform us of the variants - uint32_t lc; - lc = 0; - while (0 == ((hexpmant << lc) & 0x80000000)) { - lc += 1; - } - lc -= 8; - // so nominally we want to remove that extra bit we shifted - // up, but we are going to add that bit back in, then subtract - // from it with the 0x38800000 - (lc << 23).... 
- // - // by combining, this allows us to skip the & operation (and - // remove a constant) - // - // hexpmant &= ~0x00800000; - v.i |= 0x38800000; - // lc is now x, where the desired exponent is then - // -14 - lc - // + 127 -> new exponent - v.i |= (hexpmant << lc); - v.i -= (lc << 23); - } - return v.f; - // #endif -} - -/// -/// Convert half to float -/// -/// Note: This only supports the "round to even" rounding mode, which -/// was the only mode supported by the original OpenEXR library -/// - -static inline float16_t NN_floatToHalf(float f) { - // #if defined(__F16C__) - // #ifdef _MSC_VER - // // msvc does not seem to have cvtsh_ss :( - // return _mm_extract_epi16 ( - // _mm_cvtps_ph ( - // _mm_set_ss (f), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), - // 0); - // #else - // // preserve the fixed rounding mode to nearest - // return _cvtss_sh (f, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - // #endif - // #else - float_uint32_union_t v; - float16_t ret; - uint32_t e, m, ui, r, shift; - - v.f = f; - - ui = (v.i & ~0x80000000); - ret = ((v.i >> 16) & 0x8000); - - // exponent large enough to result in a normal number, round and return - if (ui >= 0x38800000) { - // inf or nan - if (ui >= 0x7f800000) { - ret |= 0x7c00; - if (ui == 0x7f800000) { - return ret; - } - m = (ui & 0x7fffff) >> 13; - // make sure we have at least one bit after shift to preserve nan-ness - return ret | (uint16_t) m | (uint16_t) (m == 0); - } - - // too large, round to infinity - if (ui > 0x477fefff) { - return ret | 0x7c00; - } - - ui -= 0x38000000; - ui = ((ui + 0x00000fff + ((ui >> 13) & 1)) >> 13); - return ret | (uint16_t) ui; - } - - // zero or flush to 0 - if (ui < 0x33000001) { - return ret; - } - - // produce a denormalized half - e = (ui >> 23); - shift = 0x7e - e; - m = 0x800000 | (ui & 0x7fffff); - r = m << (32 - shift); - ret |= (m >> shift); - if (r > 0x80000000 || (r == 0x80000000 && (ret & 0x1) != 0)) { - ret += 1; - } - return ret; -// #endif -} - -#endif // __NN_FLOAT16 \ No newline at end of file diff --git a/nn/inc/nn_layernorm.h b/nn/inc/nn_layernorm.h deleted file mode 100644 index e99f4c2..0000000 --- a/nn/inc/nn_layernorm.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __NN_LAYERNORM_H -#define __NN_LAYERNORM_H - -#include -#include - -#include "nn_tensor.h" - - -void NN_LayerNorm( - Tensor *out, Tensor *in, - Tensor *weight, Tensor *bias, - const float eps); - - -#endif // __NN_LAYERNORM_H diff --git a/nn/inc/nn_matrixnorm.h b/nn/inc/nn_matrixnorm.h deleted file mode 100644 index ad8257a..0000000 --- a/nn/inc/nn_matrixnorm.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __NN_MATRIXNORM_H -#define __NN_MATRIXNORM_H - -#include -#include - -#include "nn_tensor.h" - - -/** - * Computes the Frobenius norm of a matrix. 
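For reference, the magic constants in the half/float converters fall out of the exponent bias difference: half precision uses bias 15 and single precision bias 127, so promoting a normal half adds (127 - 15) << 23 = 0x38000000 to the rebuilt exponent/mantissa word, and 0x7c00 / 0x7f800000 are the respective all-ones (inf/NaN) exponent fields. A tiny round-trip sketch using the snake_case names the new code adopts (the value shown is what round-to-nearest gives with a 10-bit mantissa):

    #include <stdio.h>

    /* Assumes NN_float_to_half / NN_half_to_float from the relocated nn_float16.h. */
    void half_roundtrip_demo(void) {
      float f = 3.14159265f;
      float16_t h = NN_float_to_half(f);   /* nearest representable half is 3.140625 (encoding 0x4248) */
      float back = NN_half_to_float(h);
      printf("%f -> %f\n", f, back);       /* 3.141593 -> 3.140625 */
    }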
- * - * @param tensor: the input tensor of shape (m, n) - */ -void NN_matrixNorm(Tensor *scalar, Tensor *x); - -void NN_matrixNorm_F32(Tensor *scalar, Tensor *x); - - -#endif // __NN_MATRIXNORM_H diff --git a/nn/inc/nn_print.h b/nn/inc/nn_print.h deleted file mode 100644 index cea4297..0000000 --- a/nn/inc/nn_print.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __NN_PRINT_H -#define __NN_PRINT_H - -#include - -#include "nn_tensor.h" - - -void NN_printFloat(float v, int16_t num_digits); - -void NN_printShape(Tensor *t); - -void NN_printf(Tensor *t); - - -#endif // __NN_PRINT_H diff --git a/nn/inc/ops/abs.h b/nn/inc/ops/abs.h deleted file mode 100644 index 39d3b6e..0000000 --- a/nn/inc/ops/abs.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef __NN__ABS_H -#define __NN__ABS_H - -#include -#include -#include - -#ifdef AVX - #include -#endif - -#ifdef RVV - #include -#endif - -static inline void NN__abs_I8(size_t n, int8_t *y, int8_t *x) { - #if defined(RVV) - while (n > 0) { - size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t vec_x = __riscv_vle8_v_i8m1(x, vl); - vint8m1_t vec_neg_x = __riscv_vneg_v_i8m1(vec_x, vl); - vbool8_t mask = __riscv_vmslt_vx_i8m1_b8(vec_x, 0, vl); - vint8m1_t vec_abs_x = __riscv_vmerge_vvm_i8m1(vec_x, vec_neg_x, mask, vl); - __riscv_vse8_v_i8m1(y, vec_abs_x, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = x[i] < 0 ? -x[i] : x[i]; - } - #endif -} - -static inline void NN__abs_I16(size_t n, int16_t *y, int16_t *x) { - #if defined(RVV) - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vint16m1_t vec_x = __riscv_vle16_v_i16m1(x, vl); - vint16m1_t vec_neg_x = __riscv_vneg_v_i16m1(vec_x, vl); - vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(vec_x, 0, vl); - vint16m1_t vec_abs_x = __riscv_vmerge_vvm_i16m1(vec_x, vec_neg_x, mask, vl); - __riscv_vse16_v_i16m1(y, vec_abs_x, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = x[i] < 0 ? -x[i] : x[i]; - } - #endif -} - -static inline void NN__abs_I32(size_t n, int32_t *y, int32_t *x) { - #if defined(RVV) - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vint32m1_t vec_x = __riscv_vle32_v_i32m1(x, vl); - vint32m1_t vec_neg_x = __riscv_vneg_v_i32m1(vec_x, vl); - vbool32_t mask = __riscv_vmslt_vx_i32m1_b32(vec_x, 0, vl); - vint32m1_t vec_abs_x = __riscv_vmerge_vvm_i32m1(vec_x, vec_neg_x, mask, vl); - __riscv_vse32_v_i32m1(y, vec_abs_x, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = x[i] < 0 ? -x[i] : x[i]; - } - #endif -} - -static inline void NN__abs_F16(size_t n, float16_t *y, float16_t *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] = NN_floatToHalf(fabsf(NN_halfToFloat(x[i]))); - } -} - -static inline void NN__abs_F32(size_t n, float *y, float *x) { - #if defined(AVX) - // Mask to clear the sign bit - __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); - - size_t vl = 8; - - while (n > 0) { - size_t count = n < vl ? 
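The Frobenius norm documented above is just the 2-norm of the matrix viewed as one long vector, so for a contiguous row-major matrix it can be expressed directly with the 1-D kernel declared in nn/impl/norm.h; a sketch:

    #include <stddef.h>

    /* Sketch; assumes unit-stride, row-major storage and the NN__norm_f32 prototype from nn/impl/norm.h. */
    void frobenius_norm_f32(size_t m, size_t n, float *result, float *a) {
      NN__norm_f32(m * n, result, a, 1);   /* sqrt of the sum of squares over all m*n entries */
    }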
n : vl; - // Load input values into an AVX register - __m256 vec_x = _mm256_loadu_ps(x); - // Compute the absolute values - __m256 vec_y = _mm256_and_ps(vec_x, mask); - // Store the result - _mm256_storeu_ps(y, vec_y); - x += count; - y += count; - n -= count; - } - #elif defined(RVV) - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vfabs_v_f32m1(vec_x, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = fabsf(x[i]); - } - #endif -} - -#endif // __NN__ABS_H diff --git a/nn/inc/ops/acc.h b/nn/inc/ops/acc.h deleted file mode 100644 index 87cfbbb..0000000 --- a/nn/inc/ops/acc.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __NN__ACC_H -#define __NN__ACC_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__acc_I8(size_t n, int8_t *y, int8_t *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] += x[i]; - } -} - -static inline void NN__acc_F32(size_t n, float *y, float *x) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vec_y = __riscv_vfadd_vv_f32m1(vec_y, vec_x, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] += x[i]; - } - #endif -} - -#endif // __NN__ACC_H diff --git a/nn/inc/ops/acc1.h b/nn/inc/ops/acc1.h deleted file mode 100644 index d0ca777..0000000 --- a/nn/inc/ops/acc1.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __NN__ACC1_H -#define __NN__ACC1_H - -#include - -#ifdef RVV - #include -#endif - -static inline void NN__acc1_F32(size_t n, float *y, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_v = __riscv_vfmv_v_f_f32m1(v, vl); - vec_y = __riscv_vfadd_vv_f32m1(vec_y, vec_v, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] += v; - } - #endif -} - -#endif // __NN__ADD1_H diff --git a/nn/inc/ops/add.h b/nn/inc/ops/add.h deleted file mode 100644 index e785b59..0000000 --- a/nn/inc/ops/add.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef __NN__ADD_H -#define __NN__ADD_H - -#include -#include - -#ifdef RVV - #include -#endif - -#include "nn_float16.h" - -static inline void NN__add_I8(size_t n, int8_t *z, int8_t *x, int8_t *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t vec_x = __riscv_vle8_v_i8m1(x, vl); - vint8m1_t vec_y = __riscv_vle8_v_i8m1(y, vl); - vint8m1_t vec_z = __riscv_vadd_vv_i8m1(vec_x, vec_y, vl); - __riscv_vse8_v_i8m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] + y[i]; - } - #endif -} - -static inline void NN__add_F16(size_t n, float16_t *z, float16_t *x, float16_t *y) { - for (size_t i = 0; i < n; i += 1) { - z[i] = NN_floatToHalf(NN_halfToFloat(x[i]) + NN_halfToFloat(y[i])); - } -} - -static inline void NN__add_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfadd_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - 
for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] + y[i]; - } - #endif -} - - -#endif // __NN__ADD_H diff --git a/nn/inc/ops/add1.h b/nn/inc/ops/add1.h deleted file mode 100644 index 84c015c..0000000 --- a/nn/inc/ops/add1.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __NN__ADD1_H -#define __NN__ADD1_H - -#include - -#ifdef RVV - #include -#endif - -static inline void NN__add1_F32(size_t n, float *z, float *x, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_v = __riscv_vfmv_v_f_f32m1(v, vl); - vfloat32m1_t vec_z = __riscv_vfadd_vv_f32m1(vec_x, vec_v, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] + v; - } - #endif -} - -#endif // __NN__ADD1_H diff --git a/nn/inc/ops/div.h b/nn/inc/ops/div.h deleted file mode 100644 index 8e43483..0000000 --- a/nn/inc/ops/div.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __NN__DIV_H -#define __NN__DIV_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__div_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfdiv_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] / y[i]; - } - #endif -} - - -#endif // __NN__DIV_H diff --git a/nn/inc/ops/dot.h b/nn/inc/ops/dot.h deleted file mode 100644 index 372d032..0000000 --- a/nn/inc/ops/dot.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef __NN__DOT_H -#define __NN__DOT_H - -#include - -#ifdef RVV - #include -#endif - -#include "nn_float16.h" - - -static inline void NN__dot_F16(size_t n, float16_t *s, float16_t *x, float16_t *y) { - float16_t sum = 0.0; - - #ifdef RVV - size_t vlmax = __riscv_vsetvlmax_e16m1(); - - vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax); - vfloat16m1_t vec_s = __riscv_vfmv_v_f_f16m1(0, vlmax); - - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x, vl); - vfloat16m1_t vec_y = __riscv_vle16_v_f16m1(y, vl); - vec_s = __riscv_vfmacc_vv_f16m1(vec_s, vec_x, vec_y, vl); - - x += vl; - y += vl; - n -= vl; - } - vec_s = __riscv_vfredusum_vs_f16m1_f16m1(vec_s, vec_zero, vlmax); - sum = __riscv_vfmv_f_s_f16m1_f16(vec_s); - #else - float sum_f32 = 0; - for (size_t i = 0; i < n; i += 1) { - sum_f32 += NN_halfToFloat(x[i]) * NN_halfToFloat(y[i]); - } - sum = NN_floatToHalf(sum_f32); - #endif - - *s = sum; -} - -static inline void NN__dot_F32(size_t n, float *s, float *x, float *y) { - float sum = 0.0; - - #ifdef RVV - size_t vlmax = __riscv_vsetvlmax_e32m1(); - - vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax); - vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(0, vlmax); - - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vec_s = __riscv_vfmacc_vv_f32m1(vec_s, vec_x, vec_y, vl); - - x += vl; - y += vl; - n -= vl; - } - vec_s = __riscv_vfredusum_vs_f32m1_f32m1(vec_s, vec_zero, vlmax); - sum = __riscv_vfmv_f_s_f32m1_f32(vec_s); - #else - for (size_t i = 0; i < n; i += 1) { - sum += x[i] * y[i]; - } - #endif - - *s = sum; -} - -#endif // __NN__DOT_H diff --git a/nn/inc/ops/fill.h b/nn/inc/ops/fill.h 
deleted file mode 100644 index 81fc0d9..0000000 --- a/nn/inc/ops/fill.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef __NN__FILL_H -#define __NN__FILL_H - -#include -#include - -#ifdef RVV - #include -#endif - -#include "nn_float16.h" - -inline static void NN__fill_U8(size_t n, uint8_t *x, uint8_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -inline static void NN__fill_I8(size_t n, int8_t *x, int8_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -inline static void NN__fill_U16(size_t n, uint16_t *x, uint16_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -inline static void NN__fill_I16(size_t n, int16_t *x, int16_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -inline static void NN__fill_I32(size_t n, int32_t *x, int32_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -inline static void NN__fill_F16(size_t n, float16_t *x, float16_t v) { - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } -} - -// inline static void NN__fill_BF16(size_t n, bfloat16_t * x, const bfloat16_t v) { -// for (size_t i = 0; i < n; i += 1) { -// x[i] = v; -// } -// } - -inline static void NN__fill_F32(size_t n, float *x, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vfmv_v_f_f32m1(v, vl); - __riscv_vse32_v_f32m1(x, vec_x, vl); - x += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - x[i] = v; - } - #endif -} - - -#endif // __NN__FILL_H diff --git a/nn/inc/ops/log.h b/nn/inc/ops/log.h deleted file mode 100644 index 8aaf9b3..0000000 --- a/nn/inc/ops/log.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NN__LOG_H -#define __NN__LOG_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__log_F32(size_t n, float *y, float *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] = logf(x[i]); - } -} - - -#endif // __NN__LOG_H diff --git a/nn/inc/ops/max.h b/nn/inc/ops/max.h deleted file mode 100644 index 4c33d84..0000000 --- a/nn/inc/ops/max.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __NN__MAX_H -#define __NN__MAX_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__max_F32(size_t n, float *s, float *x) { - float max = -FLT_MAX; - - #ifdef RVV - vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(max, 1); - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl); - x += vl; - n -= vl; - } - max = __riscv_vfmv_f_s_f32m1_f32(vec_max); - #else - for (size_t i = 0; i < n; i += 1) { - float val = x[i]; - max = val > max ? val : max; - } - #endif - - *s = max; -} - -#endif // __NN__MAX_H diff --git a/nn/inc/ops/maximum.h b/nn/inc/ops/maximum.h deleted file mode 100644 index 39180e3..0000000 --- a/nn/inc/ops/maximum.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __NN__MAXIMUM_H -#define __NN__MAXIMUM_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__maximum_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfmax_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - float x_val = x[i]; - float y_val = y[i]; - z[i] = x_val > y_val ? 
x_val : y_val; - } - #endif -} - -#endif // __NN__MAXIMUM_H diff --git a/nn/inc/ops/maximum1.h b/nn/inc/ops/maximum1.h deleted file mode 100644 index 26ec9d7..0000000 --- a/nn/inc/ops/maximum1.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __NN__MAXIMUM1_H -#define __NN__MAXIMUM1_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__maximum1_F32(size_t n, float *y, float *x, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_v = __riscv_vfmv_v_f_f32m1(v, vl); - vfloat32m1_t vec_y = __riscv_vfmax_vv_f32m1(vec_x, vec_v, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - float x_val = x[i]; - y[i] = x_val > v ? x_val : v; - } - #endif -} - -#endif // __NN__MAXIMUM1_H diff --git a/nn/inc/ops/min.h b/nn/inc/ops/min.h deleted file mode 100644 index 44f2b0f..0000000 --- a/nn/inc/ops/min.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __NN__MIN_H -#define __NN__MIN_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__min_F32(size_t n, float *s, float *x) { - float min = FLT_MAX; - - #ifdef RVV - vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(min, 1); - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl); - x += vl; - n -= vl; - } - min = __riscv_vfmv_f_s_f32m1_f32(vec_min); - #else - for (size_t i = 0; i < n; i += 1) { - float val = x[i]; - min = val < min ? val : min; - } - #endif - - *s = min; -} - -#endif // __NN__MIN_H diff --git a/nn/inc/ops/minimum.h b/nn/inc/ops/minimum.h deleted file mode 100644 index 674bf72..0000000 --- a/nn/inc/ops/minimum.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __NN__MINIMUM_H -#define __NN__MINIMUM_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__minimum_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfmin_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - float x_val = x[i]; - float y_val = y[i]; - z[i] = x_val < y_val ? x_val : y_val; - } - #endif -} - -#endif // __NN__MINIMUM_H diff --git a/nn/inc/ops/minimum1.h b/nn/inc/ops/minimum1.h deleted file mode 100644 index 1b81f2a..0000000 --- a/nn/inc/ops/minimum1.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __NN__MINIMUM1_H -#define __NN__MINIMUM1_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__minimum1_F32(size_t n, float *y, float *x, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_v = __riscv_vfmv_v_f_f32m1(v, vl); - vfloat32m1_t vec_y = __riscv_vfmin_vv_f32m1(vec_x, vec_v, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - float x_val = x[i]; - y[i] = x_val < v ? 
x_val : v; - } - #endif -} - -#endif // __NN__MINIMUM1_H diff --git a/nn/inc/ops/mul.h b/nn/inc/ops/mul.h deleted file mode 100644 index 5c0e4cf..0000000 --- a/nn/inc/ops/mul.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __NN__MUL_H -#define __NN__MUL_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__mul_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfmul_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] * y[i]; - } - #endif -} - - -#endif // __NN__MUL_H diff --git a/nn/inc/ops/mul1.h b/nn/inc/ops/mul1.h deleted file mode 100644 index f187b9a..0000000 --- a/nn/inc/ops/mul1.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __NN__MUL1_H -#define __NN__MUL1_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__mul1_F32(size_t n, float *y, float *x, float v) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_v = __riscv_vfmv_v_f_f32m1(v, vl); - vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x, vec_v, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = x[i] * v; - } - #endif -} - - -#endif // __NN__MUL1_H diff --git a/nn/inc/ops/neg.h b/nn/inc/ops/neg.h deleted file mode 100644 index 7f95421..0000000 --- a/nn/inc/ops/neg.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __NN__NEG_H -#define __NN__NEG_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__neg_F32(size_t n, float *y, float *x) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vfneg_v_f32m1(vec_x, vl); - __riscv_vse32_v_f32m1(y, vec_y, vl); - x += vl; - y += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - y[i] = -x[i]; - } - #endif -} - - -#endif // __NN__NEG_H diff --git a/nn/inc/ops/norm.h b/nn/inc/ops/norm.h deleted file mode 100644 index 035b04e..0000000 --- a/nn/inc/ops/norm.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __NN__NORM_H -#define __NN__NORM_H - -#include -#include -#ifdef RVV - #include -#endif - -#include "dot.h" - -static inline void NN__norm_F32(size_t n, float *s, float *x) { - NN__dot_F32(n, s, x, x); - *s = sqrtf(*s); -} - -static inline void NN__norm_inv_F32(size_t n, float *s, float *x) { - NN__norm_F32(n, s, x); - *s = 1.f/(*s); -} - - -#endif // __NN__NORM_H diff --git a/nn/inc/ops/rmsnorm.h b/nn/inc/ops/rmsnorm.h deleted file mode 100644 index fb667f8..0000000 --- a/nn/inc/ops/rmsnorm.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __NN__RMSNORM_H -#define __NN__RMSNORM_H - -#include -#include - -#ifdef RVV - #include -#endif - -void NN__rmsnorm_F32(size_t size, float* o, float* x, float* weight) { - // calculate sum of squares - float ss = 0.0f; - for (int j = 0; j < size; j++) { - ss += x[j] * x[j]; - } - ss /= size; - ss += 1e-5f; - ss = 1.0f / sqrtf(ss); - // normalize and scale - for (int j = 0; j < size; j++) { - o[j] = weight[j] * (ss * x[j]); - } -} - -#endif // __NN__RMSNORM_H diff --git a/nn/inc/ops/sgn.h b/nn/inc/ops/sgn.h deleted file mode 100644 index 830b857..0000000 --- a/nn/inc/ops/sgn.h +++ /dev/null @@ 
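The deleted rmsnorm.h defined NN__rmsnorm_F32 in the header without static or inline, so including it from more than one translation unit would have produced duplicate-symbol link errors; moving definitions into nn/impl/cpu with the weak-symbol convention avoids that. The math is unchanged: each element is scaled by the reciprocal root-mean-square of the vector (plus a small epsilon) and by a learned weight, y[j] = w[j] * x[j] / sqrt(mean(x^2) + eps). A reference sketch matching the new strided declaration in nn/impl/rmsnorm.h, assuming it keeps the 1e-5 epsilon used here (the new cpu/rmsnorm.c itself is not shown in this hunk):

    #include <math.h>
    #include <stddef.h>

    void rmsnorm_f32_ref(size_t n, float *y, size_t incy, float *x, size_t incx, float *w, size_t incw) {
      if (n == 0) return;
      float ss = 0.0f;
      for (size_t j = 0; j < n; j += 1) {
        ss += x[j * incx] * x[j * incx];
      }
      ss = 1.0f / sqrtf(ss / n + 1e-5f);          /* 1 / RMS */
      for (size_t j = 0; j < n; j += 1) {
        y[j * incy] = w[j * incw] * (ss * x[j * incx]);
      }
    }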
-1,19 +0,0 @@ -#ifndef __NN__SGN_H -#define __NN__SGN_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__sgn_F32(size_t n, float *y, float *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); - } -} - - -#endif // __NN__SGN_H diff --git a/nn/inc/ops/softmax.h b/nn/inc/ops/softmax.h deleted file mode 100644 index c2bd9cc..0000000 --- a/nn/inc/ops/softmax.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __NN__SOFTMAX_H -#define __NN__SOFTMAX_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__softmax_F32(size_t n, float *y, float *x, size_t stride) { - // exp and sum - float sum = 0.0f; - for (size_t i = 0; i < n * stride; i += stride) { - y[i] = expf(x[i]); - sum += y[i]; - } - // normalize - for (size_t i = 0; i < n * stride; i += stride) { - y[i] /= sum; - } -} - - -#endif // __NN__SOFTMAX_H diff --git a/nn/inc/ops/sqr.h b/nn/inc/ops/sqr.h deleted file mode 100644 index d829b1b..0000000 --- a/nn/inc/ops/sqr.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NN__SQR_H -#define __NN__SQR_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__sqr_F32(size_t n, float *y, float *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] = x[i] * x[i]; - } -} - - -#endif // __NN__SQR_H diff --git a/nn/inc/ops/sqrt.h b/nn/inc/ops/sqrt.h deleted file mode 100644 index 054edde..0000000 --- a/nn/inc/ops/sqrt.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NN__SQRT_H -#define __NN__SQRT_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__sqrt_F32(size_t n, float *y, float *x) { - for (size_t i = 0; i < n; i += 1) { - y[i] = sqrtf(x[i]); - } -} - - -#endif // __NN__SQRT_H diff --git a/nn/inc/ops/sub.h b/nn/inc/ops/sub.h deleted file mode 100644 index 9e5dd91..0000000 --- a/nn/inc/ops/sub.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef __NN__SUB_H -#define __NN__SUB_H - -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__sub_U8(size_t n, uint8_t *z, uint8_t *x, uint8_t *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e8m1(n); - vuint8m1_t vec_x = __riscv_vle8_v_u8m1(x, vl); - vuint8m1_t vec_y = __riscv_vle8_v_u8m1(y, vl); - vuint8m1_t vec_z = __riscv_vsub_vv_u8m1(vec_x, vec_y, vl); - __riscv_vse8_v_u8m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] - y[i]; - } - #endif -} - -static inline void NN__sub_I8(size_t n, int8_t *z, int8_t *x, int8_t *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t vec_x = __riscv_vle8_v_i8m1(x, vl); - vint8m1_t vec_y = __riscv_vle8_v_i8m1(y, vl); - vint8m1_t vec_z = __riscv_vsub_vv_i8m1(vec_x, vec_y, vl); - __riscv_vse8_v_i8m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] - y[i]; - } - #endif -} - -static inline void NN__sub_I16(size_t n, int16_t *z, int16_t *x, int16_t *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e16m1(n); - vint16m1_t vec_x = __riscv_vle16_v_i16m1(x, vl); - vint16m1_t vec_y = __riscv_vle16_v_i16m1(y, vl); - vint16m1_t vec_z = __riscv_vsub_vv_i16m1(vec_x, vec_y, vl); - __riscv_vse16_v_i16m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] - y[i]; - } - #endif -} - -static inline void NN__sub_I32(size_t n, int32_t *z, int32_t *x, int32_t *y) { - #ifdef RVV - while (n 
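The deleted softmax header took a single trailing stride (NN__softmax_F32(n, y, x, stride)), whereas the replacement pairs each pointer with its own stride, so call sites need both the rename and the argument reorder. A hypothetical migration:

    /* Assumes the NN__softmax_f32 prototype from nn/impl/softmax.h. */
    void softmax_call_site(void) {
      float logits[10] = {0}, probs[10];
      /* old API: NN__softmax_F32(10, probs, logits, 1); */
      NN__softmax_f32(10, probs, 1, logits, 1);   /* new API: separate output and input strides */
    }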
> 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vint32m1_t vec_x = __riscv_vle32_v_i32m1(x, vl); - vint32m1_t vec_y = __riscv_vle32_v_i32m1(y, vl); - vint32m1_t vec_z = __riscv_vsub_vv_i32m1(vec_x, vec_y, vl); - __riscv_vse32_v_i32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] - y[i]; - } - #endif -} - -static inline void NN__sub_F32(size_t n, float *z, float *x, float *y) { - #ifdef RVV - while (n > 0) { - size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - vfloat32m1_t vec_y = __riscv_vle32_v_f32m1(y, vl); - vfloat32m1_t vec_z = __riscv_vfsub_vv_f32m1(vec_x, vec_y, vl); - __riscv_vse32_v_f32m1(z, vec_z, vl); - x += vl; - y += vl; - z += vl; - n -= vl; - } - #else - for (size_t i = 0; i < n; i += 1) { - z[i] = x[i] - y[i]; - } - #endif -} - -#endif // __NN__SUB_H diff --git a/nn/inc/ops/sum.h b/nn/inc/ops/sum.h deleted file mode 100644 index 31da83d..0000000 --- a/nn/inc/ops/sum.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __NN__SUM_H -#define __NN__SUM_H - -#include -#include -#include - -#ifdef RVV - #include -#endif - -static inline void NN__sum_I16_to_I32(size_t n, uint32_t *s, int16_t *x) { - int32_t sum = 0; - for (size_t i = 0; i < n; i += 1) { - sum += (int32_t)x[i]; - } - *s = sum; -} - -static inline void NN__sum_I32(size_t n, uint32_t *s, int32_t *x) { - int32_t sum = 0; - for (size_t i = 0; i < n; i += 1) { - sum += x[i]; - } - *s = sum; -} - -static inline void NN__sum_F32(size_t n, float *s, float *x) { - float sum = 0.0; - for (size_t i = 0; i < n; i += 1) { - sum += (float)x[i]; - } - *s = sum; -} - - - -#endif // __NN__SUM_H diff --git a/nn/inc/ops/transpose.h b/nn/inc/ops/transpose.h deleted file mode 100644 index 55edada..0000000 --- a/nn/inc/ops/transpose.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __NN__TRANSPOSE_H -#define __NN__TRANSPOSE_H - -#include - -#ifdef RVV - #include -#endif - -static inline void NN__transpose_F32(size_t n, size_t m, float *y, float *x) { - #ifdef RVV - for (size_t i = 0; i < m; i += 1) { - size_t k = n; - while (k > 0) { - size_t vl = __riscv_vsetvl_e32m1(k); - vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl); - __riscv_vsse32_v_f32m1(y, sizeof(float) * m, vec_x, vl); - x += vl; - y += vl * m; - k -= vl; - } - } - #else - for (size_t i = 0; i < m; i += 1) { - for (size_t j = 0; j < n; j += 1) { - y[j * m + i] = x[i * n + j]; - } - } - #endif -}; - - -#endif // __NN__TRANSPOSE_H diff --git a/nn/inc/nn.h b/nn/nn.h similarity index 93% rename from nn/inc/nn.h rename to nn/nn.h index ad6b026..432aca8 100644 --- a/nn/inc/nn.h +++ b/nn/nn.h @@ -5,10 +5,11 @@ #include "nn_float16.h" #include "nn_tensor.h" +#include "nn_tensor_creation.h" #include "nn_print.h" #include "nn_abs.h" #include "nn_add.h" -#include "nn_batchnorm2d.h" +#include "nn_batch_norm2d.h" #include "nn_clip.h" #include "nn_conv2d.h" #include "nn_copy.h" @@ -16,10 +17,12 @@ #include "nn_elu.h" #include "nn_fill.h" #include "nn_interpolate.h" +#include "nn_layer_norm.h" #include "nn_linear.h" #include "nn_matmul.h" -#include "nn_matrixnorm.h" +#include "nn_norm.h" #include "nn_max.h" +#include "nn_mm.h" #include "nn_maximum.h" #include "nn_min.h" #include "nn_minimum.h" diff --git a/nn/nn_float16.h b/nn/nn_float16.h new file mode 100644 index 0000000..31beaa1 --- /dev/null +++ b/nn/nn_float16.h @@ -0,0 +1,154 @@ +#ifndef __NN_FLOAT16 +#define __NN_FLOAT16 + +#include +#include +#include + +#ifdef X86 + #include +#endif + + +#ifdef _Float16 + typedef _Float16 float16_t; 
+#else + typedef union { + uint32_t i; + float f; + } float_uint32_union_t; + + typedef uint16_t float16_t; +#endif + +/** + * Converts a half-precision floating-point number to a single-precision floating-point number. + * + * @param h The half-precision floating-point number to convert. + * @return The single-precision floating-point number. + */ +static inline float NN_half_to_float(float16_t h) { + #ifdef _Float16 + return (float)h; + #else + // from https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h + // Note: This only supports the "round to even" rounding mode, which + // was the only mode supported by the original OpenEXR library + + float_uint32_union_t v; + // this code would be clearer, although it does appear to be faster + // (1.06 vs 1.08 ns/call) to avoid the constants and just do 4 + // shifts. + // + uint32_t hexpmant = ((uint32_t) (h) << 17) >> 4; + v.i = ((uint32_t) (h >> 15)) << 31; + + // the likely really does help if most of your numbers are "normal" half numbers + if ((hexpmant >= 0x00800000)) { + v.i |= hexpmant; + // either we are a normal number, in which case add in the bias difference + // otherwise make sure all exponent bits are set + if ((hexpmant < 0x0f800000)) { + v.i += 0x38000000; + } + else { + v.i |= 0x7f800000; + } + } + else if (hexpmant != 0) { + // exponent is 0 because we're denormal, don't have to extract + // the mantissa, can just use as is + // + // other compilers may provide count-leading-zeros primitives, + // but we need the community to inform us of the variants + uint32_t lc; + lc = 0; + while (0 == ((hexpmant << lc) & 0x80000000)) { + lc += 1; + } + lc -= 8; + // so nominally we want to remove that extra bit we shifted + // up, but we are going to add that bit back in, then subtract + // from it with the 0x38800000 - (lc << 23).... + // + // by combining, this allows us to skip the & operation (and + // remove a constant) + // + // hexpmant &= ~0x00800000; + v.i |= 0x38800000; + // lc is now x, where the desired exponent is then + // -14 - lc + // + 127 -> new exponent + v.i |= (hexpmant << lc); + v.i -= (lc << 23); + } + return v.f; + #endif +} + + +/** + * Converts a single-precision floating-point number to a half-precision floating-point number. + * + * @param f The single-precision floating-point number to convert. + * @return The half-precision floating-point number. 
+ */ +static inline float16_t NN_float_to_half(float f) { + #ifdef _Float16 + return (_Float16)f; + #else + // from https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h + // Note: This only supports the "round to even" rounding mode, which + // was the only mode supported by the original OpenEXR library + + float_uint32_union_t v; + float16_t ret; + uint32_t e, m, ui, r, shift; + + v.f = f; + + ui = (v.i & ~0x80000000); + ret = ((v.i >> 16) & 0x8000); + + // exponent large enough to result in a normal number, round and return + if (ui >= 0x38800000) { + // inf or nan + if (ui >= 0x7f800000) { + ret |= 0x7c00; + if (ui == 0x7f800000) { + return ret; + } + m = (ui & 0x7fffff) >> 13; + // make sure we have at least one bit after shift to preserve nan-ness + return ret | (uint16_t) m | (uint16_t) (m == 0); + } + + // too large, round to infinity + if (ui > 0x477fefff) { + return ret | 0x7c00; + } + + ui -= 0x38000000; + ui = ((ui + 0x00000fff + ((ui >> 13) & 1)) >> 13); + return ret | (uint16_t) ui; + } + + // zero or flush to 0 + if (ui < 0x33000001) { + return ret; + } + + // produce a denormalized half + e = (ui >> 23); + shift = 0x7e - e; + m = 0x800000 | (ui & 0x7fffff); + r = m << (32 - shift); + ret |= (m >> shift); + if (r > 0x80000000 || (r == 0x80000000 && (ret & 0x1) != 0)) { + ret += 1; + } + return ret; + #endif +} + +#endif // __NN_FLOAT16 \ No newline at end of file diff --git a/nn/inc/nn_tensor.h b/nn/nn_tensor.h similarity index 56% rename from nn/inc/nn_tensor.h rename to nn/nn_tensor.h index ff24da2..deb3c95 100644 --- a/nn/inc/nn_tensor.h +++ b/nn/nn_tensor.h @@ -70,7 +70,7 @@ static inline size_t NN_sizeof(DataType dtype) { } } -static inline const char *NN_getDataTypeName(DataType dtype) { +static inline const char *NN_get_datatype_name(DataType dtype) { switch (dtype) { case DTYPE_U8: return "UINT8"; @@ -104,7 +104,7 @@ static inline const char *NN_getDataTypeName(DataType dtype) { * * @param tensor: the target tensor */ -static inline uint8_t NN_isScalar(Tensor *tensor) { +static inline uint8_t NN_is_scalar(Tensor *tensor) { return tensor->ndim == 0; } @@ -113,7 +113,7 @@ static inline uint8_t NN_isScalar(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline uint8_t NN_isVector(Tensor *tensor) { +static inline uint8_t NN_is_vector(Tensor *tensor) { return tensor->ndim == 1; } @@ -122,7 +122,7 @@ static inline uint8_t NN_isVector(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline uint8_t NN_isMatrix(Tensor *tensor) { +static inline uint8_t NN_is_matrix(Tensor *tensor) { return tensor->ndim == 2; } @@ -131,7 +131,7 @@ static inline uint8_t NN_isMatrix(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline uint8_t NN_is3D(Tensor *tensor) { +static inline uint8_t NN_is_3d(Tensor *tensor) { return tensor->ndim == 3; } @@ -140,7 +140,7 @@ static inline uint8_t NN_is3D(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline uint8_t NN_is4D(Tensor *tensor) { +static inline uint8_t NN_is_4d(Tensor *tensor) { return tensor->ndim == 4; } @@ -149,7 +149,7 @@ static inline uint8_t NN_is4D(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline void NN_freeTensorData(Tensor *tensor) { +static inline void NN_free_tensor_data(Tensor *tensor) { free(tensor->data); } @@ -158,77 +158,10 @@ static inline void NN_freeTensorData(Tensor *tensor) { * * @param tensor: the target tensor */ -static inline void NN_deleteTensor(Tensor *tensor) { +static inline void 
NN_delete_tensor(Tensor *tensor) { free(tensor); } -/** - * Initialize a given tensor - * - * The memory is initialized in C order, i.e., the last dimension is contiguous. - * - * @param ndim: number of dimensions - * @param shape: shape of tensor - * @param dtype: data type - * @param data: pointer to data, if NULL, the data will be allocated - */ -void NN_initTensor(Tensor *tensor, const size_t ndim, const size_t *shape, DataType dtype, void *data); - -/** - * Create a new tensor - * - * @param ndim: number of dimensions - * @param shape: shape of tensor - * @param dtype: data type - * @param data: pointer to data, if NULL, the data will be allocated - * @return Tensor -*/ -Tensor *NN_tensor(size_t ndim, const size_t *shape, DataType dtype, void *data); - -/** - * Returns a tensor filled with the scalar value 0. - * - * @param ndim: number of dimensions - * @param shape: shape of tensor - * @param dtype: data type - * @return Tensor - */ -Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype); - -/** - * Returns a tensor filled with the scalar value 1. - * - * @param ndim: number of dimensions - * @param shape: shape of tensor - * @param dtype: data type - * @return Tensor - */ -Tensor *NN_ones(size_t ndim, const size_t *shape, DataType dtype); - -/** - * Returns a tensor filled with random numbers from a uniform distribution. - * - * The range of the random number is dependent on the data type: - * - For Float32, the range is [0, 1] - * - For Int8, the range is [0, 255] - * - For Int32, the range is [0, RAND_MAX] - * - * @param ndim: number of dimensions - * @param shape: shape of tensor - * @param dtype: data type - * @return Tensor - */ -Tensor *NN_rand(size_t ndim, const size_t *shape, DataType dtype); - -/** - * Returns this tensor cast to the type of the given tensor. - * - * This is equivalent to NN_copy() if the data types are the same. 
- * - * @param out: the output tensor - * @param in: the input tensor - */ -void NN_asType(Tensor *out, Tensor *in); #endif // __NN_TENSOR \ No newline at end of file diff --git a/nn/nn_todo b/nn/nn_todo index d5e6c3a..10b9f76 100644 --- a/nn/nn_todo +++ b/nn/nn_todo @@ -78,7 +78,7 @@ size_t NN_argmax(Matrix *a) { * ====== Operators ====== */ -void NN_Linear(Matrix *out, Matrix *weight, Matrix *bias, Matrix *input) { +void NN_linear(Matrix *out, Matrix *weight, Matrix *bias, Matrix *input) { NN_matmul(out, input, weight); NN_matadd(out, out, bias); } diff --git a/nn/inc/rv.h b/nn/rv.h similarity index 100% rename from nn/inc/rv.h rename to nn/rv.h diff --git a/nn/src/nn_copy.c b/nn/src/nn_copy.c deleted file mode 100644 index 59a8651..0000000 --- a/nn/src/nn_copy.c +++ /dev/null @@ -1,10 +0,0 @@ - -#include "nn_copy.h" - -void NN_copy(Tensor *dst, Tensor *src) { - assert(dst->ndim == src->ndim); - assert(dst->dtype == src->dtype); - assert(dst->size == src->size); - - memcpy(dst->data, src->data, dst->size * NN_sizeof(dst->dtype)); -} diff --git a/nn/src/nn_layernorm.c b/nn/src/nn_layernorm.c deleted file mode 100644 index 250ff2f..0000000 --- a/nn/src/nn_layernorm.c +++ /dev/null @@ -1,29 +0,0 @@ - -#include "nn_layernorm.h" - -void NN_LayerNorm( - Tensor *out, Tensor *in, - Tensor *weight, Tensor *bias, - const float eps) { - assert(out->dtype == in->dtype && in->dtype == DTYPE_F32); - assert(out->ndim == in->ndim); - - size_t N = in->shape[1]; - for (size_t i = 0; i < in->shape[0]; i++) { - float mean = 0; - for (size_t j = 0; j < N; j++) { - mean += ((float *)in->data)[i * N + j]; - } - mean /= N; - - float variance = 0; - for (size_t j = 0; j < N; j++) { - variance += powf(((float *)in->data)[i * N + j] - mean, 2); - } - variance /= N; - - for (size_t j = 0; j < N; j++) { - ((float *)out->data)[i * N + j] = ((float *)weight->data)[j] * (((float *)in->data)[i * N + j] - mean) / sqrtf(variance + eps) + ((float *)bias)[j]; - } - } -} diff --git a/nn/src/nn_linear.c b/nn/src/nn_linear.c deleted file mode 100644 index e9017a1..0000000 --- a/nn/src/nn_linear.c +++ /dev/null @@ -1,11 +0,0 @@ - -#include "nn_linear.h" - - -void NN_Linear(Tensor *y, Tensor *x, Tensor *w, Tensor *b) { - NN_matmulT(y, x, w); - - if (b != NULL) { - NN_add(y, y, b); - } -} diff --git a/nn/src/nn_matmul.c b/nn/src/nn_matmul.c deleted file mode 100644 index 0f1bd1f..0000000 --- a/nn/src/nn_matmul.c +++ /dev/null @@ -1,165 +0,0 @@ - -#include "nn_matmul.h" - -#ifdef GEMMINI - #include - #include - #include - - #include "gemmini/gemmini.h" -#endif - -void NN_matmul(Tensor *out, Tensor *a, Tensor *b) { - - #ifdef GEMMINI - // // This function runs a tiled matrix multiplication, with automatically - // // calculated tiling factors - // static void tiled_matmul_auto(size_t dim_I, size_t dim_J, size_t dim_K, - // const elem_t* A, const elem_t* B, - // const void * D, void * C, - // size_t stride_A, size_t stride_B, size_t stride_D, size_t stride_C, - // scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor, - // int act, acc_scale_t scale, acc_scale_t bert_scale, - // bool repeating_bias, - // bool transpose_A, bool transpose_B, - // bool full_C, bool low_D, - // uint8_t weightA, - // enum tiled_matmul_type_t tiled_matmul_type) { - - size_t dim_I = a->shape[0]; - size_t dim_J = b->shape[1]; - size_t dim_K = a->shape[1]; - - size_t stride_A = dim_K; - size_t stride_B = dim_J; - size_t stride_D = dim_J; - size_t stride_C = dim_J; - - scale_t A_scale_factor = 1.0; - scale_t B_scale_factor = 1.0; - 
scale_acc_t D_scale_factor = 1.0; - - int act = 0; - acc_scale_t scale = 1.0; - acc_scale_t bert_scale = 1.0; - - bool repeating_bias = false; - bool transpose_A = false; - bool transpose_B = false; - bool full_C = false; - bool low_D = false; - - tiled_matmul_auto(dim_I, dim_J, dim_K, - a->data, b->data, - NULL, out->data, - stride_A, stride_B, stride_D, stride_C, - MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, - NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, - repeating_bias, - transpose_A, transpose_B, - full_C, low_D, - 0, - WS); - - return; - #endif - - if (a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32 && out->dtype == DTYPE_F32) { - // currently only support 2D matrix multiplication - assert(a->ndim == 2); - assert(b->ndim == 2); - assert(a->dtype == DTYPE_F32); - assert(b->dtype == DTYPE_F32); - assert(out->dtype == DTYPE_F32); - assert(a->shape[1] == b->shape[0]); - assert(out->shape[0] == a->shape[0]); - assert(out->shape[1] == b->shape[1]); - - for (size_t i = 0; i < out->shape[0]; i += 1) { - for (size_t j = 0; j < out->shape[1]; j += 1) { - float sum = 0; - for (size_t k = 0; k < a->shape[1]; k += 1) { - sum += ((float *)a->data)[i * a->shape[1] + k] * ((float *)b->data)[k * b->shape[1] + j]; - } - ((float *)out->data)[i * out->shape[1] + j] = sum; - } - } - return; - } - if (a->dtype == DTYPE_F16 && b->dtype == DTYPE_F16 && out->dtype == DTYPE_F16) { - // currently only support 2D matrix multiplication - assert(a->ndim == 2); - assert(b->ndim == 2); - assert(a->dtype == DTYPE_F16); - assert(b->dtype == DTYPE_F16); - assert(out->dtype == DTYPE_F16); - assert(a->shape[1] == b->shape[0]); - assert(out->shape[0] == a->shape[0]); - assert(out->shape[1] == b->shape[1]); - - for (size_t i = 0; i < out->shape[0]; i += 1) { - for (size_t j = 0; j < out->shape[1]; j += 1) { - float sum = 0; - for (size_t k = 0; k < a->shape[1]; k += 1) { - sum += NN_halfToFloat(((float16_t *)a->data)[i * a->shape[1] + k]) * NN_halfToFloat(((float16_t *)b->data)[k * b->shape[1] + j]); - } - ((float16_t *)out->data)[i * out->shape[1] + j] = NN_floatToHalf(sum); - } - } - return; - } - printf("Unsupported operation: %s = %s @ %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) - ); -} - -void NN_matmulT(Tensor *out, Tensor *a, Tensor *b) { - if (a->dtype == DTYPE_F16 && b->dtype == DTYPE_F16 && out->dtype == DTYPE_F16) { - // currently only support 2D matrix multiplication - assert(a->ndim == 2); - assert(b->ndim == 2); - assert(a->dtype == DTYPE_F16); - assert(b->dtype == DTYPE_F16); - assert(out->dtype == DTYPE_F16); - assert(a->shape[1] == b->shape[1]); - assert(out->shape[0] == a->shape[0]); - assert(out->shape[1] == b->shape[0]); - - for (size_t i = 0; i < out->shape[0]; i += 1) { - for (size_t j = 0; j < out->shape[1]; j += 1) { - NN__dot_F16(a->shape[1], - (float16_t *)out->data + i * out->shape[1] + j, - (float16_t *)a->data + i * a->shape[1], - (float16_t *)b->data + j * b->shape[1] - ); - } - } - return; - } - if (a->dtype == DTYPE_F32 && b->dtype == DTYPE_F32 && out->dtype == DTYPE_F32) { - // currently only support 2D matrix multiplication - assert(a->ndim == 2); - assert(b->ndim == 2); - assert(a->dtype == DTYPE_F32); - assert(b->dtype == DTYPE_F32); - assert(out->dtype == DTYPE_F32); - assert(a->shape[1] == b->shape[1]); - assert(out->shape[0] == a->shape[0]); - assert(out->shape[1] == b->shape[0]); - - for (size_t i = 0; i < out->shape[0]; i += 1) { - for (size_t j = 0; j < out->shape[1]; j += 1) { - NN__dot_F32(a->shape[1], - (float 
*)out->data + i * out->shape[1] + j, - (float *)a->data + i * a->shape[1], - (float *)b->data + j * b->shape[1] - ); - } - } - return; - } - printf("Unsupported operation: %s = %s @ %s\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype) - ); -} - diff --git a/nn/src/nn_relu.c b/nn/src/nn_relu.c deleted file mode 100644 index 9025728..0000000 --- a/nn/src/nn_relu.c +++ /dev/null @@ -1,26 +0,0 @@ - -#include "nn_relu.h" - - -void NN_ReLU(Tensor *y, Tensor *x) { - assert(y->ndim == x->ndim); - assert(y->dtype == x->dtype); - assert(y->size == x->size); - - switch (y->dtype) { - case DTYPE_F32: - NN__maximum1_F32(y->size, (float *)y->data, (float *)x->data, 0.0f); - return; - - default: - break; - } - - printf("[ERROR] Unsupported operation between tensor with dtype %s = ReLU(%s)\n", - NN_getDataTypeName(y->dtype), NN_getDataTypeName(x->dtype) - ); -} - -void NN_ReLUInplace(Tensor *x) { - NN_ReLU(x, x); -} diff --git a/nn/src/nn_relu6.c b/nn/src/nn_relu6.c deleted file mode 100644 index 099729d..0000000 --- a/nn/src/nn_relu6.c +++ /dev/null @@ -1,27 +0,0 @@ - -#include "nn_relu6.h" - - -void NN_ReLU6(Tensor *y, Tensor *x) { - assert(y->ndim == x->ndim); - assert(y->dtype == x->dtype); - assert(y->size == x->size); - - switch (y->dtype) { - case DTYPE_F32: - NN__maximum1_F32(y->size, (float *)y->data, (float *)x->data, 0.0f); - NN__minimum1_F32(y->size, (float *)y->data, (float *)y->data, 6.0f); - return; - - default: - break; - } - - printf("[ERROR] Unsupported operation between tensor with dtype %s = ReLU(%s)\n", - NN_getDataTypeName(y->dtype), NN_getDataTypeName(x->dtype) - ); -} - -void NN_ReLU6Inplace(Tensor *x) { - NN_ReLU6(x, x); -} diff --git a/nn/src/nn_sum.c b/nn/src/nn_sum.c deleted file mode 100644 index 806a00c..0000000 --- a/nn/src/nn_sum.c +++ /dev/null @@ -1,34 +0,0 @@ - -#include "nn_sum.h" - - -void NN_sum(Tensor *out, Tensor *tensor) { - switch (tensor->dtype) { - case DTYPE_I16: - switch (out->dtype) { - case DTYPE_I32: - NN__sum_I16_to_I32(tensor->size, (int32_t *)out->data, (int16_t *)tensor->data); - return; - } - break; - - case DTYPE_I32: - switch (out->dtype) { - case DTYPE_I32: - NN__sum_I32(tensor->size, (int32_t *)out->data, (int32_t *)tensor->data); - return; - } - break; - - case DTYPE_F32: - NN__sum_F32(tensor->size, (float *)out->data, (float *)tensor->data); - return; - - default: - break; - } - - printf("[ERROR] Unsupported operation of tensor with dtype %s = sum(%s)\n", - NN_getDataTypeName(out->dtype), NN_getDataTypeName(tensor->dtype) - ); -} diff --git a/nn/src/nn_tensor.c b/nn/src/nn_tensor.c deleted file mode 100644 index 8015384..0000000 --- a/nn/src/nn_tensor.c +++ /dev/null @@ -1,137 +0,0 @@ - -#include "nn_tensor.h" - - -void NN_initTensor(Tensor *tensor, const size_t ndim, const size_t *shape, DataType dtype, void *data) { - tensor->dtype = dtype; - tensor->ndim = ndim; - - // set shape - memcpy(tensor->shape, shape, ndim * sizeof(size_t)); - memset(tensor->shape + ndim, 0, (MAX_DIMS - ndim) * sizeof(size_t)); - - // calculate size (number of elements) - tensor->size = 1; - for (size_t i = 0; i < ndim; i += 1) { - tensor->size *= shape[i]; - } - - if (data != NULL) { - tensor->data = data; - return; - } - - // if this is a scalar tensor - if (tensor->ndim == 0) { - tensor->data = malloc(NN_sizeof(dtype)); - return; - } - - tensor->data = malloc(NN_sizeof(dtype) * tensor->size); -} - -Tensor *NN_tensor(size_t ndim, const size_t *shape, DataType dtype, void *data) { - Tensor *t = (Tensor 
*)malloc(sizeof(Tensor)); - NN_initTensor(t, ndim, shape, dtype, data); - return t; -} - -void NN_asType(Tensor *out, Tensor *in) { - if (out->dtype == in->dtype) { - NN_copy(out, in); - return; - } - - switch (in->dtype) { - case DTYPE_U8: - switch (out->dtype) { - case DTYPE_U16: - for (size_t i = 0; i < in->size; i += 1) { - ((uint16_t *)out->data)[i] = (uint16_t)((uint8_t *)in->data)[i]; - } - return; - case DTYPE_U32: - for (size_t i = 0; i < in->size; i += 1) { - ((uint32_t *)out->data)[i] = (uint32_t)((uint8_t *)in->data)[i]; - } - return; - case DTYPE_I32: - for (size_t i = 0; i < in->size; i += 1) { - ((int32_t *)out->data)[i] = (int32_t)((uint8_t *)in->data)[i]; - } - return; - } - break; - - case DTYPE_I8: - switch (out->dtype) { - case DTYPE_I16: - for (size_t i = 0; i < in->size; i += 1) { - ((int16_t *)out->data)[i] = (int16_t)((int8_t *)in->data)[i]; - } - return; - case DTYPE_I32: - for (size_t i = 0; i < in->size; i += 1) { - ((int32_t *)out->data)[i] = (int32_t)((int8_t *)in->data)[i]; - } - return; - case DTYPE_F32: - for (size_t i = 0; i < in->size; i += 1) { - ((float *)out->data)[i] = (float)((int8_t *)in->data)[i]; - } - return; - } - break; - - case DTYPE_I16: - switch (out->dtype) { - case DTYPE_I32: - for (size_t i = 0; i < in->size; i += 1) { - ((int32_t *)out->data)[i] = (int32_t)((int16_t *)in->data)[i]; - } - return; - } - break; - - case DTYPE_I32: - switch (out->dtype) { - case DTYPE_I8: - for (size_t i = 0; i < in->size; i += 1) { - ((int8_t *)out->data)[i] = (int8_t)((int32_t *)in->data)[i]; - } - return; - case DTYPE_F32: - for (size_t i = 0; i < in->size; i += 1) { - ((float *)out->data)[i] = (float)((int32_t *)in->data)[i]; - } - return; - } - break; - - case DTYPE_F16: - switch (out->dtype) { - case DTYPE_F32: - for (size_t i = 0; i < in->size; i += 1) { - ((float *)out->data)[i] = NN_halfToFloat(((float16_t *)in->data)[i]); - } - return; - } - break; - - case DTYPE_F32: - switch (out->dtype) { - case DTYPE_I32: - for (size_t i = 0; i < in->size; i += 1) { - ((int32_t *)out->data)[i] = (int32_t)((float *)in->data)[i]; - } - return; - case DTYPE_F16: - for (size_t i = 0; i < in->size; i += 1) { - ((float16_t *)out->data)[i] = NN_floatToHalf(((float *)in->data)[i]); - } - return; - } - break; - } - printf("[ERROR] Cannot convert data type from %s to %s\n", NN_getDataTypeName(in->dtype), NN_getDataTypeName(out->dtype)); -} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 628766c..20d1c72 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -32,7 +32,11 @@ elseif (RISCV) target_link_libraries(test PUBLIC target-riscv) endif() -add_compile_options(-O3 -Wall -Wextra) +add_compile_options(-O1 -Wall -Wextra) + + +# add -mfp16-format=ieee +# add_compile_options(-mavx512fp16) target_compile_options(test PRIVATE -u _printf_float) diff --git a/tests/src/generate_test.py b/tests/src/generate_test.py index fe51bdf..554067d 100644 --- a/tests/src/generate_test.py +++ b/tests/src/generate_test.py @@ -3,8 +3,13 @@ import torch import jinja2 -# seed +# see if we have a GPU +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print("Using device: {}".format(device)) + + +# seed seed = 0 random.seed(seed) @@ -20,10 +25,10 @@ def rand(shape): - return (torch.rand(shape, dtype=torch.float32) - 0.5) * 10 + return (torch.rand(shape, dtype=torch.float32, device=device) - 0.5) * 10 def rand16(shape): - return (torch.rand(shape, dtype=torch.float16) - 0.5) * 10 + return (torch.rand(shape, dtype=torch.float16, device=device) - 0.5) * 2 
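(An orientation note for the test_pattern table that follows; the code below is an illustrative sketch, not output of the generator.) In the reworked generate_test.py, an input tuple whose value is a tensor or float is materialized in C with its bytes embedded via format_tensor, while a tuple whose value is None never reaches the PyTorch reference call; its name string is instead spliced verbatim into the emitted NN_* call as a literal argument. Hand-expanding the (currently commented-out) layer_norm pattern under that rule would give roughly the test below. The NN_layer_norm arity and the use of NN_rand/NN_zeros in place of embedded byte arrays are assumptions made to keep the sketch short.

    #include <stdio.h>
    #include "nn.h"
    #include "unittest.h"

    int main(void) {
        enable_accelerator_features();
        size_t cycles = 0;

        // The real generator embeds the exact tensor bytes; NN_rand/NN_zeros are stand-ins here.
        Tensor *x = NN_rand(2, (size_t[]){6, 5}, DTYPE_F32);
        Tensor *w = NN_rand(1, (size_t[]){5}, DTYPE_F32);
        Tensor *b = NN_zeros(1, (size_t[]){5}, DTYPE_F32);
        // Placeholder: the generator fills golden with the PyTorch result, so this skeleton reports FAIL as-is.
        Tensor *golden = NN_zeros(2, (size_t[]){6, 5}, DTYPE_F32);
        Tensor *actual = NN_zeros(2, (size_t[]){6, 5}, DTYPE_F32);

        cycles = read_cycles();
        // "1" and "1e-05" come straight from the ("1", None) and ("1e-05", None) entries
        NN_layer_norm(actual, x, 1, w, b, 1e-05);
        cycles = read_cycles() - cycles;
        printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-4) ? "PASS" : "FAIL", cycles);

        // every tensor above was heap-allocated, so free both data and handles
        NN_free_tensor_data(x); NN_delete_tensor(x);
        NN_free_tensor_data(w); NN_delete_tensor(w);
        NN_free_tensor_data(b); NN_delete_tensor(b);
        NN_free_tensor_data(golden); NN_delete_tensor(golden);
        NN_free_tensor_data(actual); NN_delete_tensor(actual);
        return 0;
    }
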
test_pattern = [ @@ -31,13 +36,14 @@ def rand16(shape): # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((6, 7))) ]), # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((1, 7))) ]), # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((6, 1))) ]), - # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((7, ))) ]), - # ("addInplace", lambda a, b: a + b, [("actual", torch.zeros((7, 7))), ("b", rand((7, 7))) ]), + # ("add", lambda a, b: a + b, [("a", rand((6, 7))), ("b", rand((7, ))) ]), + # ("add_inplace", lambda a, b: a + b, [("actual", torch.zeros((7, 7))), ("b", rand((7, 7))) ]), # ("add1", lambda a, b: a + b, [("a", rand((7, 7))), ("v", random.random()) ]), - # ("clip", lambda a, v_min, v_max: torch.clip(a, v_min, v_max), [("a", rand((7, 7))), ("v_min", random.random() - 1), ("v_max", random.random())]), + # ("clip", lambda a, v_min, v_max: torch.clip(a, v_min, v_max), + # [("a", rand((7, 7))), ("v_min", random.random() - 1), ("v_max", random.random()) ]), # ("div", lambda a, b: a / b, [("a", rand((7, 7))), ("b", rand((7, 7))) ]), # ("fill", lambda a, v: a.fill_(v), [("actual", torch.zeros((7, 7))), ("v", random.random()) ]), - # ("matmulT", lambda a, b: a @ b.T, [("a", rand((6, 7))), ("b", rand((5, 7))) ]), + # ("matmul_t", lambda a, b: a @ b.T, [("a", rand((6, 7))), ("b", rand((5, 7))) ]), # ("matmul", lambda a, b: a @ b, [("a", rand((6, 7))), ("b", rand((7, 5))) ]), # ("max", lambda a: torch.max(a), [("a", rand((7, 7))) ]), # ("maximum", lambda a, b: torch.maximum(a, b), [("a", rand((7, 7))), ("b", rand((7, 7))) ]), @@ -49,42 +55,50 @@ def rand16(shape): # ("sub", lambda a, b: a - b, [("a", rand((7, 7))), ("b", rand((7, 7))) ]), # ("sum", lambda a: torch.sum(a), [("a", rand((7, 7))), ]), - # ("Linear", lambda x, w, b: torch.nn.functional.linear(x, w, b), - # [("x", rand((6, 7))), ("w", rand((5, 7))), ("b", rand((1, 5))) ]), - # ("Linear", lambda x, w, b: torch.nn.functional.linear(x, w, b), - # [("x", rand((6, 7))), ("w", rand((5, 7))), ("b", rand((5, ))) ]), - # ("ReLU", lambda x: torch.nn.functional.relu(x), + # ("linear", lambda x, w, b: torch.nn.functional.linear(x, w, b), + # [("x", rand((6, 7))), ("w", rand((5, 7))), ("b", rand((1, 5))) ]), + # ("linear", lambda x, w, b: torch.nn.functional.linear(x, w, b), + # [("x", rand((6, 7))), ("w", rand((5, 7))), ("b", rand((5, ))) ]), + # ("relu", lambda x: torch.nn.functional.relu(x), # [("x", rand((7, 7))) ]), - # ("Softmax", lambda a: torch.nn.functional.softmax(a, dim=0), - # [("x", torch.ones((7, 7))) ], - # ", 0" ), - # ("Softmax", lambda a: torch.nn.functional.softmax(a, dim=1), - # [("x", torch.ones((7, 7))) ], - # ", 1" ), - # ("ReLU6", lambda x: torch.nn.functional.relu6(x), + # ("softmax", lambda a: torch.nn.functional.softmax(a, dim=0), + # [("x", rand((7, 7))+1), ("0", None) ]), + # ("softmax", lambda a: torch.nn.functional.softmax(a, dim=1), + # [("x", rand((7, 7))+1), ("1", None) ]), + # ("softmax", lambda a: torch.nn.functional.softmax(a, dim=-1), + # [("x", rand((7, 7))+1), ("-1", None) ]), + # ("relu6", lambda x: torch.nn.functional.relu6(x), # [("x", rand((7, 7))) ]), - # ("Conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=0, dilation=1, groups=1).permute((0, 2, 3, 1)), - # [("x", rand((1, 16, 16, 3))), ("w", rand((3, 3, 3, 6))), ("b", rand((6, )))], - # ", (size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1" ), - # ("Conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), 
w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=1).permute((0, 2, 3, 1)), - # [("x", rand((1, 16, 16, 3))), ("w", rand((3, 3, 3, 71))), ("b", rand((71, )))], - # ", (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 1" ), - # ("NCHWToNHWC", lambda x: x.permute((0, 2, 3, 1)), [("x", rand((1, 2, 3, 3))) ]), - # ("NHWCToNCHW", lambda x: x.permute((0, 3, 1, 2)), [("x", rand((1, 3, 3, 2))) ]), - # ("Conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=16).permute((0, 2, 3, 1)), - # [("x", rand((1, 12, 12, 16))), ("w", rand((3, 3, 1, 16))), ("b", rand((16, )))], - # ", (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 16" ), - # ("Conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=1).permute((0, 2, 3, 1)), - # [("x", rand((1, 12, 12, 16))), ("w", rand((3, 3, 16, 56))), ("b", rand((56, )))], - # ", (size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 1" ), - # ("LayerNorm", lambda x, w, b: torch.nn.functional.layer_norm(x, x.shape, w, b, eps=1e-05), - # [("x", rand((6, 5))), ("w", rand((6, 5))), ("b", rand((6, 5))) ], - # ", 1e-05" ), - - # ("abs", lambda a: torch.abs(a), [("a", rand16((1, 4))), ]), + # ("conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=0, dilation=1, groups=1).permute((0, 2, 3, 1)), + # [("x", rand((1, 16, 16, 3))), ("w", rand((3, 3, 3, 6))), ("b", rand((6, ))), + # ("(size_t[]){1, 1}, (size_t[]){0, 0}, (size_t[]){1, 1}, 1", None) ]), + # ("conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=1).permute((0, 2, 3, 1)), + # [("x", rand((1, 16, 16, 3))), ("w", rand((3, 3, 3, 71))), ("b", rand((71, ))), + # ("(size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 1", None) ]), + # ("nchw_to_nhwc", lambda x: x.permute((0, 2, 3, 1)), [("x", rand((1, 2, 3, 3))) ]), + # ("nhwc_to_nchw", lambda x: x.permute((0, 3, 1, 2)), [("x", rand((1, 3, 3, 2))) ]), + # ("conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=16).permute((0, 2, 3, 1)), + # [("x", rand((1, 12, 12, 16))), ("w", rand((3, 3, 1, 16))), ("b", rand((16, ))), + # ("(size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 16", None) ]), + # ("conv2d", lambda x, w, b: torch.nn.functional.conv2d(x.permute((0, 3, 1, 2)), w.permute((3, 2, 0, 1)), b, stride=1, padding=1, dilation=1, groups=1).permute((0, 2, 3, 1)), + # [("x", rand((1, 12, 12, 16))), ("w", rand((3, 3, 16, 56))), ("b", rand((56, ))), + # ("(size_t[]){1, 1}, (size_t[]){1, 1}, (size_t[]){1, 1}, 1", None) ]), + + # ("layer_norm", lambda x, w, b: torch.nn.functional.layer_norm(x, (x.shape[1], ), w, b, eps=1e-05), + # [("x", rand((6, 5))), ("1", None), ("w", rand((5))), ("b", torch.zeros((5))), ("1e-05", None) ]), + # ("layer_norm", lambda x, w, b: torch.nn.functional.layer_norm(x, (x.shape[1], ), w, b, eps=1e-05), + # [("x", rand((6, 5))), ("1", None), ("w", rand((5))), ("b", rand((5))), ("1e-05", None) ]), + + ("abs", lambda a: torch.abs(a), [("a", rand16((1, 7))), ]), ("add", lambda a, b: a + b, [("a", rand16((6, 7))), ("b", rand16((6, 7))) ]), - ("matmulT", lambda a, b: a @ b.T, [("a", rand16((6, 7))), ("b", rand16((5, 7))) ]), + ("matmul_t", lambda a, b: a @ b.T, [("a", rand16((6, 7))), ("b", rand16((5, 7))) 
]), ("matmul", lambda a, b: a @ b, [("a", rand16((6, 7))), ("b", rand16((7, 5))) ]), + + ("linear", lambda x, w, b: torch.nn.functional.linear(x, w, b), + [("x", rand16((6, 7))), ("w", rand16((5, 7))), ("b", rand16((1, 5))) ]), + ("relu", lambda x: torch.nn.functional.relu(x), + [("x", rand16((7, 7))) ]), + ] @@ -102,7 +116,7 @@ def rand16(shape): #include "unittest.h" int main() { - enableAcceleratorFeatures(); + enable_accelerator_features(); size_t cycles = 0; @@ -112,7 +126,7 @@ def rand16(shape): """ -def typeToStr(dtype: torch.dtype): +def type_to_str(dtype: torch.dtype): if dtype == torch.float16: return "DTYPE_F16" elif dtype == torch.float32: @@ -120,12 +134,13 @@ def typeToStr(dtype: torch.dtype): -def formatTensor(name: str, tensor: torch.Tensor): +def format_tensor(name: str, tensor: torch.Tensor): dim = len(tensor.shape) shape = ", ".join([str(s) for s in tensor.shape]) - dtype = typeToStr(tensor.dtype) - data = ",".join([hex(b) for b in tensor.contiguous().numpy().flatten().tobytes()]) - human_readable = str(tensor.contiguous().numpy()).replace("\n", " ")[:80] + dtype = type_to_str(tensor.dtype) + data_np = tensor.cpu().contiguous().numpy() + data = ",".join([hex(b) for b in data_np.flatten().tobytes()]) + human_readable = str(data_np).replace("\n", " ")[:80] tensor_str = env.from_string(""" // {{ human_readable }} Tensor *{{ name }} = NN_tensor({{ dim }}, (size_t[]){ {{ shape }} }, {{ dtype }}, (uint8_t[]){ {{ data }} });""").render( @@ -135,36 +150,41 @@ def formatTensor(name: str, tensor: torch.Tensor): -def generateTestPattern(op, function, inputs, additional_params=""): - result = function(*[value for name, value in inputs]) +def generate_test_pattern(op, function, inputs): + actual_inputs = [value for name, value in inputs if value is not None] + result = function(*actual_inputs) tensor_constructors = [] tensor_destructors = [] for name, value in inputs: + if type(value) == str: + pass + if type(value) == torch.Tensor and name != "actual": dim = len(value.shape) shape = ", ".join([str(s) for s in value.shape]) - dtype = typeToStr(value.dtype) - data = ", ".join([str(b) for b in value.contiguous().numpy().flatten().tobytes()]) + dtype = type_to_str(value.dtype) + data_np = value.cpu().contiguous().numpy() + data = ", ".join([str(b) for b in data_np.flatten().tobytes()]) - human_readable = str(value.contiguous().numpy()).replace("\n", " ")[:80] - tensor_str = formatTensor(name, value) + human_readable = str(data_np).replace("\n", " ")[:80] + tensor_str = format_tensor(name, value) tensor_constructors.append(tensor_str) - tensor_destructors.append(" NN_deleteTensor({});\n".format(name)) + tensor_destructors.append(" NN_delete_tensor({});\n".format(name)) elif type(value) == float: tensor_str = env.from_string("float {{ name }} = {{ value }};").render(name=name, value=value) tensor_constructors.append(tensor_str) - golden_str = formatTensor("golden", result) + golden_str = format_tensor("golden", result) dim = len(result.shape) shape = ", ".join([str(s) for s in result.shape]) - dtype = typeToStr(result.dtype) + dtype = type_to_str(result.dtype) result_tensors = golden_str + env.from_string(""" Tensor *actual = NN_zeros({{ dim }}, (size_t[]){ {{ shape }} }, {{ dtype }});""").render( @@ -174,9 +194,13 @@ def generateTestPattern(op, function, inputs, additional_params=""): inputs = ", ".join([name for name, value in inputs if name != "actual"]) inputs = ", " + inputs if inputs else inputs - func_str = env.from_string(""" NN_{{ op }}(actual{{ inputs }}{{ additional_params 
}});\n""").render( - op=op, inputs=inputs, additional_params=additional_params + func_str = env.from_string(""" NN_{{ op }}(actual{{ inputs }});\n""").render( + op=op, inputs=inputs ) + + precision = "1e-4" + if result.dtype == torch.float16: + precision = "1e-2" test_template = env.from_string(""" { @@ -185,26 +209,26 @@ def generateTestPattern(op, function, inputs, additional_params=""): {{ result_tensors }} - cycles = readCycles(); + cycles = read_cycles(); {{ func_str }} - cycles = readCycles() - cycles; - printf("%s (%lu cycles)\\n", compareTensor(golden, actual, 1e-3) ? "PASS" : "FAIL", cycles); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\\n", compare_tensor(golden, actual, {{ precision }}) ? "PASS" : "FAIL", cycles); {% for tensor_str in tensor_destructors %}{{ tensor_str }}{% endfor %} - NN_deleteTensor(golden); - NN_freeTensorData(actual); - NN_deleteTensor(actual); + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); } """) - return test_template.render(op=op, tensor_constructors=tensor_constructors, tensor_destructors=tensor_destructors, result_tensors=result_tensors, func_str=func_str) + return test_template.render(op=op, tensor_constructors=tensor_constructors, tensor_destructors=tensor_destructors, result_tensors=result_tensors, func_str=func_str, precision=precision) template = env.from_string(c_code) -result = template.render(code="\n".join([generateTestPattern(*pattern) for pattern in test_pattern])) +result = template.render(code="\n".join([generate_test_pattern(*pattern) for pattern in test_pattern])) with open("generated.c", "w") as f: f.write(result) diff --git a/tests/src/generated.c b/tests/src/generated.c index 5143173..4433723 100644 --- a/tests/src/generated.c +++ b/tests/src/generated.c @@ -8,87 +8,162 @@ #include "unittest.h" int main() { - enableAcceleratorFeatures(); + enable_accelerator_features(); size_t cycles = 0; + { + printf("abs: "); + + // [[-0.332 -0.454 0.6143 0.1875 -0.1846 -0.255 0.6904]] + Tensor *a = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0x50,0xb5,0x44,0xb7,0xea,0x38,0x0,0x32,0xe8,0xb1,0x14,0xb4,0x86,0x39 }); + + + // [[0.332 0.454 0.6143 0.1875 0.1846 0.255 0.6904]] + Tensor *golden = NN_tensor(2, (size_t[]){ 1, 7 }, DTYPE_F16, (uint8_t[]){ 0x50,0x35,0x44,0x37,0xea,0x38,0x0,0x32,0xe8,0x31,0x14,0x34,0x86,0x39 }); + Tensor *actual = NN_zeros(2, (size_t[]){ 1, 7 }, DTYPE_F16); + + cycles = read_cycles(); + NN_abs(actual, a); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(a); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + { printf("add: "); - // [[-1.66 -2.27 3.07 0.9375 -0.923 -1.274 3.453 ] [ 1.753 0. - Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa4,0xbe,0x8a,0xc0,0x24,0x42,0x80,0x3b,0x62,0xbb,0x19,0xbd,0xe8,0x42,0x3,0x3f,0xa0,0x29,0x10,0x43,0x4c,0xc3,0x5d,0x43,0xda,0x43,0x26,0xc0,0x78,0x35,0x9c,0x40,0x24,0xc0,0xbc,0x3f,0xef,0xc2,0x3b,0xbe,0xbf,0x40,0x9e,0xc0,0x93,0xc4,0x26,0xc4,0xdc,0x41,0xd6,0xba,0x9c,0x42,0xc2,0xc0,0xfe,0xba,0xa6,0xc4,0xd3,0xbc,0x79,0x44,0x70,0xc4,0x80,0xb3,0x41,0xbd,0xec,0x42,0xfe,0x44,0x23,0x44,0x36,0x40,0xa0,0xc1,0xcc,0xba,0x84,0xc4 }); - // [[ 4.805 0.8643 -1.313 -1.108 -1.782 0.718 -0.5566] [-2.934 1.406 4. 
- Tensor *b = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xce,0x44,0xea,0x3a,0x41,0xbd,0x6f,0xbc,0x21,0xbf,0xbe,0x39,0x74,0xb8,0xde,0xc1,0xa0,0x3d,0xb1,0x44,0x2d,0x3d,0x63,0xbe,0x85,0xbf,0xb0,0xc0,0xb2,0xc0,0xe,0xbe,0x9e,0xc3,0x43,0xc4,0xd1,0xbe,0x5,0xc1,0x7c,0x40,0x8c,0xb9,0x26,0x3b,0xdd,0xc4,0x56,0xbc,0x60,0xc4,0x60,0xc4,0x8d,0x44,0x24,0x34,0x66,0x44,0x5f,0x41,0xc4,0x40,0x3c,0xbd,0x56,0xc3,0xba,0xc2,0x36,0xc2,0x6,0x42,0x90,0x32,0x51,0x40,0xb2,0xc0,0x50,0xb9,0x7e,0xc3 }); + // [[ 0.3506 0.00879 0.706 -0.7295 0.7363 0.785 -0.415 ] [ 0.06836 0. + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x9c,0x35,0x80,0x20,0xa6,0x39,0xd6,0xb9,0xe4,0x39,0x48,0x3a,0xa4,0xb6,0x60,0x2c,0x60,0x37,0xa0,0xb6,0x30,0x36,0x8c,0xb9,0xfc,0xb4,0x98,0x37,0x64,0xb7,0x52,0xbb,0xa4,0xba,0xb0,0x38,0x78,0xb1,0x4a,0x39,0x9c,0xb7,0x98,0xb1,0x70,0xbb,0xb8,0xb3,0x28,0x3b,0x1a,0xbb,0x0,0xaa,0x34,0xb4,0x8a,0x39,0xfc,0x3b,0x9e,0x3a,0xbc,0x36,0x80,0xb8,0x70,0xb1,0x3a,0xbb,0xb0,0x3b,0x88,0x31,0x34,0xb4,0x18,0xb3,0xb4,0xb5,0x98,0x30,0x20,0xaf }); + // [[-0.587 0.2812 0.9385 0.2588 -0.3193 -0.376 -0.4688 ] [-0.4697 -0. + Tensor *b = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xb2,0xb8,0x80,0x34,0x82,0x3b,0x24,0x34,0x1c,0xb5,0x4,0xb6,0x80,0xb7,0x84,0xb7,0xd8,0xb4,0x18,0xba,0xd2,0xba,0x74,0xb5,0x4,0xb8,0x2c,0x37,0x70,0xb0,0xb8,0x31,0xc8,0xbb,0xf0,0xb2,0x0,0xbb,0x0,0xbb,0x48,0x3b,0xa0,0x2a,0xa,0x3b,0x4c,0x38,0xa0,0x37,0x30,0xb4,0xde,0xb9,0x62,0xb9,0xf8,0xb8,0xd2,0x38,0x40,0x29,0xe8,0x36,0x84,0xb7,0x40,0xb0,0xfe,0xb9,0x4,0x35,0xce,0x3b,0x52,0x3b,0x8e,0xbb,0xba,0xb9,0xcc,0x3a,0x70,0x2f }); - // [[ 3.145 -1.405 1.757 -0.1709 -2.705 -0.5566 2.896 ] [-1.181 1.45 8. - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x4a,0x42,0x9f,0xbd,0x7,0x3f,0x78,0xb1,0x69,0xc1,0x74,0xb8,0xcb,0x41,0xb9,0xbc,0xcd,0x3d,0x1c,0x48,0xb6,0xc0,0x2c,0x40,0x18,0x40,0x6b,0xc4,0x3,0xc0,0x54,0x3a,0xe1,0xc5,0xa8,0xc0,0x2c,0xc5,0x11,0xc4,0x9e,0x44,0x1,0xc2,0x5c,0xc3,0x82,0xc8,0x62,0x3f,0x3b,0xc5,0x48,0xbc,0x58,0x40,0xec,0xb8,0x0,0xb4,0xeb,0x3d,0xdb,0x46,0xbf,0xc5,0xce,0xc3,0xad,0xc4,0xb0,0x35,0x0,0x48,0x58,0x44,0x44,0x44,0x29,0xc5,0xe,0xbe,0x22,0xc8 }); + // [[-0.2363 0.29 1.645 -0.4707 0.417 0.4092 -0.884 ] [-0.40 + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x90,0xb3,0xa4,0x34,0x94,0x3e,0x88,0xb7,0xac,0x36,0x8c,0x36,0x12,0xbb,0x6c,0xb6,0x10,0x31,0xb4,0xbc,0x74,0xb7,0x23,0xbc,0x82,0xba,0x62,0x3b,0xce,0xb8,0xe4,0xb9,0x36,0xbf,0xe8,0x35,0x2f,0xbc,0xd8,0xb2,0xf4,0x36,0xe0,0xaf,0x60,0xaa,0xbc,0x34,0x7c,0x3d,0x99,0xbc,0x3e,0xba,0x7c,0xbb,0x90,0x2c,0x67,0x3e,0xf2,0x3a,0xd2,0x3a,0x21,0xbc,0xd8,0xb4,0x9c,0xbe,0x19,0x3d,0x98,0x3c,0x38,0x39,0xaa,0xbc,0x4a,0xbc,0xf2,0x3b,0x0,0x1d }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 7 }, DTYPE_F16); - cycles = readCycles(); + cycles = read_cycles(); NN_add(actual, a, b); - cycles = readCycles() - cycles; - printf("%s (%lu cycles)\n", compareTensor(golden, actual, 1e-3) ? "PASS" : "FAIL", cycles); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles); - NN_deleteTensor(a); - NN_deleteTensor(b); + NN_delete_tensor(a); + NN_delete_tensor(b); - NN_deleteTensor(golden); - NN_freeTensorData(actual); - NN_deleteTensor(actual); + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); } { - printf("matmulT: "); + printf("matmul_t: "); - // [[ 1.567 4.88 4.574 -4.723 -3.578 4.25 0.581 ] [-1.196 -2.9 0. 
- Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x45,0x3e,0xe1,0x44,0x93,0x44,0xb9,0xc4,0x28,0xc3,0x40,0x44,0xa6,0x38,0xc9,0xbc,0xcd,0xc1,0x68,0x36,0x8e,0xc4,0x60,0x44,0x1d,0xc2,0x8,0xc0,0x84,0x41,0x88,0xb8,0xa3,0xbf,0x94,0x44,0x97,0xc4,0x2,0xc4,0xd6,0xc2,0xe2,0xbc,0xb0,0xb4,0x58,0x33,0x85,0xbf,0xf1,0x44,0x1e,0x39,0x10,0xb4,0x18,0xba,0x8,0x3b,0xe2,0x44,0x22,0x44,0x96,0x39,0x72,0x42,0x6f,0x3c,0x78,0xb5,0x8d,0xbc,0xf5,0xbd,0x7e,0xb8,0x5c,0xc2,0x2a,0x44,0x88,0xc0 }); - // [[ 1.489 2.836 -4.176 3.297 2.188 0.4639 0.962 ] [ 2.959 -3.75 4. - Tensor *b = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0xf5,0x3d,0xac,0x41,0x2d,0xc4,0x98,0x42,0x60,0x40,0x6c,0x37,0xb2,0x3b,0xeb,0x41,0x80,0xc3,0x4d,0x44,0xbd,0x42,0xb4,0x3d,0x8a,0x44,0x10,0xbc,0x34,0x44,0x38,0xc2,0xf5,0x41,0x69,0x41,0xf8,0xb3,0x68,0xc2,0xe1,0x44,0x2e,0xb8,0x5,0xc4,0x42,0xc2,0x0,0x41,0x6c,0x44,0x63,0xc2,0x94,0xc0,0x78,0xb5,0x7b,0x3f,0xe3,0xc4,0xd4,0x42,0x5e,0xc4,0x52,0xc2,0x86,0xc4 }); + // [[-0.2393 -0.58 0.0801 -0.911 0.875 -0.6113 -0.4033 ] [ 0.552 -0. + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa8,0xb3,0xa4,0xb8,0x20,0x2d,0x4a,0xbb,0x0,0x3b,0xe4,0xb8,0x74,0xb6,0x6a,0x38,0x40,0xaf,0x1c,0xb6,0x54,0x3b,0x58,0xbb,0x6a,0xba,0x78,0xb9,0xd0,0xb3,0x80,0xab,0xe0,0x29,0x4,0xb6,0xe8,0x3b,0x18,0x30,0x80,0xaa,0xe0,0xb0,0xa0,0x31,0xd0,0x3b,0x9c,0x3a,0x78,0x30,0x28,0x39,0x18,0x33,0x60,0xac,0x48,0xb3,0xc4,0xb4,0x30,0xaf,0x16,0xb9,0xaa,0x3a,0x40,0xb7,0xc4,0x34,0x8a,0x38,0xae,0xba,0x46,0x39,0x0,0x37,0xf0,0x2d,0x28,0x32 }); + // [[ 0.592 -0.75 0.8604 0.674 0.2852 0.908 -0.2031 ] [ 0.841 -0. + Tensor *b = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0xbc,0x38,0x0,0xba,0xe2,0x3a,0x64,0x39,0x90,0x34,0x44,0x3b,0x80,0xb2,0xba,0x3a,0xfa,0xb8,0xc4,0x38,0x54,0x38,0x60,0xaa,0x20,0xb9,0xce,0x3b,0xb0,0xae,0x6e,0xba,0x2,0xb9,0x0,0x38,0x14,0x3b,0x1c,0xb9,0x54,0xb7,0x60,0xac,0xfc,0x35,0xd2,0xbb,0x76,0x39,0xfc,0xba,0xe,0xb9,0x3c,0xbb,0x7c,0x38,0xd0,0xad,0x88,0xb4,0xa,0xbb,0x86,0x3b,0xb8,0xb6,0x5c,0x3b }); - // [[-23.8 3.703 -17.62 -77.25 -30.33 ] [-20.48 -11.875 -8.27 33.38 -22 - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0xf3,0xcd,0x68,0x43,0x68,0xcc,0xd4,0xd4,0x95,0xcf,0x1f,0xcd,0xf0,0xc9,0x23,0xc8,0x2c,0x50,0xb6,0xcd,0x31,0x49,0x89,0xc3,0x57,0x4c,0xa5,0x4c,0x72,0x54,0x3a,0x3c,0xb1,0x40,0x92,0xca,0x32,0x4c,0x88,0xcf,0x45,0xbd,0x7c,0x51,0x5a,0x4b,0x75,0xcc,0x86,0xce,0x92,0xc6,0xd0,0x49,0xdd,0xce,0x15,0xcb,0x2a,0x4b }); + // [[-0.4753 -0.331 1.335 -0.9136 1.406 ] [-0.1509 0.695 0.743 2. + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x9b,0xb7,0x4c,0xb5,0x57,0x3d,0x4f,0xbb,0xa0,0x3d,0xd4,0xb0,0x8f,0x39,0xf2,0x39,0xb0,0x41,0x23,0xbe,0x3,0x2e,0x34,0xb8,0x60,0x39,0xd1,0xbc,0x10,0x3c,0x5,0x3f,0xb4,0x38,0xb5,0xb9,0x2d,0xbc,0x29,0xbc,0x73,0x37,0x65,0xbc,0x89,0xb8,0x95,0x38,0xcc,0xbc,0xa8,0xb5,0x61,0xb0,0xd7,0x38,0xbf,0x3a,0x24,0x35 }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); - cycles = readCycles(); - NN_matmulT(actual, a, b); - cycles = readCycles() - cycles; - printf("%s (%lu cycles)\n", compareTensor(golden, actual, 1e-3) ? "PASS" : "FAIL", cycles); + cycles = read_cycles(); + NN_matmul_t(actual, a, b); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? 
"PASS" : "FAIL", cycles); - NN_deleteTensor(a); - NN_deleteTensor(b); + NN_delete_tensor(a); + NN_delete_tensor(b); - NN_deleteTensor(golden); - NN_freeTensorData(actual); - NN_deleteTensor(actual); + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); } { printf("matmul: "); - // [[ 2.803 -0.454 -1.416 -4.4 4.703 -2.1 4.6 ] [-1.392 0.7227 -3. - Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x9b,0x41,0x44,0xb7,0xaa,0xbd,0x66,0xc4,0xb4,0x44,0x33,0xc0,0x9a,0x44,0x91,0xbd,0xc8,0x39,0xf6,0xc3,0x83,0xc0,0x4,0x40,0xba,0xbc,0x68,0x32,0x88,0x34,0x3b,0xbe,0xa0,0x31,0xce,0xbc,0x94,0xc4,0xb8,0x32,0xf0,0xb9,0xf2,0xc4,0xf6,0xbc,0x0,0xc3,0x40,0x36,0xd8,0xc4,0x1c,0x3b,0x0,0x39,0x42,0xbc,0x50,0xc3,0xcc,0x3a,0x69,0x41,0x3f,0x3f,0x30,0x3b,0xaa,0xc1,0x3c,0xc1,0xc0,0x41,0x35,0xc3,0x78,0x3d,0x8f,0xbf,0xf6,0xbc,0x8,0x41 }); - // [[-4.54 1.104 4.273 1.313 4.297 ] [-3.555 1.602 0.0928 -2. - Tensor *b = NN_tensor(2, (size_t[]){ 7, 5 }, DTYPE_F16, (uint8_t[]){ 0x8a,0xc4,0x6a,0x3c,0x46,0x44,0x41,0x3d,0x4c,0x44,0x1c,0xc3,0x68,0x3e,0xf0,0x2d,0x12,0xc0,0x1a,0x44,0x1,0x40,0x9b,0xc1,0xd8,0x3c,0x1f,0xc4,0xd8,0xc2,0x40,0xae,0xc,0x44,0xa3,0x43,0x38,0x44,0x14,0x3d,0xb4,0x44,0x5f,0xbd,0x52,0xc4,0x0,0xa5,0x29,0xc0,0x80,0x40,0x46,0xb9,0x10,0x42,0x12,0xbb,0x68,0xbe,0x28,0x41,0xf0,0x39,0xbc,0xb7,0x46,0xc4,0x4c,0x38 }); + // [[-0.2783 0.1445 -0.796 -0.4512 0.4014 -0.2363 0.04004] [ 0.05664 -0. + Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0x74,0xb4,0xa0,0x30,0x5e,0xba,0x38,0xb7,0x6c,0x36,0x90,0xb3,0x20,0x29,0x40,0x2b,0xfc,0xb4,0x80,0x28,0xb0,0xb3,0x54,0xbb,0x60,0x29,0xc0,0xb0,0xea,0xbb,0xf0,0xb3,0x9a,0xb9,0x0,0x2d,0xc0,0xbb,0xb0,0x31,0x0,0x30,0xd0,0xb2,0xda,0xb9,0x70,0x31,0x54,0x38,0xcc,0x35,0xc0,0x31,0x88,0xb8,0x30,0xb8,0x9a,0x38,0xc4,0xb9,0x60,0x34,0xc,0xb6,0xf0,0xb3,0x6,0x38,0x44,0xbb,0x10,0x33,0xd6,0x3a,0x34,0x34,0xe0,0x3a,0xb0,0xb9,0x20,0x35 }); + // [[ 0.01855 -0.4072 0.8203 0.4004 -0.5605 ] [ 0.2422 -0.824 -0.68 + Tensor *b = NN_tensor(2, (size_t[]){ 7, 5 }, DTYPE_F16, (uint8_t[]){ 0xc0,0x24,0x84,0xb6,0x90,0x3a,0x68,0x36,0x7c,0xb8,0xc0,0x33,0x98,0xba,0x7a,0xb9,0x0,0xa5,0x7a,0x3a,0x1c,0x3a,0xc0,0x3a,0x10,0x34,0x86,0x3b,0x4c,0xb4,0xea,0xba,0x0,0x9c,0xa8,0xb6,0x34,0x37,0x38,0xb0,0xda,0x38,0xa8,0xb1,0x20,0xb5,0x20,0x38,0xc0,0x30,0x30,0xae,0xd6,0xba,0xe0,0x2e,0xce,0xbb,0x40,0x37,0x40,0xac,0xf2,0x3a,0x50,0xb0,0x50,0x2f,0xf4,0x36 }); - // [[ 15.74 -12.984 -35.47 -26.02 5.492 ] [ 3.299 -0.1198 -31.67 3 - Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0xdf,0x4b,0x7e,0xca,0x6f,0xd0,0x81,0xce,0x7e,0x45,0x99,0x42,0xab,0xaf,0xeb,0xcf,0x7a,0x43,0x91,0x45,0x90,0xcc,0x28,0xc0,0x5e,0x4c,0x9b,0x3a,0xf6,0x3d,0x20,0x39,0x2a,0x49,0x6b,0xb9,0x5f,0x48,0xe0,0xc4,0xa0,0x4d,0x2f,0xc3,0x79,0x41,0x4f,0x4e,0x71,0xce,0x71,0xc9,0xa5,0x4d,0xdf,0xc6,0xd9,0x3e,0x78,0x4d }); + // [[ 0.07556 -0.51 -0.501 -0.6235 0.516 ] [-0.3896 0.2615 0.6865 -0. + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0xd6,0x2c,0x14,0xb8,0x2,0xb8,0xfd,0xb8,0x21,0x38,0x3c,0xb6,0x2f,0x34,0x7e,0x39,0x9d,0xb8,0x17,0xb7,0x2d,0xbd,0x9c,0x30,0x51,0xb8,0xb1,0xbe,0x2e,0x38,0x77,0xb4,0xc7,0x2f,0xeb,0x2f,0x78,0x34,0x9b,0xb9,0x2a,0xbb,0xf3,0xb0,0x5f,0xbc,0x5e,0xb9,0xbc,0x3b,0x1f,0x3c,0x91,0x3e,0xbc,0xbc,0xe9,0x3e,0xe4,0x35 }); Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); - cycles = readCycles(); + cycles = read_cycles(); NN_matmul(actual, a, b); - cycles = readCycles() - cycles; - printf("%s (%lu cycles)\n", compareTensor(golden, actual, 1e-3) ? 
"PASS" : "FAIL", cycles); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(a); + NN_delete_tensor(b); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + + { + printf("linear: "); + + // [[ 0.581 -0.3428 0.8516 -0.09375 0.2227 0.495 -0.255 ] [-0.882 0. + Tensor *x = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F16, (uint8_t[]){ 0xa6,0x38,0x7c,0xb5,0xd0,0x3a,0x0,0xae,0x20,0x33,0xec,0x37,0x14,0xb4,0xe,0xbb,0x18,0x35,0x56,0xb8,0xc2,0xb9,0xcc,0xb7,0x8e,0x3a,0x70,0x39,0xc0,0x30,0xc4,0xb6,0xae,0xbb,0xdc,0xb9,0x50,0x38,0xd8,0x36,0x62,0xba,0xa8,0xb3,0xb4,0xba,0x22,0xbb,0x74,0x35,0xf2,0x3b,0x8,0x30,0x84,0xb4,0xec,0x3a,0xa0,0xb2,0x50,0xba,0xf4,0x35,0xd0,0xbb,0x7a,0xbb,0xc0,0xa9,0x40,0xa4,0x6,0x3a,0x2c,0xba,0x98,0x39,0xcc,0x3b,0xba,0xb9,0xac,0x39 }); + // [[-0.001953 -0.3154 0.6924 -0.29 -0.458 -0.9688 0.01074 ] [ 0.60 + Tensor *w = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F16, (uint8_t[]){ 0x0,0x98,0xc,0xb5,0x8a,0x39,0xa4,0xb4,0x54,0xb7,0xc0,0xbb,0x80,0x21,0xd0,0x38,0x58,0xba,0x50,0x30,0x6c,0x39,0xd6,0x3b,0x94,0x36,0xfa,0x38,0x38,0x39,0x44,0xb6,0xa0,0x37,0xf0,0xb3,0x36,0x38,0xa6,0x3b,0xc8,0x32,0xa4,0xb5,0x58,0xb2,0x9a,0xb9,0x70,0x31,0xa0,0x30,0xb0,0xac,0x3a,0x38,0x86,0x38,0x74,0xb7,0xc8,0xb2,0x24,0x36,0x6e,0xbb,0xae,0xb9,0x90,0xae }); + // [[ 0.878 0.795 -0.287 0.924 -0.1445]] + Tensor *b = NN_tensor(2, (size_t[]){ 1, 5 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3b,0x5c,0x3a,0x98,0xb4,0x64,0x3b,0xa0,0xb0 }); + + + // [[ 1.018 1.73 1.191 0.036 -0.405 ] [ 0.04947 -0.2664 -0.396 1. + Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F16, (uint8_t[]){ 0x12,0x3c,0xec,0x3e,0xc4,0x3c,0x9c,0x28,0x7b,0xb6,0x55,0x2a,0x43,0xb4,0x56,0xb6,0xa1,0x3e,0x9b,0xbc,0x1b,0xaf,0x69,0x3a,0x24,0x33,0x83,0x3c,0x4f,0xb9,0xe9,0xb0,0x8d,0x40,0x43,0xa9,0x5e,0x3f,0x6c,0xb8,0x8f,0x3e,0x1b,0x34,0x1c,0xbe,0xb7,0x3c,0xa7,0x40,0x10,0x31,0xa3,0x3e,0x9f,0xbc,0x4,0x40,0x57,0xb8 }); + Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F16); + + cycles = read_cycles(); + NN_linear(actual, x, w, b); + cycles = read_cycles() - cycles; + printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles); + + NN_delete_tensor(x); + NN_delete_tensor(w); + NN_delete_tensor(b); + + NN_delete_tensor(golden); + NN_free_tensor_data(actual); + NN_delete_tensor(actual); + } + + { + printf("relu: "); + + // [[ 0.753 0.7803 -0.2412 0.3848 0.5264 -0.581 -0.8057 ] [ 0.413 0. + Tensor *x = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3a,0x3e,0x3a,0xb8,0xb3,0x28,0x36,0x36,0x38,0xa6,0xb8,0x72,0xba,0x9c,0x36,0xa0,0x2b,0x56,0xb9,0xf8,0xb0,0x80,0x29,0xf0,0xb2,0x44,0x3b,0x80,0x2d,0xbe,0xbb,0xec,0xb5,0x30,0xba,0x4,0xb4,0xb8,0xbb,0x0,0x3a,0x2c,0x37,0xbc,0xb9,0x76,0xba,0x10,0xb0,0x6a,0x3b,0xf0,0x33,0x0,0x2d,0xf8,0x36,0xf0,0xba,0x86,0xba,0xfe,0xba,0x5e,0xba,0xc0,0x35,0xba,0xb9,0xe0,0x35,0x6c,0x35,0xb0,0xba,0xe8,0x3a,0xe0,0xb2,0xa0,0xba,0x32,0x38,0x7a,0xb9,0xce,0x3a,0x8,0xb8,0xe8,0xb2,0x20,0x2f,0x3c,0x3a,0x40,0x27 }); + + + // [[0.753 0.7803 0. 0.3848 0.5264 0. 0. ] [0.413 0.05957 0. 
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F16, (uint8_t[]){ 0x6,0x3a,0x3e,0x3a,0x0,0x0,0x28,0x36,0x36,0x38,0x0,0x0,0x0,0x0,0x9c,0x36,0xa0,0x2b,0x0,0x0,0x0,0x0,0x80,0x29,0x0,0x0,0x44,0x3b,0x80,0x2d,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3a,0x2c,0x37,0x0,0x0,0x0,0x0,0x0,0x0,0x6a,0x3b,0xf0,0x33,0x0,0x2d,0xf8,0x36,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xc0,0x35,0x0,0x0,0xe0,0x35,0x6c,0x35,0x0,0x0,0xe8,0x3a,0x0,0x0,0x0,0x0,0x32,0x38,0x0,0x0,0xce,0x3a,0x0,0x0,0x0,0x0,0x20,0x2f,0x3c,0x3a,0x40,0x27 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F16);
+
+    cycles = read_cycles();
+    NN_relu(actual, x);
+    cycles = read_cycles() - cycles;
+    printf("%s (%lu cycles)\n", compare_tensor(golden, actual, 1e-2) ? "PASS" : "FAIL", cycles);
 
-    NN_deleteTensor(a);
-    NN_deleteTensor(b);
+    NN_delete_tensor(x);
 
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 }
\ No newline at end of file
diff --git a/tests/src/main_avx.c b/tests/src/main_avx.c
index 7a99ac0..d845a09 100644
--- a/tests/src/main_avx.c
+++ b/tests/src/main_avx.c
@@ -14,7 +14,7 @@ int main() {
 
   NN_sub(a, a, ones);
 
-  NN_abs_F32_AVX(a, a);
+  NN_abs_f32_AVX(a, a);
 
   NN_printf(a);
 
diff --git a/tests/src/main_fp16.c b/tests/src/main_fp16.c
index 3b1b414..dbfc6bb 100644
--- a/tests/src/main_fp16.c
+++ b/tests/src/main_fp16.c
@@ -34,10 +34,10 @@ uint8_t compareResult(float golden, float actual) {
   if (diff > 1e-2) {
     printf("FAILED ");
     printf("golden: ");
-    NN_printFloat(golden, 6);
+    NN_print_f32(golden, 6);
     printf("\n");
     printf("actual: ");
-    NN_printFloat(actual, 6);
+    NN_print_f32(actual, 6);
     printf("\n");
     return 1;
   }
@@ -51,8 +51,8 @@ int main() {
 
   float x = (float)(0x47ca9334);
 
-  float16_t x_compressed = NN_floatToHalf(x);
-  float x_decompressed = NN_halfToFloat(x_compressed);
+  float16_t x_compressed = NN_float_to_half(x);
+  float x_decompressed = NN_half_to_float(x_compressed);
 
   print_bits(x);
   print_bits_half(x_compressed);
diff --git a/tests/src/main_matlib.c b/tests/src/main_matlib.c
index 9f43b19..8c283c1 100644
--- a/tests/src/main_matlib.c
+++ b/tests/src/main_matlib.c
@@ -66,17 +66,17 @@ int main() {
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
-    NN_freeTensorData(f);
-    NN_deleteTensor(f);
-
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
+    NN_free_tensor_data(f);
+    NN_delete_tensor(f);
+
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // matvec
@@ -107,12 +107,12 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden_vect->data, actual_vect->data, N, 1) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(H);
-    NN_deleteTensor(H);
-    NN_freeTensorData(V);
-    NN_deleteTensor(V);
-    NN_freeTensorData(W);
-    NN_deleteTensor(W);
+    NN_free_tensor_data(H);
+    NN_delete_tensor(H);
+    NN_free_tensor_data(V);
+    NN_delete_tensor(V);
+    NN_free_tensor_data(W);
+    NN_delete_tensor(W);
   }
 
   // max and min
@@ -133,8 +133,8 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", float_eq(min_cpu, min_actual, 1e-6) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
   }
 
   // matmulf
@@ -150,12 +150,12 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(C->data, D->data, M, N) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(C);
-    NN_deleteTensor(C);
-    NN_freeTensorData(D);
-    NN_deleteTensor(D);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(C);
+    NN_delete_tensor(C);
+    NN_free_tensor_data(D);
+    NN_delete_tensor(D);
   }
 
   // matsub
@@ -172,15 +172,15 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // matadd
@@ -191,21 +191,21 @@ int main() {
     Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
     printf("matadd:\t\t");
-    NN_add_F32(golden, A, B);
+    NN_add_f32(golden, A, B);
     cycles = READ_CSR("mcycle");
-    NN_add_F32_RVV(actual, A, B);
+    NN_add_f32_RVV(actual, A, B);
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // matneg
@@ -221,13 +221,13 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // matcopy
@@ -242,19 +242,19 @@ int main() {
     Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
     printf("cwiseabs:\t");
-    NN_abs_F32(golden, A);
+    NN_abs_f32(golden, A);
     cycles = READ_CSR("mcycle");
-    NN_abs_F32_RVV(actual, A);
+    NN_abs_f32_RVV(actual, A);
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // cwisemin
@@ -271,15 +271,15 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // cwisemax
@@ -296,15 +296,15 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // cwisemul
@@ -321,15 +321,15 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
 
-    NN_freeTensorData(golden);
-    NN_deleteTensor(golden);
-    NN_freeTensorData(actual);
-    NN_deleteTensor(actual);
+    NN_free_tensor_data(golden);
+    NN_delete_tensor(golden);
+    NN_free_tensor_data(actual);
+    NN_delete_tensor(actual);
   }
 
   // matset
@@ -366,10 +366,10 @@ int main() {
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", (B->shape[0] == N && B->shape[1] == M) ? "PASS" : "FAIL", cycles);
 
-    NN_freeTensorData(A);
-    NN_deleteTensor(A);
-    NN_freeTensorData(B);
-    NN_deleteTensor(B);
+    NN_free_tensor_data(A);
+    NN_delete_tensor(A);
+    NN_free_tensor_data(B);
+    NN_delete_tensor(B);
   }
 
 
@@ -434,9 +434,9 @@ int main() {
 
     printf("conv2d:\t\t");
-    NN_Conv2d_F32(y_golden, x, w, b, (size_t[]){stride_height, stride_width}, (size_t[]){padding_height, padding_width}, 1);
+    NN_conv2d_F32(y_golden, x, w, b, (size_t[]){stride_height, stride_width}, (size_t[]){padding_height, padding_width}, 1);
     cycles = READ_CSR("mcycle");
-    NN_Conv2d_F32_RVV(y_actual, x, w, b, (size_t[]){stride_height, stride_width}, (size_t[]){padding_height, padding_width}, 1);
+    NN_conv2d_F32_RVV(y_actual, x, w, b, (size_t[]){stride_height, stride_width}, (size_t[]){padding_height, padding_width}, 1);
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s (%lu)\n", compare_2d(y_golden->data, y_actual->data, in_width - kernel_width + 1, in_height - kernel_height + 1) ? "PASS" : "FAIL", cycles);
 
@@ -449,12 +449,12 @@ int main() {
     printf("output:\n");
     NN_printf(y_golden);
 
-    NN_freeTensorData(x);
-    NN_deleteTensor(x);
-    NN_freeTensorData(w);
-    NN_deleteTensor(w);
-    NN_freeTensorData(b);
-    NN_deleteTensor(b);
+    NN_free_tensor_data(x);
+    NN_delete_tensor(x);
+    NN_free_tensor_data(w);
+    NN_delete_tensor(w);
+    NN_free_tensor_data(b);
+    NN_delete_tensor(b);
   }
 
 
@@ -499,7 +499,7 @@ int main() {
   //   printf("sum:\t\t");
   //   float sum_cpu = NN_sum_F32(A);
-  //   NN_printFloat(sum_cpu, 4);
+  //   NN_print_f32(sum_cpu, 4);
   //   printf("\n");
 
   // }
 
diff --git a/tests/src/unittest.h b/tests/src/unittest.h
index 28cf0fe..fbcc469 100644
--- a/tests/src/unittest.h
+++ b/tests/src/unittest.h
@@ -15,7 +15,7 @@
 #include "riscv_vector.h"
 #endif
 
-static void enableAcceleratorFeatures() {
+static void enable_accelerator_features() {
 #ifdef RVV
   // enable vector operation
   unsigned long mstatus;
@@ -25,7 +25,7 @@ static void enableAcceleratorFeatures() {
 #endif
 }
 
-static size_t readCycles() {
+static size_t read_cycles() {
 #ifdef X86
   return __rdtsc();
 #elif defined(RISCV)
@@ -33,22 +33,22 @@
 #endif
 }
 
-static uint8_t floatEqual(float golden, float actual, float rel_err) {
+static uint8_t float_equal(float golden, float actual, float rel_err) {
   return (fabs(actual - golden) < rel_err) || (fabs((actual - golden) / actual) < rel_err);
 }
 
-static uint8_t compareTensor(Tensor *golden, Tensor *actual, float rel_err) {
+static uint8_t compare_tensor(Tensor *golden, Tensor *actual, float rel_err) {
   switch (golden->dtype) {
     case DTYPE_F16:
       for (size_t i = 0; i < golden->size; i += 1) {
-        if (!floatEqual(NN_halfToFloat(((float16_t *)golden->data)[i]), NN_halfToFloat(((float16_t *)actual->data)[i]), rel_err)) {
+        if (!float_equal(NN_half_to_float(((float16_t *)golden->data)[i]), NN_half_to_float(((float16_t *)actual->data)[i]), rel_err)) {
           return 0;
         }
       }
       return 1;
     case DTYPE_F32:
       for (size_t i = 0; i < golden->size; i += 1) {
-        if (!floatEqual(((float *)golden->data)[i], ((float *)actual->data)[i], rel_err)) {
+        if (!float_equal(((float *)golden->data)[i], ((float *)actual->data)[i], rel_err)) {
           return 0;
         }
       }