diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
index 918474f..71e6078 100644
--- a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
+++ b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
@@ -1,10 +1,13 @@
 #include "conv2d_float.h"
 #include <assert.h>
+#include <stdint.h>
+#include <string.h>
 
 int xc_fc_float_ref(float *outputs, float *inputs, float *kernels,
-                    int out_features, int input_features) {
+                    int out_features, int input_features, int out_f_start,
+                    int out_f_end) {
   int cnt = 0;
-  for (int f = 0; f < out_features; f++) {
+  for (int f = out_f_start; f < out_f_end; f++) {
     int output_index = f;
     float acc = 0;
     for (int kf = 0; kf < input_features; kf++) {
@@ -202,6 +205,7 @@ int xc_transpose_conv2d_float_kw5xh2_stride_h3_ref(
     for (int d = 0; d < out_depth; d++) {
       for (int kx = 0; kx < KW; kx++) {
         for (int ky = 0; ky < KH; ky++) {
+          if (y + ky != 1) continue; // Only compute if it is the middle frame
           int output_index =
               ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
           float acc = outputs[output_index];
@@ -351,6 +355,340 @@ void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
             outputs[output_index] = acc;
           }
         }
+      } else if ((input_depth & 0x3) == 0) { // All cases before here to be deleted.
+        for (int kx = 0; kx < KW; kx++) {
+          for (int ky = 0; ky < KH; ky++) {
+            // Only compute if it is the middle frame
+            if (ky + y != 1) {
+              continue;
+            }
+            int output_index =
+                ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
+            float acc = outputs[output_index];
+            for (int kd = 0; kd < input_depth; kd+=4) {
+              int input_index = ((x)*input_h + (y)) * input_depth + kd;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+              float in1 = inputs[input_index];
+              float in2 = kernels[kernel_index];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+1];
+              in2 = kernels[kernel_index+1];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+2];
+              in2 = kernels[kernel_index+2];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+3];
+              in2 = kernels[kernel_index+3];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+            outputs[output_index] = acc;
+          }
+        }
+      } else {
+        assert(0);
+      }
+      }
+    }
+  }
+}
+#endif
+
+
+// Reference unpack of packed kernel `index`: each float is stored as its top
+// three bytes (sign, exponent, high mantissa bits).  The value is rebuilt one
+// byte early so the low mantissa byte picks up the preceding packed byte,
+// which only adds noise of about 2^-15 relative (and reads one byte below the
+// buffer when index is 0).
+float extract3_ref(float *kernels, int index) {
+  float x;
+  ((uint8_t *)&x)[0] = *(((uint8_t *)kernels) + index*3-1);
+  ((uint8_t *)&x)[1] = *(((uint8_t *)kernels) + index*3-1+1);
+  ((uint8_t *)&x)[2] = *(((uint8_t *)kernels) + index*3-1+2);
+  ((uint8_t *)&x)[3] = *(((uint8_t *)kernels) + index*3-1+3);
+// memcpy(&x, ((uint8_t *)kernels) + index*3-1, 4);
+  return x;
+}
+
+// Unpack via an lextract sliding window.  The caller must declare
+// float f0, f1, f2 and present `index` in increasing order so that the
+// words loaded in the earlier cases stay live for the later ones.
+#define extract3(fout, kernels, index) \
+  { \
+    switch(index & 3) { \
+    default: \
+      f0 = kernels[(index>>2)*3]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f0), "r" (f0), "r" (24)); \
+      break; \
+    case 1: \
+      f1 = kernels[(index>>2)*3+1]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f1), "r" (f0), "r" (16)); \
+      break; \
+    case 2: \
+      f2 = kernels[(index>>2)*3+2]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f2), "r" (f1), "r" (8)); \
+      break; \
+    case 3: \
+      fout = f2; \
+      break; \
+    } \
+}
+
+
+#ifndef NN_USE_REF
+void xc_conv2d_float_kw5xh2_stride_w3_packed_ref(float *outputs, float *inputs,
+                                                 float *kernels, float *biases,
+                                                 int out_w, int out_h, int out_depth,
+                                                 int input_w, int input_h,
+                                                 int input_depth, int out_depth_start,
+                                                 int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        float acc = biases[d];
+        if (input_depth == 1) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              int input_index =
+                  ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth;
+              float in1 = inputs[input_index];
+              float in2 = extract3_ref(kernels, kernel_index);
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+          }
+        } else if (input_depth == 2) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0; kd < 2; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 4) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll(full)
+              for (int kd = 0; kd < 4; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 8) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll(full)
+              for (int kd = 0; kd < 8; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 16) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll_count(8)
+              for (int kd = 0; kd < 16; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else {
+          assert(0);
+        }
+        outputs[output_index] = acc;
+      }
+    }
+  }
+}
+#endif
+
+#ifndef NN_USE_REF
+void xc_conv2d_float_kw5xh2_stride_w3_packed_opt(float *outputs, float *inputs,
+                                                 float *kernels, float *biases,
+                                                 int out_w, int out_h, int out_depth,
+                                                 int input_w, int input_h,
+                                                 int input_depth, int out_depth_start,
+                                                 int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        float f0, f1, f2;
+        int output_index = (x * out_h + y) * out_depth + d;
+        float acc = biases[d];
+        if (input_depth == 1) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              int input_index =
+                  ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth;
+              float in1 = inputs[input_index];
+              float in2 = extract3_ref(kernels, kernel_index);
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+          }
+        } else if (input_depth == 2) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0; kd < 2; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if ((input_depth & 3) == 0) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0, kd2 = 0; kd < input_depth; kd+=4, kd2 += 3) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth*3/4 + kd2;
+                // Four packed kernels occupy three words; the three loads
+                // below slide an 8-bit window with lextract to rebuild the
+                // four floats (the low mantissa byte is neighbouring noise).
+                float in1 = inputs[input_index];
+                float in2;
+                f0 = kernels[kernel_index];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+1];
+                f1 = kernels[kernel_index + 1];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+2];
+                f2 = kernels[kernel_index + 2];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+3];
+                in2 = f2;
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else {
+          assert(0);
+        }
+        outputs[output_index] = acc;
+      }
+    }
+  }
+}
+#endif
+
+
+#ifndef NN_USE_REF
+void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_ref(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        outputs[output_index] = biases[d];
+      }
+    }
+  }
+  for (int x = 0; x < input_w; x++) {
+    for (int y = 0; y < input_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        if ((input_depth & 0x3) == 0) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+              // Only compute if it is the middle frame
+              if (ky + y != 1) {
+                continue;
+              }
+              int output_index =
+                  ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
+              float acc = outputs[output_index];
+              for (int kd = 0; kd < input_depth; kd+=4) {
+                int input_index = ((x)*input_h + (y)) * input_depth + kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2;
+                in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+1];
+                in2 = extract3_ref(kernels, kernel_index+1);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+2];
+                in2 = extract3_ref(kernels, kernel_index+2);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+3];
+                in2 = extract3_ref(kernels, kernel_index+3);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+              outputs[output_index] = acc;
+            }
+          }
       } else {
         assert(0);
       }
@@ -359,3 +697,389 @@ void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
     }
   }
 }
 #endif
+
+extern void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end);
+
+#ifndef NN_USE_REF
+void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end) {
+  assert((input_depth & 0x3) == 0);
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        outputs[output_index] = biases[d];
+      }
+    }
+  }
+  int input_base = 0;
+  for (int x = 0; x < input_w; x++) {
+    for (int y = 0; y < input_h; y++) {
+      int ky = 1-y;
+      if (ky >= 0 && ky < KH) {
+        for (int kx = 0; kx < KW; kx++) {
+          int output_index =
+              ((x * H_TR_STRIDE + kx) * out_h + 1) * out_depth + out_depth_start;
+          for (int d = out_depth_start; d < out_depth_end; d++) {
+            float acc = outputs[output_index];
+            float f0 = 0, f1 = 0, f2 = 0;
+            int kernel_index = ((d * KW + kx) * KH + ky) * input_depth*3/4;
+            // Same three-words-per-four-kernels lextract unpacking as above.
+            for (int kd = 0; kd < input_depth; kd+=4, kernel_index += 3) {
+              int input_index = input_base + kd;
+              float in1 = inputs[input_index];
+              float in2 = 0;
+              f0 = kernels[kernel_index];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+1];
+              f1 = kernels[kernel_index + 1];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+2];
+              f2 = kernels[kernel_index + 2];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+3];
+              in2 = f2;
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+            outputs[output_index] = acc;
+            output_index++;
+          }
+        }
+      }
+      input_base += input_depth;
+    }
+  }
+}
+#endif
+
+
+
+int xc_fc_float_packed_ref(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features, int out_f_start,
+                           int out_f_end) {
+  int cnt = 0;
+  for (int f = out_f_start; f < out_f_end; f++) {
+    int output_index = f;
+    float acc = 0;
+    for (int kf = 0; kf < input_features; kf++) {
+      int input_index = kf;
+      int kernel_index = f * input_features + kf;
+      acc += inputs[input_index] * extract3_ref(kernels, kernel_index);
+      cnt++;
+    }
+    outputs[output_index] = acc;
+  }
+  return cnt;
+}
+
+#ifndef NN_USE_REF
+int xc_fc_float_packed_opt(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features, int out_f_start,
+                           int out_f_end) {
+  float f0, f1, f2;
+  for (int f = out_f_start; f < out_f_end; f++) {
+    int output_index = f;
+    float acc = 0;
+    assert(input_features == 96);
+    for (int kf = 0, kf2 = 0; kf < input_features; kf+=4, kf2 += 3) {
+      int input_index = kf;
+      int kernel_index = f * input_features*3/4 + kf2;
+      float in1 = inputs[input_index];
+      float in2 = 0;
+      f0 = kernels[kernel_index];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+1];
+      f1 = kernels[kernel_index + 1];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+2];
+      f2 = kernels[kernel_index + 2];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+3];
+      in2 = f2;
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+    }
+    outputs[output_index] = acc;
+  }
+  return 0;
+}
+
+#endif
+
+#ifdef LOCAL_MAIN
+
+static void pack_float(float *kernels, float *kernels_in, int num) {
+  // Keep only the top three bytes of each little-endian float.
+  for(int i = 0; i < num; i++) {
+    memcpy(((uint8_t *)kernels) + i*3, ((uint8_t * )&kernels_in[i])+1, 3);
+  }
+}
+
+#include <stdio.h>
+#include <math.h>
+
+int test_timing() {
+  int errors = 0;
+  float outputs0[64*3*16];
+  float outputs2[64*3*16];
+  float outputs3[64*3*16];
+  float inputs[31*2*32];
+  float kernels[16*3*2*32];
+  float kernels_packed[16*3*2*32*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  for(int i = 0; i < sizeof(outputs2) / sizeof(float); i++) {
+    outputs0[i] = 0;
+    outputs2[i] = 0;
+    outputs3[i] = 0;
+  }
+  int t0, t1, t2, t3, cnt;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
+      outputs2, inputs, kernels, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32,   // input w/h/d
+      0, 16);
+  asm volatile("gettime %0" : "=r" (t1));
+  cnt = xc_transpose_conv2d_float_kw5xh2_stride_h3_ref(
+      outputs0, inputs, kernels, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32    // input w/h/d
+      );
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt(
+      outputs3, inputs, kernels_packed, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32,   // input w/h/d
+      0, 16);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("TrConv NewOpt %d REF %d Old %d maccs %d\n", t3-t2, t2-t1, t1-t0, cnt);
+  for(int o = 0; o < sizeof(outputs2)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 1e-4 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs0[o]) / outputs2[o]) > 1e-4 ) {
+      printf("Expected idx %d %f got %f tr_conv_ref\n", o, outputs2[o], outputs0[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_tc() {
+  int errors = 0;
+  float outputs[6*4*32];
+  float outputs2[6*4*32];
+  float outputs3[6*4*32];
+  float inputs[2*2*64];
+  float kernels[32*3*2*64];
+  float kernels_packed[32*3*2*64*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  int t0, t1, t2, t3;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
+      outputs2, inputs, kernels, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t1));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_ref(
+      outputs, inputs, kernels_packed, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S(
+      outputs3, inputs, kernels_packed, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("TrConv NewOpt %d NewRef %d Old %d\n", t3-t2, t2-t1, t1-t0);
+  for(int o = 0; o < sizeof(outputs)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs[o]) / outputs2[o]) > 4e-5 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_ref\n", o, outputs2[o], outputs[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 4e-5 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_c() {
+  int errors = 0;
+  float outputs[3*1*32];
+  float outputs2[3*1*32];
+  float outputs3[3*1*32];
+  float inputs[8*2*16];
+  float kernels[16*3*2*32];
+  float kernels_packed[16*3*2*32*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(outputs) / sizeof(float); i++) {
+    outputs[i] = 0;
+    outputs2[i] = 0;
+    outputs3[i] = 0;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  int t0, t1, t2, t3;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_conv2d_float_kw5xh2_stride_w3_ref(
+      outputs2, inputs, kernels, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16    // input w/h/d
+      );
+  asm volatile("gettime %0" : "=r" (t1));
+  xc_conv2d_float_kw5xh2_stride_w3_packed_ref(
+      outputs, inputs, kernels_packed, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_conv2d_float_kw5xh2_stride_w3_packed_opt(
+      outputs3, inputs, kernels_packed, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("Conv2d NewOpt %d NewRef %d Old %d\n", t3-t2, t2-t1, t1-t0);
+  for(int o = 0; o < sizeof(outputs)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs[o]) / outputs2[o]) > 6e-5 ) {
+      printf("Expected idx %d %f got %f conv_packed_ref\n", o, outputs2[o], outputs[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 6e-5 ) {
+      printf("Expected idx %d %f got %f conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_fc(int opt) {
+  float outputs[4];
+  float expected_outputs[4];
+  float inputs[96];
+  float kernels2[96*3];
+  float kernels[96*4];
+  for(int i=0; i<96; i++) {
+    inputs[i] = i*i;
+  }
+  for(int i=0; i<4*96; i++) {
+    kernels[i] = i;
+  }
+  for(int o = 0; o < 4; o++) {
+    float e = 0;
+    for(int i=0; i<96; i++) {
+      e += i*i * (i+o*96);
+    }
+    expected_outputs[o] = e;
+  }
+  int t0, t1;
+  pack_float(kernels2, kernels, 96*4);
+  asm volatile("gettime %0" : "=r" (t0));
+  switch(opt) {
+  case 0:
+    xc_fc_float_ref(outputs, inputs, kernels, 10, 96, 0, 4);
+    break;
+  case 1:
+    xc_fc_float_opt(outputs, inputs, kernels, 10, 96, 0, 4);
+    break;
+  case 2:
+    xc_fc_float_packed_ref(outputs, inputs, kernels2, 10, 96, 0, 4);
+    break;
+  case 3:
+    xc_fc_float_packed_opt(outputs, inputs, kernels2, 10, 96, 0, 4);
+    break;
+  }
+  asm volatile("gettime %0" : "=r" (t1));
+  printf("%d ticks\n", t1-t0);
+  int errors = 0;
+  for(int o = 0; o < 4; o++) {
+    if (fabs((expected_outputs[o]-outputs[o]) / expected_outputs[o]) > 1e-5 ) {
+      printf("Expected idx %d %f got %f func %d\n", o, expected_outputs[o], outputs[o], opt);
+      errors++;
+    }
+  }
+  return errors;
+}
+
+int main(void) {
+  int errors = 0;
+  errors += test_tc();
+  return errors;
+  // The tests below are currently disabled.
+  errors += test_timing();
+  errors += test_c();
+  errors += test_fc(0);
+  errors += test_fc(1);
+  errors += test_fc(2);
+  errors += test_fc(3);
+  if (errors) {
+    printf("FAIL\n");
+  } else {
+    printf("PASS\n");
+  }
+  return 0;
+}
+#endif
diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
index 09b9a64..7a04c1f 100644
--- a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
+++ b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
@@ -19,7 +19,7 @@ extern "C" {
  * @returns number of MACCs
  */
 extern int xc_fc_float_ref(float *outputs, float *inputs, float *kernels,
-                           int out_features, int input_features);
+                           int out_features, int input_features, int out_f_start, int out_f_end);
 
 /** Optimized function that calculates a fully connected.
  *
@@ -148,6 +148,45 @@ extern void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
     int out_h, int out_depth, int input_w, int input_h, int input_depth,
     int out_depth_start, int out_depth_end);
 
+/** Function that calculates a fully connected layer. The kernels must be packed floats.
+ *
+ * @param outputs          pointer to the output data, the output data will
+ *                         be stored as an array [out_features]
+ * @param inputs           pointer to the input data, the input data must
+ *                         be stored as an array [input_features]
+ * @param kernels          pointer to the kernels, the kernels
+ *                         must be stored as an array
+ *                         [out_features][input_features]
+ * @param out_features     dimension 1 of the output array
+ * @param input_features   dimension 1 of the input array
+ * @param out_f_start      output features to start at
+ * @param out_f_end        output features to end at plus one
+ * @returns number of MACCs
+ */
+extern int xc_fc_float_packed_ref(float *outputs, float *inputs, float *kernels,
+                                  int out_features, int input_features, int out_f_start,
+                                  int out_f_end);
+
+/** Optimized function that calculates a fully connected layer. The kernels must be packed floats.
+ * + * @param outputs pointer to the output data, the output data will + * be stored as an array [out_features] + * @param inputs pointer to the input data, the input data must + * be stored as an array [input_features] + * @param kernels pointer to the kernels, the kernels + * must be stored as an array + * [out_features][input_features] + * @param out_features dimension 1 of the output array + * @param input_features dimension 1 of the input array + * @param out_f_start output features to start at + * @param out_f_end output features to end at plus one + * @returns number of MACCs + */ +extern int xc_fc_float_packed_opt(float *outputs, float *inputs, float *kernels, + int out_features, int input_features, int out_f_start, + int out_f_end); + + #ifdef __cplusplus }; #endif diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S b/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S new file mode 100644 index 0000000..17ef07d --- /dev/null +++ b/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S @@ -0,0 +1,613 @@ + + .section .cp.rodata.cst4,"aMc",@progbits,4 + .cc_top .LCPI10_0.data,.LCPI10_0 + .align 4 + .type .LCPI10_0,@object + .size .LCPI10_0, 4 +.LCPI10_0: + .long 4294967292 + .cc_bottom .LCPI10_0.data + .text + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S + .align 4 + .type xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S,@function + .cc_top xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.function,xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S +xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S: + .issue_mode dual + { + nop + dualentsp 30 + } + std r5, r4, sp[11] + { + nop + stw r6, sp[24] + } + { + nop + stw r7, sp[25] + } + { + nop + stw r8, sp[26] + } + { + nop + stw r9, sp[27] + } + { + nop + stw r10, sp[28] + } + { + nop + stw r2, sp[15] + } + { + nop + stw r1, sp[2] + } + { + nop + stw r0, sp[20] + } + { + nop + ldw r11, sp[36] + } + { + nop + ldw r1, sp[37] + } + { + nop + stw r1, sp[13] + } + { + nop + ldw r2, sp[34] + } + { + nop + ldw r1, sp[35] + } + { + nop + stw r2, sp[17] + } + { + nop + stw r1, sp[16] + } + { + nop + ldw r1, sp[32] + } + { + nop + ldw r2, sp[33] + } + { + nop + stw r2, sp[10] + } + { + mov r1, r11 + stw r1, sp[12] + } + { + zext r1, 2 + nop + } + ecallt r1 + { + mkmsk r2, 1 + ldw r1, sp[31] + } + { + lss r2, r1, r2 + nop + } + bt r2, .LBB10_8 + { + nop + ldw r5, sp[12] + } + { + sub r2, r5, 1 + sub r1, r1, 1 + } + { + nop + stw r1, sp[19] + } + { + nop + ldw r1, sp[13] + } + { + nop + ldw r4, sp[38] + } + { + sub r4, r4, r1 + nop + } + ldaw r10, r3[r1] + { + nop + ldw r0, sp[20] + } + ldaw r0, r0[r1] + { + nop + ldw r3, sp[10] + } + mul r1, r3, r5 + { + shl r1, r1, 2 + nop + } + { + shl r9, r3, 2 + stw r1, sp[18] + } + { + ldc r1, 0 + nop + } +.LBB10_3: + { + nop + stw r1, sp[21] + } + { + ldc r3, 0 + ldw r1, sp[12] + } + { + lss r1, r3, r1 + mov r8, r0 + } + { + mov r5, r3 + nop + } + bf r1, .LBB10_7 +.LBB10_4: + { + nop + ldw r1, sp[13] + } + { + nop + ldw r3, sp[38] + } + { + lss r7, r1, r3 + mov r1, r8 + } + { + mov r3, r10 + mov r6, r4 + } + bf r7, .LBB10_6 +.LBB10_5: + { + nop + ldw r7, r3[0] + } + { + sub r6, r6, 1 + stw r7, r1[0] + } + { + add r3, r3, 4 + add r1, r1, 4 + } + bt r6, .LBB10_5 +.LBB10_6: + { + add r1, r5, 1 + add r8, r8, r9 + } + { + eq r3, r5, r2 + mov r5, r1 + } + bf r3, .LBB10_4 +.LBB10_7: + { + nop + ldw r5, sp[21] + } + { + add 
r1, r5, 1 + ldw r3, sp[18] + } + { + add r0, r0, r3 + ldw r3, sp[19] + } + { + eq r3, r5, r3 + nop + } + bf r3, .LBB10_3 +.LBB10_8: + { + mkmsk r1, 1 + ldw r4, sp[17] + } + { + lss r1, r4, r1 + ldw r2, sp[20] + } + bt r1, .LBB10_25 + lda16 r0, r11[r11] + { + nop + stw r0, sp[14] + } + { + nop + ldw r0, sp[38] + } + { + sub r0, r0, 1 + nop + } + { + nop + stw r0, sp[19] + } + { + nop + ldw r3, sp[16] + } + mul r0, r11, r3 + { + sub r0, r3, 1 + stw r0, sp[1] + } + { + sub r0, r4, 1 + stw r0, sp[8] + } + { + shl r0, r11, 2 + stw r0, sp[4] + } + { + ldc r1, 0 + stw r0, sp[7] + } + { + mkmsk r0, 1 + nop + } + { + lss r0, r3, r0 + nop + } + { + ldc r5, 24 + stw r0, sp[3] + } + { + ldc r7, 16 + ldc r4, 8 + } + { + mov r3, r1 + mov r6, r1 + } +.LBB10_10: + { + nop + ldw r0, sp[3] + } + bt r0, .LBB10_24 + { + shl r0, r6, 1 + nop + } + { + nop + stw r0, sp[11] + } + { + nop + stw r6, sp[5] + } + { + nop + ldw r0, sp[2] + } + ldaw r0, r0[r3] + { + nop + stw r0, sp[17] + } + { + ldc r0, 0 + stw r3, sp[6] + } + { + mov r3, r0 + nop + } +.LBB10_12: + { + mkmsk r0, 1 + stw r3, sp[9] + } + { + sub r1, r0, r3 + nop + } + { + lsu r1, r0, r1 + stw r1, sp[16] + } + { + ldc r0, 0 + nop + } + bu .LBB10_13 +.LBB10_21: + { + nop + ldw r1, sp[18] + } + { + add r0, r1, 1 + nop + } +.xtaloop 3 + { + eq r1, r1, 2 + nop + } +.LBB10_13: + { + nop + stw r0, sp[18] + } + bt r1, .LBB10_22 + { + nop + ldw r0, sp[13] + } + { + nop + ldw r1, sp[38] + } + { + lss r1, r0, r1 + nop + } + bf r1, .LBB10_21 + { + nop + ldw r0, sp[11] + } + { + nop + ldw r1, sp[18] + } + { + add r1, r1, r0 + ldw r0, sp[12] + } + mul r1, r1, r0 + { + add r1, r1, 1 + ldw r0, sp[10] + } + mul r1, r1, r0 + { + nop + ldw r0, sp[13] + } + { + add r8, r1, r0 + mov r3, r0 + } +.LBB10_16: + { + shl r1, r3, 1 + ldw r2, r2[r8] + } + { + add r1, r1, r3 + ldw r0, sp[18] + } + { + nop + stw r3, sp[21] + } + { + add r1, r1, r0 + ldw r0, sp[16] + } + lda16 r1, r0[r1] + { + ldc r10, 30 + ldw r0, sp[14] + } + mul r1, r0, r1 + ashr r9, r1, 32 + { + shr r9, r9, r10 + nop + } + { + add r1, r1, r9 + nop + } + ldw r0, cp[.LCPI10_0] + { + and r1, r1, r0 + ldw r0, sp[15] + } + { + add r1, r0, r1 + ldw r10, sp[17] + } + { + shr r9, r11, 2 + nop + } +.LBB10_19: + { + add r1, r1, 4 + ldw r6, r1[0] + } + lextract r3, r6, r6, r5, 32 + + { + sub r9, r9, 1 + ldw r0, r10[0] + } + fmacc r2, r2, r0, r3 + + { + add r1, r1, 4 + ldw r0, r1[0] + } + lextract r6, r0, r6, r7, 32 + + { + ldc r4, 8 + ldw r3, r10[1] + } + fmacc r2, r2, r3, r6 + + { + add r1, r1, 4 + ldw r3, r1[0] + } + lextract r0, r3, r0, r4, 32 + + ldd r6, r4, r10[1] + fmacc r2, r2, r6, r0 + fmacc r2, r2, r4, r3 + + { + add r10, r10, r7 + bt r9, .LBB10_19 + } + { + nop + ldw r0, sp[20] + } + stw r2, r0[r8] + { + mov r2, r0 + add r8, r8, 1 + } + { + nop + ldw r3, sp[21] + } + { + add r0, r3, 1 + ldw r1, sp[19] + } + { + eq r1, r3, r1 + mov r3, r0 + } + bf r1, .LBB10_16 + bu .LBB10_21 +.LBB10_22: + { + nop + ldw r6, sp[9] + } + { + add r0, r6, 1 + ldw r1, sp[7] + } + { + nop + ldw r3, sp[17] + } + { + add r3, r3, r1 + nop + } + { + nop + stw r3, sp[17] + } + { + nop + ldw r1, sp[8] + } + { + eq r1, r6, r1 + mov r3, r0 + } + bf r1, .LBB10_12 + { + nop + ldw r0, sp[1] + } + { + nop + ldw r3, sp[6] + } + { + add r3, r0, r3 + ldw r6, sp[5] + } +.LBB10_24: + { + add r0, r6, 1 + ldw r1, sp[4] + } + { + eq r1, r6, r1 + mov r6, r0 + } + bf r1, .LBB10_10 +.LBB10_25: + { + nop + ldw r10, sp[28] + } + { + nop + ldw r8, sp[26] + } + { + nop + ldw r9, sp[27] + } + { + nop + ldw r6, sp[24] + } + { + nop + ldw r7, sp[25] + } + ldd r5, r4, 
sp[11] + { + nop + retsp 30 + } +.LBB10_26: + .cc_bottom xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.function + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.nstackwords,(__assert_func.nstackwords + 30) + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.nstackwords + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxcores,__assert_func.maxcores $M 1 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxcores + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxtimers,__assert_func.maxtimers $M 0 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxtimers + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxchanends,__assert_func.maxchanends $M 0 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxchanends +.Ltmp10: + .size xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S, .Ltmp10-xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc b/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc index 06f2eb0..c78fb52 100644 --- a/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc +++ b/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc @@ -36,7 +36,7 @@ void beta_fcf32_thread_worker(void *shared, void *start, void *end) { auto sd = static_cast(shared); #ifdef NN_USE_REF xc_fc_float_ref(sd->out, sd->in, sd->kernels, sd->out_f, - sd->in_f); + sd->in_f, *s, *e); #else xc_fc_float_opt(sd->out, sd->in, sd->kernels, sd->out_f, sd->in_f, *s, *e);
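
Note: below is a minimal host-side sketch of the 3-byte float packing this patch introduces, for illustration only. `pack3` and `unpack3` are hypothetical names, not part of the library, and a little-endian float layout is assumed (as on xcore.ai). Each value keeps its sign, exponent and top 15 mantissa bits, so the worst-case relative error is about 2^-15 (~3e-5), consistent with the 4e-5 to 1e-4 tolerances used in the tests; unlike `extract3_ref`, this sketch zeroes the dropped byte instead of leaving a neighbouring packed byte in it.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Drop the low mantissa byte of each little-endian float, keeping 3 bytes. */
static void pack3(uint8_t *dst, const float *src, int n) {
  for (int i = 0; i < n; i++)
    memcpy(dst + 3 * i, (const uint8_t *)&src[i] + 1, 3);
}

/* Rebuild value i; the dropped byte is restored as zero here, whereas
   extract3_ref/lextract leave an arbitrary neighbouring byte in it. */
static float unpack3(const uint8_t *src, int i) {
  uint32_t bits = 0;
  float f;
  memcpy((uint8_t *)&bits + 1, src + 3 * i, 3);
  memcpy(&f, &bits, sizeof(f));
  return f;
}

int main(void) {
  float in[4] = {1.0f, -3.14159f, 1e-3f, 12345.678f};
  uint8_t packed[3 * 4];
  pack3(packed, in, 4);
  for (int i = 0; i < 4; i++) {
    float out = unpack3(packed, i);
    printf("%12.6g -> %12.6g  rel err %g\n", in[i], out, (in[i] - out) / in[i]);
  }
  return 0;
}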
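Similarly, a portable model of the word-at-a-time unpacking that the lextract sequences perform, assuming `lextract dst, hi, lo, pos, 32` yields bits pos..pos+31 of the 64-bit value (hi:lo); `lextract32_model` and `unpack_group4` are illustrative names only. Three aligned 32-bit loads cover a group of four packed kernels, and each extract slides the window by 8 bits, which is why the optimized loops advance the kernel index by 3 words per 4 input channels.

#include <stdint.h>
#include <string.h>

/* Software model of the 32-bit lextract used above. */
static uint32_t lextract32_model(uint32_t hi, uint32_t lo, unsigned pos) {
  return (uint32_t)((((uint64_t)hi << 32) | lo) >> pos);
}

static float bits_to_float(uint32_t b) {
  float f;
  memcpy(&f, &b, sizeof(f));
  return f;
}

/* Unpack four consecutive packed floats from three 32-bit words, mirroring
   the f0/f1/f2 register usage in the optimized loops: the low mantissa byte
   of each result is a stray neighbouring byte, not zero. */
static void unpack_group4(const uint32_t *w, float out[4]) {
  uint32_t f0 = w[0], f1 = w[1], f2 = w[2];
  out[0] = bits_to_float(lextract32_model(f0, f0, 24));
  out[1] = bits_to_float(lextract32_model(f1, f0, 16));
  out[2] = bits_to_float(lextract32_model(f2, f1, 8));
  out[3] = bits_to_float(f2);
}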