diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
index 918474f..71e6078 100644
--- a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
+++ b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c
@@ -1,10 +1,13 @@
 #include "conv2d_float.h"
 #include <assert.h>
+#include <stdint.h>
+#include <string.h>
 
 int xc_fc_float_ref(float *outputs, float *inputs, float *kernels,
-                    int out_features, int input_features) {
+                    int out_features, int input_features, int out_f_start,
+                    int out_f_end) {
   int cnt = 0;
-  for (int f = 0; f < out_features; f++) {
+  for (int f = out_f_start; f < out_f_end; f++) {
     int output_index = f;
     float acc = 0;
     for (int kf = 0; kf < input_features; kf++) {
@@ -202,6 +205,7 @@ int xc_transpose_conv2d_float_kw5xh2_stride_h3_ref(
     for (int d = 0; d < out_depth; d++) {
       for (int kx = 0; kx < KW; kx++) {
         for (int ky = 0; ky < KH; ky++) {
+          if (y + ky != 1) continue; // Only compute if it is the middle frame
           int output_index =
               ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
           float acc = outputs[output_index];
@@ -351,6 +355,340 @@ void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
             outputs[output_index] = acc;
           }
         }
+      } else if ((input_depth & 0x3) == 0) { // All cases before here to be deleted.
+        for (int kx = 0; kx < KW; kx++) {
+          for (int ky = 0; ky < KH; ky++) {
+            // Only compute if it is the middle frame
+            if (ky + y != 1) {
+              continue;
+            }
+            int output_index =
+                ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
+            float acc = outputs[output_index];
+            for (int kd = 0; kd < input_depth; kd+=4) {
+              int input_index = ((x)*input_h + (y)) * input_depth + kd;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+              float in1 = inputs[input_index];
+              float in2 = kernels[kernel_index];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+1];
+              in2 = kernels[kernel_index+1];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+2];
+              in2 = kernels[kernel_index+2];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+3];
+              in2 = kernels[kernel_index+3];
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+            outputs[output_index] = acc;
+          }
+        }
+      } else {
+        assert(0);
+      }
+      }
+    }
+  }
+}
+#endif
+
+
+// Reference unpack of packed kernel `index`: each float is stored as its top
+// three bytes (sign, exponent, high mantissa bits).  The value is rebuilt one
+// byte early so the low mantissa byte picks up the preceding packed byte,
+// which only adds noise of about 2^-15 relative (and reads one byte below the
+// buffer when index is 0).
+float extract3_ref(float *kernels, int index) {
+  float x;
+  ((uint8_t *)&x)[0] = *(((uint8_t *)kernels) + index*3-1);
+  ((uint8_t *)&x)[1] = *(((uint8_t *)kernels) + index*3-1+1);
+  ((uint8_t *)&x)[2] = *(((uint8_t *)kernels) + index*3-1+2);
+  ((uint8_t *)&x)[3] = *(((uint8_t *)kernels) + index*3-1+3);
+// memcpy(&x, ((uint8_t *)kernels) + index*3-1, 4);
+  return x;
+}
+
+// Unpack via an lextract sliding window.  The caller must declare
+// float f0, f1, f2 and present `index` in increasing order so that the
+// words loaded in the earlier cases stay live for the later ones.
+#define extract3(fout, kernels, index) \
+  { \
+    switch(index & 3) { \
+    default: \
+      f0 = kernels[(index>>2)*3]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f0), "r" (f0), "r" (24)); \
+      break; \
+    case 1: \
+      f1 = kernels[(index>>2)*3+1]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f1), "r" (f0), "r" (16)); \
+      break; \
+    case 2: \
+      f2 = kernels[(index>>2)*3+2]; \
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (fout) : "r" (f2), "r" (f1), "r" (8)); \
+      break; \
+    case 3: \
+      fout = f2; \
+      break; \
+    } \
+}
+
+
+#ifndef NN_USE_REF
+void xc_conv2d_float_kw5xh2_stride_w3_packed_ref(float *outputs, float *inputs,
+                                                 float *kernels, float *biases,
+                                                 int out_w, int out_h, int out_depth,
+                                                 int input_w, int input_h,
+                                                 int input_depth, int out_depth_start,
+                                                 int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        float acc = biases[d];
+        if (input_depth == 1) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              int input_index =
+                  ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth;
+              float in1 = inputs[input_index];
+              float in2 = extract3_ref(kernels, kernel_index);
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+          }
+        } else if (input_depth == 2) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0; kd < 2; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 4) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll(full)
+              for (int kd = 0; kd < 4; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 8) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll(full)
+              for (int kd = 0; kd < 8; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if (input_depth == 16) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+#pragma clang loop unroll_count(8)
+              for (int kd = 0; kd < 16; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else {
+          assert(0);
+        }
+        outputs[output_index] = acc;
+      }
+    }
+  }
+}
+#endif
+
+#ifndef NN_USE_REF
+void xc_conv2d_float_kw5xh2_stride_w3_packed_opt(float *outputs, float *inputs,
+                                                 float *kernels, float *biases,
+                                                 int out_w, int out_h, int out_depth,
+                                                 int input_w, int input_h,
+                                                 int input_depth, int out_depth_start,
+                                                 int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        float f0, f1, f2;
+        int output_index = (x * out_h + y) * out_depth + d;
+        float acc = biases[d];
+        if (input_depth == 1) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              int input_index =
+                  ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth;
+              int kernel_index = ((d * KW + kx) * KH + ky) * input_depth;
+              float in1 = inputs[input_index];
+              float in2 = extract3_ref(kernels, kernel_index);
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+          }
+        } else if (input_depth == 2) {
+#pragma clang loop unroll(full)
+          for (int kx = 0; kx < KW; kx++) {
+#pragma clang loop unroll(full)
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0; kd < 2; kd++) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else if ((input_depth & 3) == 0) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+              for (int kd = 0, kd2 = 0; kd < input_depth; kd+=4, kd2 += 3) {
+                int input_index =
+                    ((x * H_STRIDE + kx) * input_h + (y + ky)) * input_depth +
+                    kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth*3/4 + kd2;
+                // Four packed kernels occupy three words; the three loads
+                // below slide an 8-bit window with lextract to rebuild the
+                // four floats (the low mantissa byte is neighbouring noise).
+                float in1 = inputs[input_index];
+                float in2;
+                f0 = kernels[kernel_index];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+1];
+                f1 = kernels[kernel_index + 1];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+2];
+                f2 = kernels[kernel_index + 2];
+                asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+3];
+                in2 = f2;
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+            }
+          }
+        } else {
+          assert(0);
+        }
+        outputs[output_index] = acc;
+      }
+    }
+  }
+}
+#endif
+
+
+#ifndef NN_USE_REF
+void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_ref(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end) {
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        outputs[output_index] = biases[d];
+      }
+    }
+  }
+  for (int x = 0; x < input_w; x++) {
+    for (int y = 0; y < input_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        if ((input_depth & 0x3) == 0) {
+          for (int kx = 0; kx < KW; kx++) {
+            for (int ky = 0; ky < KH; ky++) {
+              // Only compute if it is the middle frame
+              if (ky + y != 1) {
+                continue;
+              }
+              int output_index =
+                  ((x * H_TR_STRIDE + kx) * out_h + (y + ky)) * out_depth + d;
+              float acc = outputs[output_index];
+              for (int kd = 0; kd < input_depth; kd+=4) {
+                int input_index = ((x)*input_h + (y)) * input_depth + kd;
+                int kernel_index = ((d * KW + kx) * KH + ky) * input_depth + kd;
+                float in1 = inputs[input_index];
+                float in2;
+                in2 = extract3_ref(kernels, kernel_index);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+1];
+                in2 = extract3_ref(kernels, kernel_index+1);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+2];
+                in2 = extract3_ref(kernels, kernel_index+2);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+                in1 = inputs[input_index+3];
+                in2 = extract3_ref(kernels, kernel_index+3);
+                asm volatile("fmacc %0, %1, %2, %3"
+                             : "=r"(acc)
+                             : "r"(acc), "r"(in1), "r"(in2));
+              }
+              outputs[output_index] = acc;
+            }
+          }
       } else {
         assert(0);
       }
@@ -359,3 +697,389 @@ void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
     }
   }
 }
 #endif
+
+extern void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end);
+
+#ifndef NN_USE_REF
+void xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt(
+    float *outputs, float *inputs, float *kernels, float *biases, int out_w,
+    int out_h, int out_depth, int input_w, int input_h, int input_depth,
+    int out_depth_start, int out_depth_end) {
+  assert((input_depth & 0x3) == 0);
+  for (int x = 0; x < out_w; x++) {
+    for (int y = 0; y < out_h; y++) {
+      for (int d = out_depth_start; d < out_depth_end; d++) {
+        int output_index = (x * out_h + y) * out_depth + d;
+        outputs[output_index] = biases[d];
+      }
+    }
+  }
+  int input_base = 0;
+  for (int x = 0; x < input_w; x++) {
+    for (int y = 0; y < input_h; y++) {
+      int ky = 1-y;
+      if (ky >= 0 && ky < KH) {
+        for (int kx = 0; kx < KW; kx++) {
+          int output_index =
+              ((x * H_TR_STRIDE + kx) * out_h + 1) * out_depth + out_depth_start;
+          for (int d = out_depth_start; d < out_depth_end; d++) {
+            float acc = outputs[output_index];
+            float f0 = 0, f1 = 0, f2 = 0;
+            int kernel_index = ((d * KW + kx) * KH + ky) * input_depth*3/4;
+            // Same three-words-per-four-kernels lextract unpacking as above.
+            for (int kd = 0; kd < input_depth; kd+=4, kernel_index += 3) {
+              int input_index = input_base + kd;
+              float in1 = inputs[input_index];
+              float in2 = 0;
+              f0 = kernels[kernel_index];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+1];
+              f1 = kernels[kernel_index + 1];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+2];
+              f2 = kernels[kernel_index + 2];
+              asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+              in1 = inputs[input_index+3];
+              in2 = f2;
+              asm volatile("fmacc %0, %1, %2, %3"
+                           : "=r"(acc)
+                           : "r"(acc), "r"(in1), "r"(in2));
+            }
+            outputs[output_index] = acc;
+            output_index++;
+          }
+        }
+      }
+      input_base += input_depth;
+    }
+  }
+}
+#endif
+
+
+
+int xc_fc_float_packed_ref(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features, int out_f_start,
+                           int out_f_end) {
+  int cnt = 0;
+  for (int f = out_f_start; f < out_f_end; f++) {
+    int output_index = f;
+    float acc = 0;
+    for (int kf = 0; kf < input_features; kf++) {
+      int input_index = kf;
+      int kernel_index = f * input_features + kf;
+      acc += inputs[input_index] * extract3_ref(kernels, kernel_index);
+      cnt++;
+    }
+    outputs[output_index] = acc;
+  }
+  return cnt;
+}
+
+#ifndef NN_USE_REF
+int xc_fc_float_packed_opt(float *outputs, float *inputs, float *kernels,
+                           int out_features, int input_features, int out_f_start,
+                           int out_f_end) {
+  float f0, f1, f2;
+  for (int f = out_f_start; f < out_f_end; f++) {
+    int output_index = f;
+    float acc = 0;
+    assert(input_features == 96);
+    for (int kf = 0, kf2 = 0; kf < input_features; kf+=4, kf2 += 3) {
+      int input_index = kf;
+      int kernel_index = f * input_features*3/4 + kf2;
+      float in1 = inputs[input_index];
+      float in2 = 0;
+      f0 = kernels[kernel_index];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f0), "r" (f0), "r" (24));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+1];
+      f1 = kernels[kernel_index + 1];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f1), "r" (f0), "r" (16));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+2];
+      f2 = kernels[kernel_index + 2];
+      asm volatile("lextract %0, %1, %2, %3, 32" : "=r" (in2) : "r" (f2), "r" (f1), "r" (8));
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+      in1 = inputs[input_index+3];
+      in2 = f2;
+      asm volatile("fmacc %0, %1, %2, %3"
+                   : "=r"(acc)
+                   : "r"(acc), "r"(in1), "r"(in2));
+    }
+    outputs[output_index] = acc;
+  }
+  return 0;
+}
+
+#endif
+
+#ifdef LOCAL_MAIN
+
+static void pack_float(float *kernels, float *kernels_in, int num) {
+  // Keep only the top three bytes of each little-endian float.
+  for(int i = 0; i < num; i++) {
+    memcpy(((uint8_t *)kernels) + i*3, ((uint8_t * )&kernels_in[i])+1, 3);
+  }
+}
+
+#include <stdio.h>
+#include <math.h>
+
+int test_timing() {
+  int errors = 0;
+  float outputs0[64*3*16];
+  float outputs2[64*3*16];
+  float outputs3[64*3*16];
+  float inputs[31*2*32];
+  float kernels[16*3*2*32];
+  float kernels_packed[16*3*2*32*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  for(int i = 0; i < sizeof(outputs2) / sizeof(float); i++) {
+    outputs0[i] = 0;
+    outputs2[i] = 0;
+    outputs3[i] = 0;
+  }
+  int t0, t1, t2, t3, cnt;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
+      outputs2, inputs, kernels, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32,   // input w/h/d
+      0, 16);
+  asm volatile("gettime %0" : "=r" (t1));
+  cnt = xc_transpose_conv2d_float_kw5xh2_stride_h3_ref(
+      outputs0, inputs, kernels, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32    // input w/h/d
+      );
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt(
+      outputs3, inputs, kernels_packed, biases,
+      64, 3, 16,   // output w/h/d
+      31, 2, 32,   // input w/h/d
+      0, 16);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("TrConv NewOpt %d REF %d Old %d maccs %d\n", t3-t2, t2-t1, t1-t0, cnt);
+  for(int o = 0; o < sizeof(outputs2)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 1e-4 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs0[o]) / outputs2[o]) > 1e-4 ) {
+      printf("Expected idx %d %f got %f tr_conv_ref\n", o, outputs2[o], outputs0[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_tc() {
+  int errors = 0;
+  float outputs[6*4*32];
+  float outputs2[6*4*32];
+  float outputs3[6*4*32];
+  float inputs[2*2*64];
+  float kernels[32*3*2*64];
+  float kernels_packed[32*3*2*64*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  int t0, t1, t2, t3;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
+      outputs2, inputs, kernels, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t1));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_ref(
+      outputs, inputs, kernels_packed, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S(
+      outputs3, inputs, kernels_packed, biases,
+      6, 4, 32,   // output w/h/d
+      2, 2, 64,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("TrConv NewOpt %d NewRef %d Old %d\n", t3-t2, t2-t1, t1-t0);
+  for(int o = 0; o < sizeof(outputs)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs[o]) / outputs2[o]) > 4e-5 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_ref\n", o, outputs2[o], outputs[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 4e-5 ) {
+      printf("Expected idx %d %f got %f tr_conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_c() {
+  int errors = 0;
+  float outputs[3*1*32];
+  float outputs2[3*1*32];
+  float outputs3[3*1*32];
+  float inputs[8*2*16];
+  float kernels[16*3*2*32];
+  float kernels_packed[16*3*2*32*3/4];
+  float biases[128];
+  for(int i = 0; i < sizeof(kernels) / sizeof(float); i++) {
+    kernels[i] = i / 16 - 30;
+  }
+  for(int i = 0; i < sizeof(inputs) / sizeof(float); i++) {
+    inputs[i] = 12 - i / 4;
+  }
+  for(int i = 0; i < sizeof(outputs) / sizeof(float); i++) {
+    outputs[i] = 0;
+    outputs2[i] = 0;
+    outputs3[i] = 0;
+  }
+  for(int i = 0; i < sizeof(biases) / sizeof(float); i++) {
+    biases[i] = 1.0f / (i + 1);
+  }
+  int t0, t1, t2, t3;
+  pack_float(kernels_packed, kernels, sizeof(kernels)/sizeof(float));
+  asm volatile("gettime %0" : "=r" (t0));
+  xc_conv2d_float_kw5xh2_stride_w3_ref(
+      outputs2, inputs, kernels, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16    // input w/h/d
+      );
+  asm volatile("gettime %0" : "=r" (t1));
+  xc_conv2d_float_kw5xh2_stride_w3_packed_ref(
+      outputs, inputs, kernels_packed, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t2));
+  xc_conv2d_float_kw5xh2_stride_w3_packed_opt(
+      outputs3, inputs, kernels_packed, biases,
+      3, 1, 32,   // output w/h/d
+      5, 2, 16,   // input w/h/d
+      0, 32);
+  asm volatile("gettime %0" : "=r" (t3));
+  printf("Conv2d NewOpt %d NewRef %d Old %d\n", t3-t2, t2-t1, t1-t0);
+  for(int o = 0; o < sizeof(outputs)/sizeof(float); o++) {
+    if (fabs((outputs2[o]-outputs[o]) / outputs2[o]) > 6e-5 ) {
+      printf("Expected idx %d %f got %f conv_packed_ref\n", o, outputs2[o], outputs[o]);
+      errors++;
+    }
+    if (fabs((outputs2[o]-outputs3[o]) / outputs2[o]) > 6e-5 ) {
+      printf("Expected idx %d %f got %f conv_packed_opt\n", o, outputs2[o], outputs3[o]);
+      errors++;
+    }
+  }
+
+  return errors;
+}
+
+int test_fc(int opt) {
+  float outputs[4];
+  float expected_outputs[4];
+  float inputs[96];
+  float kernels2[96*3];
+  float kernels[96*4];
+  for(int i=0; i<96; i++) {
+    inputs[i] = i*i;
+  }
+  for(int i=0; i<4*96; i++) {
+    kernels[i] = i;
+  }
+  for(int o = 0; o < 4; o++) {
+    float e = 0;
+    for(int i=0; i<96; i++) {
+      e += i*i * (i+o*96);
+    }
+    expected_outputs[o] = e;
+  }
+  int t0, t1;
+  pack_float(kernels2, kernels, 96*4);
+  asm volatile("gettime %0" : "=r" (t0));
+  switch(opt) {
+  case 0:
+    xc_fc_float_ref(outputs, inputs, kernels, 10, 96, 0, 4);
+    break;
+  case 1:
+    xc_fc_float_opt(outputs, inputs, kernels, 10, 96, 0, 4);
+    break;
+  case 2:
+    xc_fc_float_packed_ref(outputs, inputs, kernels2, 10, 96, 0, 4);
+    break;
+  case 3:
+    xc_fc_float_packed_opt(outputs, inputs, kernels2, 10, 96, 0, 4);
+    break;
+  }
+  asm volatile("gettime %0" : "=r" (t1));
+  printf("%d ticks\n", t1-t0);
+  int errors = 0;
+  for(int o = 0; o < 4; o++) {
+    if (fabs((expected_outputs[o]-outputs[o]) / expected_outputs[o]) > 1e-5 ) {
+      printf("Expected idx %d %f got %f func %d\n", o, expected_outputs[o], outputs[o], opt);
+      errors++;
+    }
+  }
+  return errors;
+}
+
+int main(void) {
+  int errors = 0;
+  errors += test_tc();
+  return errors;
+  // The tests below are currently disabled.
+  errors += test_timing();
+  errors += test_c();
+  errors += test_fc(0);
+  errors += test_fc(1);
+  errors += test_fc(2);
+  errors += test_fc(3);
+  if (errors) {
+    printf("FAIL\n");
+  } else {
+    printf("PASS\n");
+  }
+  return 0;
+}
+#endif
diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
index 09b9a64..7a04c1f 100644
--- a/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
+++ b/lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.h
@@ -19,7 +19,7 @@ extern "C" {
  * @returns number of MACCs
  */
 extern int xc_fc_float_ref(float *outputs, float *inputs, float *kernels,
-                           int out_features, int input_features);
+                           int out_features, int input_features, int out_f_start, int out_f_end);
 
 /** Optimized function that calculates a fully connected.
  *
@@ -148,6 +148,45 @@ extern void xc_transpose_conv2d_float_kw5xh2_stride_h3_opt(
     int out_h, int out_depth, int input_w, int input_h, int input_depth,
     int out_depth_start, int out_depth_end);
 
+/** Function that calculates a fully connected layer. The kernels must be packed floats.
+ *
+ * @param outputs          pointer to the output data, the output data will
+ *                         be stored as an array [out_features]
+ * @param inputs           pointer to the input data, the input data must
+ *                         be stored as an array [input_features]
+ * @param kernels          pointer to the kernels, the kernels
+ *                         must be stored as an array
+ *                         [out_features][input_features]
+ * @param out_features     dimension 1 of the output array
+ * @param input_features   dimension 1 of the input array
+ * @param out_f_start      output features to start at
+ * @param out_f_end        output features to end at plus one
+ * @returns number of MACCs
+ */
+extern int xc_fc_float_packed_ref(float *outputs, float *inputs, float *kernels,
+                                  int out_features, int input_features, int out_f_start,
+                                  int out_f_end);
+
+/** Optimized function that calculates a fully connected layer. The kernels must be packed floats.
+ * + * @param outputs pointer to the output data, the output data will + * be stored as an array [out_features] + * @param inputs pointer to the input data, the input data must + * be stored as an array [input_features] + * @param kernels pointer to the kernels, the kernels + * must be stored as an array + * [out_features][input_features] + * @param out_features dimension 1 of the output array + * @param input_features dimension 1 of the input array + * @param out_f_start output features to start at + * @param out_f_end output features to end at plus one + * @returns number of MACCs + */ +extern int xc_fc_float_packed_opt(float *outputs, float *inputs, float *kernels, + int out_features, int input_features, int out_f_start, + int out_f_end); + + #ifdef __cplusplus }; #endif diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S b/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S new file mode 100644 index 0000000..17ef07d --- /dev/null +++ b/lib_tflite_micro/src/tflite-xcore-kernels/xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.S @@ -0,0 +1,613 @@ + + .section .cp.rodata.cst4,"aMc",@progbits,4 + .cc_top .LCPI10_0.data,.LCPI10_0 + .align 4 + .type .LCPI10_0,@object + .size .LCPI10_0, 4 +.LCPI10_0: + .long 4294967292 + .cc_bottom .LCPI10_0.data + .text + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S + .align 4 + .type xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S,@function + .cc_top xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.function,xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S +xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S: + .issue_mode dual + { + nop + dualentsp 30 + } + std r5, r4, sp[11] + { + nop + stw r6, sp[24] + } + { + nop + stw r7, sp[25] + } + { + nop + stw r8, sp[26] + } + { + nop + stw r9, sp[27] + } + { + nop + stw r10, sp[28] + } + { + nop + stw r2, sp[15] + } + { + nop + stw r1, sp[2] + } + { + nop + stw r0, sp[20] + } + { + nop + ldw r11, sp[36] + } + { + nop + ldw r1, sp[37] + } + { + nop + stw r1, sp[13] + } + { + nop + ldw r2, sp[34] + } + { + nop + ldw r1, sp[35] + } + { + nop + stw r2, sp[17] + } + { + nop + stw r1, sp[16] + } + { + nop + ldw r1, sp[32] + } + { + nop + ldw r2, sp[33] + } + { + nop + stw r2, sp[10] + } + { + mov r1, r11 + stw r1, sp[12] + } + { + zext r1, 2 + nop + } + ecallt r1 + { + mkmsk r2, 1 + ldw r1, sp[31] + } + { + lss r2, r1, r2 + nop + } + bt r2, .LBB10_8 + { + nop + ldw r5, sp[12] + } + { + sub r2, r5, 1 + sub r1, r1, 1 + } + { + nop + stw r1, sp[19] + } + { + nop + ldw r1, sp[13] + } + { + nop + ldw r4, sp[38] + } + { + sub r4, r4, r1 + nop + } + ldaw r10, r3[r1] + { + nop + ldw r0, sp[20] + } + ldaw r0, r0[r1] + { + nop + ldw r3, sp[10] + } + mul r1, r3, r5 + { + shl r1, r1, 2 + nop + } + { + shl r9, r3, 2 + stw r1, sp[18] + } + { + ldc r1, 0 + nop + } +.LBB10_3: + { + nop + stw r1, sp[21] + } + { + ldc r3, 0 + ldw r1, sp[12] + } + { + lss r1, r3, r1 + mov r8, r0 + } + { + mov r5, r3 + nop + } + bf r1, .LBB10_7 +.LBB10_4: + { + nop + ldw r1, sp[13] + } + { + nop + ldw r3, sp[38] + } + { + lss r7, r1, r3 + mov r1, r8 + } + { + mov r3, r10 + mov r6, r4 + } + bf r7, .LBB10_6 +.LBB10_5: + { + nop + ldw r7, r3[0] + } + { + sub r6, r6, 1 + stw r7, r1[0] + } + { + add r3, r3, 4 + add r1, r1, 4 + } + bt r6, .LBB10_5 +.LBB10_6: + { + add r1, r5, 1 + add r8, r8, r9 + } + { + eq r3, r5, r2 + mov r5, r1 + } + bf r3, .LBB10_4 +.LBB10_7: + { + nop + ldw r5, sp[21] + } + { + add 
r1, r5, 1 + ldw r3, sp[18] + } + { + add r0, r0, r3 + ldw r3, sp[19] + } + { + eq r3, r5, r3 + nop + } + bf r3, .LBB10_3 +.LBB10_8: + { + mkmsk r1, 1 + ldw r4, sp[17] + } + { + lss r1, r4, r1 + ldw r2, sp[20] + } + bt r1, .LBB10_25 + lda16 r0, r11[r11] + { + nop + stw r0, sp[14] + } + { + nop + ldw r0, sp[38] + } + { + sub r0, r0, 1 + nop + } + { + nop + stw r0, sp[19] + } + { + nop + ldw r3, sp[16] + } + mul r0, r11, r3 + { + sub r0, r3, 1 + stw r0, sp[1] + } + { + sub r0, r4, 1 + stw r0, sp[8] + } + { + shl r0, r11, 2 + stw r0, sp[4] + } + { + ldc r1, 0 + stw r0, sp[7] + } + { + mkmsk r0, 1 + nop + } + { + lss r0, r3, r0 + nop + } + { + ldc r5, 24 + stw r0, sp[3] + } + { + ldc r7, 16 + ldc r4, 8 + } + { + mov r3, r1 + mov r6, r1 + } +.LBB10_10: + { + nop + ldw r0, sp[3] + } + bt r0, .LBB10_24 + { + shl r0, r6, 1 + nop + } + { + nop + stw r0, sp[11] + } + { + nop + stw r6, sp[5] + } + { + nop + ldw r0, sp[2] + } + ldaw r0, r0[r3] + { + nop + stw r0, sp[17] + } + { + ldc r0, 0 + stw r3, sp[6] + } + { + mov r3, r0 + nop + } +.LBB10_12: + { + mkmsk r0, 1 + stw r3, sp[9] + } + { + sub r1, r0, r3 + nop + } + { + lsu r1, r0, r1 + stw r1, sp[16] + } + { + ldc r0, 0 + nop + } + bu .LBB10_13 +.LBB10_21: + { + nop + ldw r1, sp[18] + } + { + add r0, r1, 1 + nop + } +.xtaloop 3 + { + eq r1, r1, 2 + nop + } +.LBB10_13: + { + nop + stw r0, sp[18] + } + bt r1, .LBB10_22 + { + nop + ldw r0, sp[13] + } + { + nop + ldw r1, sp[38] + } + { + lss r1, r0, r1 + nop + } + bf r1, .LBB10_21 + { + nop + ldw r0, sp[11] + } + { + nop + ldw r1, sp[18] + } + { + add r1, r1, r0 + ldw r0, sp[12] + } + mul r1, r1, r0 + { + add r1, r1, 1 + ldw r0, sp[10] + } + mul r1, r1, r0 + { + nop + ldw r0, sp[13] + } + { + add r8, r1, r0 + mov r3, r0 + } +.LBB10_16: + { + shl r1, r3, 1 + ldw r2, r2[r8] + } + { + add r1, r1, r3 + ldw r0, sp[18] + } + { + nop + stw r3, sp[21] + } + { + add r1, r1, r0 + ldw r0, sp[16] + } + lda16 r1, r0[r1] + { + ldc r10, 30 + ldw r0, sp[14] + } + mul r1, r0, r1 + ashr r9, r1, 32 + { + shr r9, r9, r10 + nop + } + { + add r1, r1, r9 + nop + } + ldw r0, cp[.LCPI10_0] + { + and r1, r1, r0 + ldw r0, sp[15] + } + { + add r1, r0, r1 + ldw r10, sp[17] + } + { + shr r9, r11, 2 + nop + } +.LBB10_19: + { + add r1, r1, 4 + ldw r6, r1[0] + } + lextract r3, r6, r6, r5, 32 + + { + sub r9, r9, 1 + ldw r0, r10[0] + } + fmacc r2, r2, r0, r3 + + { + add r1, r1, 4 + ldw r0, r1[0] + } + lextract r6, r0, r6, r7, 32 + + { + ldc r4, 8 + ldw r3, r10[1] + } + fmacc r2, r2, r3, r6 + + { + add r1, r1, 4 + ldw r3, r1[0] + } + lextract r0, r3, r0, r4, 32 + + ldd r6, r4, r10[1] + fmacc r2, r2, r6, r0 + fmacc r2, r2, r4, r3 + + { + add r10, r10, r7 + bt r9, .LBB10_19 + } + { + nop + ldw r0, sp[20] + } + stw r2, r0[r8] + { + mov r2, r0 + add r8, r8, 1 + } + { + nop + ldw r3, sp[21] + } + { + add r0, r3, 1 + ldw r1, sp[19] + } + { + eq r1, r3, r1 + mov r3, r0 + } + bf r1, .LBB10_16 + bu .LBB10_21 +.LBB10_22: + { + nop + ldw r6, sp[9] + } + { + add r0, r6, 1 + ldw r1, sp[7] + } + { + nop + ldw r3, sp[17] + } + { + add r3, r3, r1 + nop + } + { + nop + stw r3, sp[17] + } + { + nop + ldw r1, sp[8] + } + { + eq r1, r6, r1 + mov r3, r0 + } + bf r1, .LBB10_12 + { + nop + ldw r0, sp[1] + } + { + nop + ldw r3, sp[6] + } + { + add r3, r0, r3 + ldw r6, sp[5] + } +.LBB10_24: + { + add r0, r6, 1 + ldw r1, sp[4] + } + { + eq r1, r6, r1 + mov r6, r0 + } + bf r1, .LBB10_10 +.LBB10_25: + { + nop + ldw r10, sp[28] + } + { + nop + ldw r8, sp[26] + } + { + nop + ldw r9, sp[27] + } + { + nop + ldw r6, sp[24] + } + { + nop + ldw r7, sp[25] + } + ldd r5, r4, 
sp[11] + { + nop + retsp 30 + } +.LBB10_26: + .cc_bottom xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.function + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.nstackwords,(__assert_func.nstackwords + 30) + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.nstackwords + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxcores,__assert_func.maxcores $M 1 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxcores + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxtimers,__assert_func.maxtimers $M 0 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxtimers + .set xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxchanends,__assert_func.maxchanends $M 0 + .globl xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S.maxchanends +.Ltmp10: + .size xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S, .Ltmp10-xc_transpose_conv2d_float_kw5xh2_stride_h3_packed_opt_S diff --git a/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc b/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc index 06f2eb0..c78fb52 100644 --- a/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc +++ b/lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc @@ -36,7 +36,7 @@ void beta_fcf32_thread_worker(void *shared, void *start, void *end) { auto sd = static_cast(shared); #ifdef NN_USE_REF xc_fc_float_ref(sd->out, sd->in, sd->kernels, sd->out_f, - sd->in_f); + sd->in_f, *s, *e); #else xc_fc_float_opt(sd->out, sd->in, sd->kernels, sd->out_f, sd->in_f, *s, *e);
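
Note: below is a minimal host-side sketch of the 3-byte float packing this patch introduces, for illustration only. `pack3` and `unpack3` are hypothetical names, not part of the library, and a little-endian float layout is assumed (as on xcore.ai). Each value keeps its sign, exponent and top 15 mantissa bits, so the worst-case relative error is about 2^-15 (~3e-5), consistent with the 4e-5 to 1e-4 tolerances used in the tests; unlike `extract3_ref`, this sketch zeroes the dropped byte instead of leaving a neighbouring packed byte in it.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Drop the low mantissa byte of each little-endian float, keeping 3 bytes. */
static void pack3(uint8_t *dst, const float *src, int n) {
  for (int i = 0; i < n; i++)
    memcpy(dst + 3 * i, (const uint8_t *)&src[i] + 1, 3);
}

/* Rebuild value i; the dropped byte is restored as zero here, whereas
   extract3_ref/lextract leave an arbitrary neighbouring byte in it. */
static float unpack3(const uint8_t *src, int i) {
  uint32_t bits = 0;
  float f;
  memcpy((uint8_t *)&bits + 1, src + 3 * i, 3);
  memcpy(&f, &bits, sizeof(f));
  return f;
}

int main(void) {
  float in[4] = {1.0f, -3.14159f, 1e-3f, 12345.678f};
  uint8_t packed[3 * 4];
  pack3(packed, in, 4);
  for (int i = 0; i < 4; i++) {
    float out = unpack3(packed, i);
    printf("%12.6g -> %12.6g  rel err %g\n", in[i], out, (in[i] - out) / in[i]);
  }
  return 0;
}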
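Similarly, a portable model of the word-at-a-time unpacking that the lextract sequences perform, assuming `lextract dst, hi, lo, pos, 32` yields bits pos..pos+31 of the 64-bit value (hi:lo); `lextract32_model` and `unpack_group4` are illustrative names only. Three aligned 32-bit loads cover a group of four packed kernels, and each extract slides the window by 8 bits, which is why the optimized loops advance the kernel index by 3 words per 4 input channels.

#include <stdint.h>
#include <string.h>

/* Software model of the 32-bit lextract used above. */
static uint32_t lextract32_model(uint32_t hi, uint32_t lo, unsigned pos) {
  return (uint32_t)((((uint64_t)hi << 32) | lo) >> pos);
}

static float bits_to_float(uint32_t b) {
  float f;
  memcpy(&f, &b, sizeof(f));
  return f;
}

/* Unpack four consecutive packed floats from three 32-bit words, mirroring
   the f0/f1/f2 register usage in the optimized loops: the low mantissa byte
   of each result is a stray neighbouring byte, not zero. */
static void unpack_group4(const uint32_t *w, float out[4]) {
  uint32_t f0 = w[0], f1 = w[1], f2 = w[2];
  out[0] = bits_to_float(lextract32_model(f0, f0, 24));
  out[1] = bits_to_float(lextract32_model(f1, f0, 16));
  out[2] = bits_to_float(lextract32_model(f2, f1, 8));
  out[3] = bits_to_float(f2);
}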