From 94fc870751490c88a77bf033854b89351ecc2b7e Mon Sep 17 00:00:00 2001
From: panickal-xmos <deepakpanickal@xmos.com>
Date: Wed, 25 Oct 2023 21:28:44 +0100
Subject: [PATCH] Add beta float support

---
 xformer/IR/XCoreOps.td                        | 27 +++++++++-
 .../ApplyLoadConstantOpPatterns.cpp           |  6 +++
 xformer/Transforms/ApplyXCPatterns.cpp        | 15 +++++-
 xformer/Transforms/ConvPatterns.td            | 20 ++++----
 xformer/Transforms/Options.h                  |  3 ++
 xformer/Transforms/ReplaceConv2D.cpp          |  2 +
 xformer/Transforms/TranslateToCustomOp.cpp    | 18 ++++---
 xformer/Transforms/WriteFlashImage.cpp        | 49 ++++++++++---------
 xformer/Transforms/XCPatterns.td              | 17 +++++--
 xformer/Utils/FileIO.cpp                      | 36 ++++++++++++++
 xformer/Utils/FileIO.h                        |  4 ++
 xformer/Utils/TileRamSupport.cpp              | 27 ++++++++++
 xformer/Utils/TileRamSupport.h                | 20 ++++++++
 xformer/Utils/Utils.td                        |  2 +
 xformer/XCoreOptMain.cpp                      | 24 ++++++++-
 xformer/lib_tflite_micro.BUILD                |  2 +
 16 files changed, 224 insertions(+), 48 deletions(-)
 create mode 100644 xformer/Utils/TileRamSupport.cpp
 create mode 100644 xformer/Utils/TileRamSupport.h

diff --git a/xformer/IR/XCoreOps.td b/xformer/IR/XCoreOps.td
index ec7a918bc..17414147f 100644
--- a/xformer/IR/XCoreOps.td
+++ b/xformer/IR/XCoreOps.td
@@ -132,6 +132,31 @@ def XC_MulOp : XC_Op<"mul", [Pure]> {
   let results = (outs TensorOf<[QI8]> : $output);
 }
 
+def XC_Beta_ActivationF32Op : XC_Op<"beta_activationf32", [Pure]> {
+  let summary = "Beta ActivationF32 op";
+
+  let description = [{Beta ActivationF32 op.}];
+
+  let arguments = (ins
+    TensorOf<[F32]>:$input,
+    I32Attr:$type // 0 = Elu, 1 = Logistic, 2 = Tanh (see getActivationType)
+  );
+
+  let results = (outs TensorOf<[F32]> : $output);
+}
+
+def XC_Beta_ConcatF32Op : XC_Op<"beta_concatf32", [Pure]> {
+  let summary = "Beta ConcatF32 op";
+
+  let description = [{Beta ConcatF32 op.}];
+
+  let arguments = (ins
+    Variadic<TensorOf<[F32]>>:$input // NOTE(review): no axis attr - confirm
+  );
+
+  let results = (outs TensorOf<[F32]> : $output);
+}
+
 def XC_Beta_ConvF32Op : XC_Op<"beta_convf32", [Pure]> {
   let summary = "Beta ConvF32 op";
 
@@ -248,7 +273,7 @@ def XC_LookupOp : XC_Op<"lookup", [Pure]> {
 
   let description = [{Lookup table op.}];
 
-  let arguments = (ins TensorOf<[QI8]> : $input, TensorOf<[I8]> : $lut, I32Attr : $thread_count);
+  let arguments = (ins TensorOf<[QI8]> : $input, TensorOf<[I8]> : $lut);
 
   let results = (outs TensorOf<[QI8]> : $output);
 }
diff --git a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp
index 1c14290f6..27c7ac50f 100644
--- a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp
+++ b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp
@@ -30,7 +30,12 @@ struct ApplyLoadConstantOpPatterns
   void runOnOperation() override;
 };
 
+static int totalSize_ = 0;
+
 bool shouldBeLoadedExternally(Attribute values) {
+  if (totalSize_ > maxLoadExternalSizeOption) {
+    return false;
+  }
   // values might be UnitAttr or BoolAttr which are too small to be loaded
   // externally anyway
   auto totalSizeInBits = 0;
@@ -40,6 +45,7 @@ bool shouldBeLoadedExternally(Attribute values) {
         (valuesAttr.getNumElements() *
          valuesAttr.getType().getElementType().getIntOrFloatBitWidth());
   }
+  totalSize_ += totalSizeInBits / CHAR_BIT;
   return totalSizeInBits / CHAR_BIT > loadExternallyIfLargerOption;
 }
 
diff --git a/xformer/Transforms/ApplyXCPatterns.cpp b/xformer/Transforms/ApplyXCPatterns.cpp
index e6e484cec..9b7bc57cd 100644
--- a/xformer/Transforms/ApplyXCPatterns.cpp
+++ b/xformer/Transforms/ApplyXCPatterns.cpp
@@ -30,6 +30,8 @@ struct ApplyXCPatterns
   void runOnOperation() override;
 };
 
+bool isBetaFloatEnabled() { return enableBetaFloatOption; } // Utils.td CPred
+
 StringAttr getPaddingPlan(PatternRewriter &rewriter, TFL::PadOp padOp) {
   DenseIntElementsAttr paddingAttr;
   if (!matchPattern(padOp.getPadding(), m_Constant(&paddingAttr))) {
@@ -83,8 +85,17 @@ IntegerAttr getPadValue(PatternRewriter &rewriter, Value inputVal) {
   return rewriter.getI32IntegerAttr(padValue);
 }
 
-IntegerAttr getThreadCount(PatternRewriter &rewriter) {
-  return rewriter.getI32IntegerAttr(threadCountOption);
+// Returns the integer code for the activation op being rewritten:
+// 0 for Elu, 1 for Logistic and 2 for Tanh. Other ops are unreachable.
+IntegerAttr getActivationType(PatternRewriter &rewriter, Operation *op) {
+  // TODO: Refactor to use shared header file for enum
+  if (isa<TFL::EluOp>(op))
+    return rewriter.getI32IntegerAttr(0);
+  if (isa<TFL::LogisticOp>(op))
+    return rewriter.getI32IntegerAttr(1);
+  if (isa<TFL::TanhOp>(op))
+    return rewriter.getI32IntegerAttr(2);
+  llvm_unreachable("Unsupported op!");
 }
 
 DenseElementsAttr getLookupTable(PatternRewriter &rewriter, Operation *op) {
diff --git a/xformer/Transforms/ConvPatterns.td b/xformer/Transforms/ConvPatterns.td
index fcfdc18f7..9cbd0ced8 100644
--- a/xformer/Transforms/ConvPatterns.td
+++ b/xformer/Transforms/ConvPatterns.td
@@ -42,26 +42,26 @@ Pat<(TFL_DepthwiseConv2DOp: $output TensorOf<[QI8]>:$input, TensorOf<[QI8]>:$f,
               (IsConstOp $f),
               ]>;
 
-// TODO: Special case, we only optimize conv with filter width 5, filter height
-// 2, and stride height 3
+// Special case, we only optimize conv with filter width 3, filter height
+// 2, and stride height 2 (the constraint below keeps its old fw5 name)
 def Hasfw5fh2
     : Constraint<CPred<"$0.getType().cast<ShapedType>().getRank() == 4 && "
-                       "$0.getType().cast<ShapedType>().getDimSize(1) == 5 && "
+                       "$0.getType().cast<ShapedType>().getDimSize(1) == 3 && "
                        "$0.getType().cast<ShapedType>().getDimSize(2) == 2">>;
 
 // F32 TFL_Conv2D() -> XC_Beta_ConvF32()
 def :
-Pat<(TFL_Conv2DOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, TensorOf<[F32]>:$b, $dh, $dw, $faf, $wf, ConstantAttr<I32Attr, "3">, ConstantAttr<I32Attr, "1">),
+Pat<(TFL_Conv2DOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, TensorOf<[F32]>:$b, $dh, $dw, $faf, $wf, ConstantAttr<I32Attr, "2">, ConstantAttr<I32Attr, "1">),
           (XC_Beta_ConvF32Op $input, $f, $b),
-          [(Hasfw5fh2 $f)]>;
+          [(Hasfw5fh2 $f), (isBetaFloatEnabled)]>;
 
-// F32 TFL_TransposeConv2D() -> XC_Beta_TransposeConvF32()
+// F32 TFL_TransposeConv2D() -> XC_Beta_TransposeConvF32()
 def :
-Pat<(TFL_TransposeConvOp: $output $outshape, TensorOf<[F32]>:$f, TensorOf<[F32]>:$input, TensorOf<[F32]>:$b, $wf, ConstantAttr<I32Attr, "3">, ConstantAttr<I32Attr, "1">, $faf),
+Pat<(TFL_TransposeConvOp: $output $outshape, TensorOf<[F32]>:$f, TensorOf<[F32]>:$input, TensorOf<[F32]>:$b, $wf, ConstantAttr<I32Attr, "2">, ConstantAttr<I32Attr, "1">, $faf),
           (XC_Beta_TransposeConvF32Op $input, $f, $b),
-          [(Hasfw5fh2 $f)]>;
+          [(Hasfw5fh2 $f), (isBetaFloatEnabled)]>;
 
-// F32 TFL_FullyConnected() -> XC_Beta_FcF32()
+// F32 TFL_FullyConnected() -> XC_Beta_FcF32()
 def :
 Pat<(TFL_FullyConnectedOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, $b, $faf, $wf, $knd, $aqi),
-          (XC_Beta_FcF32Op $input, $f)>;
+          (XC_Beta_FcF32Op $input, $f), [(isBetaFloatEnabled)]>;
diff --git a/xformer/Transforms/Options.h b/xformer/Transforms/Options.h
index 5dbf94c7f..096d8ef2d 100644
--- a/xformer/Transforms/Options.h
+++ b/xformer/Transforms/Options.h
@@ -9,9 +9,12 @@
 namespace mlir {
 namespace xcore {
 
+extern llvm::cl::opt<bool> enableBetaFloatOption;
 extern llvm::cl::opt<unsigned> threadCountOption;
 extern llvm::cl::opt<std::string> flashImageFilenameOption;
 extern llvm::cl::opt<unsigned> loadExternallyIfLargerOption;
+extern llvm::cl::opt<bool> tileLoadOption;
+extern llvm::cl::opt<unsigned> maxLoadExternalSizeOption;
 extern llvm::cl::opt<double> convQuantErrorThresholdOption;
 extern llvm::cl::opt<bool> convForceErrorCheckOption;
 extern llvm::cl::opt<unsigned> convMultiplierFactorOption;
diff --git a/xformer/Transforms/ReplaceConv2D.cpp b/xformer/Transforms/ReplaceConv2D.cpp
index 66349bf7a..47ed454b0 100644
--- a/xformer/Transforms/ReplaceConv2D.cpp
+++ b/xformer/Transforms/ReplaceConv2D.cpp
@@ -157,6 +157,8 @@ struct ReplaceConv2D
   void runOnOperation() override;
 };
 
+bool isBetaFloatEnabled() { return enableBetaFloatOption; } // Utils.td CPred
+
 namespace convpatterns {
 #include "Transforms/GeneratedConvPatterns.inc"
 }
diff --git a/xformer/Transforms/TranslateToCustomOp.cpp b/xformer/Transforms/TranslateToCustomOp.cpp
index 2e459b3e5..12550e927 100644
--- a/xformer/Transforms/TranslateToCustomOp.cpp
+++ b/xformer/Transforms/TranslateToCustomOp.cpp
@@ -14,18 +14,20 @@ namespace mlir {
 namespace xcore {
 
 std::vector<uint8_t> Bsign8Op::buildCustomOptions() { return {}; }
-std::vector<uint8_t> Beta_ConvF32Op::buildCustomOptions() { return {}; }
-std::vector<uint8_t> Beta_TransposeConvF32Op::buildCustomOptions() {
-  return {};
-}
-std::vector<uint8_t> Beta_FcF32Op::buildCustomOptions() { return {}; }
 
-std::vector<uint8_t> LookupOp::buildCustomOptions() {
+std::vector<uint8_t> Beta_ActivationF32Op::buildCustomOptions() {
   flexbuffers::Builder fbb;
-  fbb.Map([&]() { fbb.Int("tc", (int32_t)getThreadCount()); });
+  fbb.Map([&]() { fbb.Int("type", (int32_t)getType()); });
   fbb.Finish();
   return fbb.GetBuffer();
 }
+std::vector<uint8_t> Beta_ConcatF32Op::buildCustomOptions() { return {}; }
+std::vector<uint8_t> Beta_ConvF32Op::buildCustomOptions() { return {}; }
+std::vector<uint8_t> Beta_TransposeConvF32Op::buildCustomOptions() {
+  return {};
+}
+std::vector<uint8_t> Beta_FcF32Op::buildCustomOptions() { return {}; }
+std::vector<uint8_t> LookupOp::buildCustomOptions() { return {}; }
 
 std::vector<uint8_t> AddOp::buildCustomOptions() {
   flexbuffers::Builder fbb;
@@ -176,6 +178,8 @@ void TranslateToCustomOp::runOnOperation() {
   patterns.insert<RewriteToCustomOp<PadOp>>(ctx);
   patterns.insert<RewriteToCustomOp<Pad3To4Op>>(ctx);
   patterns.insert<RewriteToCustomOp<StridedSliceOp>>(ctx);
+  patterns.insert<RewriteToCustomOp<Beta_ActivationF32Op>>(ctx);
+  patterns.insert<RewriteToCustomOp<Beta_ConcatF32Op>>(ctx);
   patterns.insert<RewriteToCustomOp<Beta_ConvF32Op>>(ctx);
   patterns.insert<RewriteToCustomOp<Beta_TransposeConvF32Op>>(ctx);
   patterns.insert<RewriteToCustomOp<Beta_FcF32Op>>(ctx);
diff --git a/xformer/Transforms/WriteFlashImage.cpp b/xformer/Transforms/WriteFlashImage.cpp
index cef8e616e..5fc819482 100644
--- a/xformer/Transforms/WriteFlashImage.cpp
+++ b/xformer/Transforms/WriteFlashImage.cpp
@@ -4,6 +4,7 @@
 #include "IR/XCoreOps.h"
 #include "Transforms/Options.h"
 #include "Utils/FileIO.h"
+#include "Utils/TileRamSupport.h"
 
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
@@ -94,6 +95,8 @@ struct WriteFlashImagePattern : public OpRewritePattern<LoadConstantOp> {
       for (int i = 0; i < opNums.size(); i++) {
         ownerOp->setOperand(opNums[i], loadFlashOp.getResult(i));
       }
+
+      loadFlashOp->moveBefore(ownerOp);
       loadOp.erase();
     } else {
       std::vector<char> loadOpData = getTensorData(loadOp);
@@ -103,6 +106,18 @@ struct WriteFlashImagePattern : public OpRewritePattern<LoadConstantOp> {
           loadOp.getLoc(), loadOp.getType(), address,
           rewriter.getArrayAttr(dataSizes));
       rewriter.replaceOp(loadOp, loadFlashOp.getOutput());
+
+      // Find all uses of loadFlashOp and find the first Owner op
+      // so that we can move the loading to just before that op.
+      mlir::Operation *firstOwnerOp =
+          loadFlashOp->getResult(0).getUses().begin()->getOwner();
+      for (const mlir::OpOperand &use : loadFlashOp->getResult(0).getUses()) {
+        mlir::Operation *op = use.getOwner();
+        if (op->isBeforeInBlock(firstOwnerOp)) {
+          firstOwnerOp = op;
+        }
+      }
+      loadFlashOp->moveBefore(firstOwnerOp);
     }
 
     tensorsVec_->push_back(tensorData);
@@ -114,27 +129,6 @@ struct WriteFlashImagePattern : public OpRewritePattern<LoadConstantOp> {
   std::vector<std::vector<char>> *tensorsVec_;
 };
 
-struct MoveLoadOpPattern : public OpRewritePattern<LoadFlashOp> {
-  MoveLoadOpPattern(MLIRContext *context)
-      : OpRewritePattern<LoadFlashOp>(context) {}
-
-  LogicalResult matchAndRewrite(LoadFlashOp loadFlashOp,
-                                PatternRewriter &rewriter) const override {
-    // Constants are usually allocated in the beginning of the function.
-    // Lowering them to load from flash op leads to loading constants from flash
-    // occurring in the beginning of graph execution before other ops are
-    // executed, thereby needing a much larger tensor arena.
-    // We move the op to right before the user op (user op would be conv or
-    // lookup op etc, any op that is using the constant).
-    // This is so that when we lower to flatbuffer the loadOp will be located
-    // in the graph close to the user op.
-    Operation *ownerOp =
-        loadFlashOp->getResult(0).getUses().begin()->getOwner();
-    loadFlashOp->moveBefore(ownerOp);
-    return success();
-  }
-};
-
 void WriteFlashImage::runOnOperation() {
   func::FuncOp f = getOperation();
   if (flashImageFilenameOption.empty()) {
@@ -150,12 +144,19 @@ void WriteFlashImage::runOnOperation() {
   std::vector<std::vector<char>> tensorsVec;
   RewritePatternSet patterns(ctx);
   patterns.insert<WriteFlashImagePattern>(&tensorsVec, ctx);
-  patterns.insert<MoveLoadOpPattern>(ctx);
   (void)applyPatternsAndFoldGreedily(func, std::move(patterns));
 
+  if (tileLoadOption) {
+    if (failed(utils::writeTileServerDataToFile(flashImageFilenameOption,
+                                                tensorsVec))) {
+      f.emitError("Failed to write tile data!");
+      signalPassFailure();
+      return;
+    }
+  }
   // Write tensor data to flash image file
-  if (failed(
-          utils::writeFlashImageToFile(flashImageFilenameOption, tensorsVec))) {
+  else if (failed(utils::writeFlashImageToFile(flashImageFilenameOption,
+                                               tensorsVec))) {
     f.emitError("Failed to write flash image!");
     signalPassFailure();
     return;
diff --git a/xformer/Transforms/XCPatterns.td b/xformer/Transforms/XCPatterns.td
index d3784e5b4..ec38144c1 100644
--- a/xformer/Transforms/XCPatterns.td
+++ b/xformer/Transforms/XCPatterns.td
@@ -14,17 +14,28 @@ include "Utils/Utils.td"
 def getLookupTable
     : NativeCodeCall<"getLookupTable($_builder, $0.getDefiningOp())">;
 
-def getThreadCount : NativeCodeCall<"getThreadCount($_builder)">;
-
 foreach activationOp =
     [TFL_ReluOp, TFL_Relu6Op, TFL_TanhOp, TFL_LogisticOp, TFL_HardSwishOp] in {
 def:
   Pat<(activationOp
             : $output TensorOf<[QI8]>:$input),
             (XC_LookupOp $input, (Arith_ConstantOp (getLookupTable
-            $output)), (getThreadCount))>;
+            $output)))>;
 }
 
+def getActivationType
+    : NativeCodeCall<"getActivationType($_builder, $0.getDefiningOp())">;
+
+foreach activationOp = [TFL_EluOp, TFL_LogisticOp, TFL_TanhOp] in {
+def:
+  Pat<(activationOp
+            : $output TensorOf<[F32]>:$input),
+            (XC_Beta_ActivationF32Op $input, (getActivationType $output)), [(isBetaFloatEnabled)]>;
+}
+
+def : Pat<(TFL_ConcatenationOp $input, $axis, $faf),
+          (XC_Beta_ConcatF32Op $input), [(isBetaFloatEnabled)]>; // NOTE(review): $axis and $faf are dropped and $input has no type constraint - confirm
+
 def getPadValue : NativeCodeCall<"getPadValue($_builder, $0)">;
 
 def getPaddingPlan
diff --git a/xformer/Utils/FileIO.cpp b/xformer/Utils/FileIO.cpp
index 75be7b6d4..ec462c4c8 100644
--- a/xformer/Utils/FileIO.cpp
+++ b/xformer/Utils/FileIO.cpp
@@ -2,6 +2,7 @@
 // XMOS Public License: Version 1
 
 #include "Utils/FileIO.h"
+#include "Utils/TileRamSupport.h"
 
 #include "mlir/Support/FileUtilities.h"
 #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h"
@@ -9,6 +10,8 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ToolOutputFile.h"
 
+#include <iomanip>
+#include <sstream>
 namespace mlir {
 namespace xcore {
 namespace utils {
@@ -35,6 +38,39 @@ LogicalResult writeFlashImageToFile(const std::string &filename,
   return utils::writeDataToFile(filename, data);
 }
 
+/// Writes the tile RAM header followed by all tensor bytes to `filename` as
+/// a C header that defines the int8_t array `tile_server_weights`.
+LogicalResult
+writeTileServerDataToFile(const std::string &filename,
+                          std::vector<std::vector<char>> tensorsVec) {
+  // The header goes first so that it lands at the start of the array.
+  tensorsVec.insert(tensorsVec.begin(), utils::tileRamHeader());
+
+  std::ostringstream out;
+  // stdint.h is pulled in so the generated header compiles on its own.
+  out << R"(#ifndef TILESERVERGEN_H
+#define TILESERVERGEN_H
+
+#include <stdint.h>
+
+const int8_t tile_server_weights[] = {
+)";
+  int valuesOnLine = 0;
+  for (auto const &tensor : tensorsVec) {
+    for (const char value : tensor) {
+      // Go through signed char: plain char may be unsigned on the host,
+      // which would emit values outside the int8_t range.
+      out << static_cast<int>(static_cast<signed char>(value)) << ", ";
+      if (++valuesOnLine > 80) {
+        out << "\n";
+        valuesOnLine = 0;
+      }
+    }
+  }
+  out << "};\n\n#endif // TILESERVERGEN_H\n";
+  return utils::writeDataToFile(filename, out.str());
+}
+
 LogicalResult getFlatBufferStringFromMLIR(
     mlir::ModuleOp module, std::map<std::string, std::string> metadata,
     const bool &dontMinify, std::string &flatBufferString) {
diff --git a/xformer/Utils/FileIO.h b/xformer/Utils/FileIO.h
index 97353fa5e..2b2ea81f5 100644
--- a/xformer/Utils/FileIO.h
+++ b/xformer/Utils/FileIO.h
@@ -16,6 +16,10 @@ LogicalResult writeDataToFile(const std::string &filename, std::string data);
 LogicalResult writeFlashImageToFile(const std::string &filename,
                                     std::vector<std::vector<char>> tensorsVec);
 
+LogicalResult
+writeTileServerDataToFile(const std::string &filename,
+                          std::vector<std::vector<char>> tensorsVec);
+
 LogicalResult getFlatBufferStringFromMLIR(
     mlir::ModuleOp module, std::map<std::string, std::string> metadata,
     const bool &dontMinify, std::string &flatBufferString);
diff --git a/xformer/Utils/TileRamSupport.cpp b/xformer/Utils/TileRamSupport.cpp
new file mode 100644
index 000000000..e5c0297cb
--- /dev/null
+++ b/xformer/Utils/TileRamSupport.cpp
@@ -0,0 +1,27 @@
+#include "Utils/TileRamSupport.h"
+#include <stdint.h>
+#include <string.h>
+
+namespace mlir {
+namespace xcore {
+namespace utils {
+
+std::vector<char> tileRamHeader() {
+  // TODO: Change flash_t struct to mem_server_header_t
+  // The layout reuses the flash_t struct from lib_tflite_micro: one version
+  // integer followed by four parameter integers, 20 bytes altogether.
+  constexpr int kHeaderBytes = 20;
+  std::vector<char> bytes(kHeaderBytes, 0);
+  // Bytes 0-1 hold 1 and 2; bytes 2-3 hold their bitwise complements.
+  bytes[0] = 1;
+  bytes[1] = 2;
+  bytes[2] = ~1;
+  bytes[3] = ~2;
+  // Byte 8 records the header size.
+  bytes[8] = kHeaderBytes;
+  return bytes;
+}
+
+} // namespace utils
+} // namespace xcore
+} // namespace mlir
\ No newline at end of file
diff --git a/xformer/Utils/TileRamSupport.h b/xformer/Utils/TileRamSupport.h
new file mode 100644
index 000000000..0d452c08d
--- /dev/null
+++ b/xformer/Utils/TileRamSupport.h
@@ -0,0 +1,20 @@
+#ifndef XFORMER_UTILS_TILESUPPORT_H
+#define XFORMER_UTILS_TILESUPPORT_H
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace mlir {
+namespace xcore {
+namespace utils {
+
+/** Builds the 20-byte header that precedes the tile server weight data.
+ */
+std::vector<char> tileRamHeader();
+
+} // namespace utils
+} // namespace xcore
+} // namespace mlir
+
+#endif // XFORMER_UTILS_TILESUPPORT_H
\ No newline at end of file
diff --git a/xformer/Utils/Utils.td b/xformer/Utils/Utils.td
index a3d348770..7a56a77a6 100644
--- a/xformer/Utils/Utils.td
+++ b/xformer/Utils/Utils.td
@@ -3,6 +3,8 @@
 
 // Utility predicates that are shared by multiple passes.
 
+def isBetaFloatEnabled : Constraint<CPred<"isBetaFloatEnabled()">>;
+
 // Check that bytes per pixel is a multiple of n
 class HasMultipleOfNBytesPerPixel<int n>
     : Constraint<CPred<
diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp
index 4a232419d..b1f7ae2e0 100644
--- a/xformer/XCoreOptMain.cpp
+++ b/xformer/XCoreOptMain.cpp
@@ -34,6 +34,10 @@ namespace xcore {
 // and -help) will be hidden.
 static cl::OptionCategory XformerCategory("Xformer options");
 
+cl::opt<bool> enableBetaFloatOption("xcore-enable-beta-float",
+                                    cl::desc("Enable beta float support."),
+                                    cl::init(false), cl::cat(XformerCategory));
+
 cl::opt<unsigned> threadCountOption("xcore-thread-count",
                                     cl::desc("[-tc] Thread count"), cl::init(1),
                                     cl::cat(XformerCategory));
@@ -51,6 +55,10 @@ cl::alias aliasFlashImageOption("f",
                                 cl::desc("Alias for --xcore-flash-image-file"),
                                 cl::aliasopt(flashImageFilenameOption));
 
+cl::opt<bool> tileLoadOption("xcore-load-tile",
+                             cl::desc("Enable loading weights from a tile."),
+                             cl::init(false), cl::cat(XformerCategory));
+
 cl::opt<unsigned> loadExternallyIfLargerOption(
     "xcore-load-externally-if-larger",
     cl::desc("Load constants externally if larger than given limit in bytes "
@@ -58,6 +66,13 @@ cl::opt<unsigned> loadExternallyIfLargerOption(
              "xcore-flash-image-file is not provided."),
     cl::init(96), cl::cat(XformerCategory), cl::Hidden);
 
+cl::opt<unsigned> maxLoadExternalSizeOption(
+    "xcore-max-load-external-size",
+    cl::desc("The size of external load image from flash or tile will be "
+             "limited to the max specified bytes "
+             "(default = UINT_MAX bytes)."),
+    cl::init(UINT_MAX), cl::cat(XformerCategory), cl::Hidden);
+
 // This option is to provide an error threshold.
 // The maximum average error between the reference and quantised
 // implementations of the output transform over each channel is used to decide
@@ -277,6 +292,13 @@ int main(int argc, char **argv) {
   };
 
   // Validate options
+  if (mlir::xcore::tileLoadOption.getNumOccurrences() > 0 &&
+      mlir::xcore::threadCountOption < 4) {
+    return failedMessage("Please specify at least four threads using "
+                         "xcore-thread-count option when using the "
+                         "xcore-load-tile option!");
+  }
+
   if (mlir::xcore::loadExternallyIfLargerOption.getNumOccurrences() > 0 &&
       mlir::xcore::flashImageFilenameOption.empty()) {
     return failedMessage(
@@ -289,7 +311,7 @@ int main(int argc, char **argv) {
        !(mlir::xcore::opSplitTopOpsOption.empty()) ||
        !(mlir::xcore::opSplitNumSplitsOption.empty()))) {
     return failedMessage(
-        "target size option cannot be used with start, end, and "
+        "Target size option cannot be used with start, end, and "
         "numSplits options");
   }
 
diff --git a/xformer/lib_tflite_micro.BUILD b/xformer/lib_tflite_micro.BUILD
index 8d8b1948b..11a06f374 100644
--- a/xformer/lib_tflite_micro.BUILD
+++ b/xformer/lib_tflite_micro.BUILD
@@ -36,7 +36,9 @@ filegroup(
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_3_to_4.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_strided_slice.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_mul.cc",
+        "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_activationf32.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_convf32.cc",
+        "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_concatf32.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_transposeconvf32.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c",