From 94fc870751490c88a77bf033854b89351ecc2b7e Mon Sep 17 00:00:00 2001 From: panickal-xmos Date: Wed, 25 Oct 2023 21:28:44 +0100 Subject: [PATCH] Add beta float support --- xformer/IR/XCoreOps.td | 27 +++++++++- .../ApplyLoadConstantOpPatterns.cpp | 6 +++ xformer/Transforms/ApplyXCPatterns.cpp | 15 +++++- xformer/Transforms/ConvPatterns.td | 20 ++++---- xformer/Transforms/Options.h | 3 ++ xformer/Transforms/ReplaceConv2D.cpp | 2 + xformer/Transforms/TranslateToCustomOp.cpp | 18 ++++--- xformer/Transforms/WriteFlashImage.cpp | 49 ++++++++++--------- xformer/Transforms/XCPatterns.td | 17 +++++-- xformer/Utils/FileIO.cpp | 36 ++++++++++++++ xformer/Utils/FileIO.h | 4 ++ xformer/Utils/TileRamSupport.cpp | 27 ++++++++++ xformer/Utils/TileRamSupport.h | 20 ++++++++ xformer/Utils/Utils.td | 2 + xformer/XCoreOptMain.cpp | 24 ++++++++- xformer/lib_tflite_micro.BUILD | 2 + 16 files changed, 224 insertions(+), 48 deletions(-) create mode 100644 xformer/Utils/TileRamSupport.cpp create mode 100644 xformer/Utils/TileRamSupport.h diff --git a/xformer/IR/XCoreOps.td b/xformer/IR/XCoreOps.td index ec7a918bc..17414147f 100644 --- a/xformer/IR/XCoreOps.td +++ b/xformer/IR/XCoreOps.td @@ -132,6 +132,31 @@ def XC_MulOp : XC_Op<"mul", [Pure]> { let results = (outs TensorOf<[QI8]> : $output); } +def XC_Beta_ActivationF32Op : XC_Op<"beta_activationf32", [Pure]> { + let summary = "Beta ActivationF32 op"; + + let description = [{Beta ActivationF32 op.}]; + + let arguments = (ins + TensorOf<[F32]>:$input, + I32Attr:$type + ); + + let results = (outs TensorOf<[F32]> : $output); +} + +def XC_Beta_ConcatF32Op : XC_Op<"beta_concatf32", [Pure]> { + let summary = "Beta ConcatF32 op"; + + let description = [{Beta ConcatF32 op.}]; + + let arguments = (ins + Variadic>:$input + ); + + let results = (outs TensorOf<[F32]> : $output); +} + def XC_Beta_ConvF32Op : XC_Op<"beta_convf32", [Pure]> { let summary = "Beta ConvF32 op"; @@ -248,7 +273,7 @@ def XC_LookupOp : XC_Op<"lookup", [Pure]> { let description = [{Lookup table op.}]; - let arguments = (ins TensorOf<[QI8]> : $input, TensorOf<[I8]> : $lut, I32Attr : $thread_count); + let arguments = (ins TensorOf<[QI8]> : $input, TensorOf<[I8]> : $lut); let results = (outs TensorOf<[QI8]> : $output); } diff --git a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp index 1c14290f6..27c7ac50f 100644 --- a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp +++ b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp @@ -30,7 +30,12 @@ struct ApplyLoadConstantOpPatterns void runOnOperation() override; }; +static int totalSize_ = 0; + bool shouldBeLoadedExternally(Attribute values) { + if (totalSize_ > maxLoadExternalSizeOption) { + return false; + } // values might be UnitAttr or BoolAttr which are too small to be loaded // externally anyway auto totalSizeInBits = 0; @@ -40,6 +45,7 @@ bool shouldBeLoadedExternally(Attribute values) { (valuesAttr.getNumElements() * valuesAttr.getType().getElementType().getIntOrFloatBitWidth()); } + totalSize_ += totalSizeInBits / CHAR_BIT; return totalSizeInBits / CHAR_BIT > loadExternallyIfLargerOption; } diff --git a/xformer/Transforms/ApplyXCPatterns.cpp b/xformer/Transforms/ApplyXCPatterns.cpp index e6e484cec..9b7bc57cd 100644 --- a/xformer/Transforms/ApplyXCPatterns.cpp +++ b/xformer/Transforms/ApplyXCPatterns.cpp @@ -30,6 +30,8 @@ struct ApplyXCPatterns void runOnOperation() override; }; +bool isBetaFloatEnabled() { return enableBetaFloatOption; } + StringAttr getPaddingPlan(PatternRewriter &rewriter, TFL::PadOp padOp) { DenseIntElementsAttr paddingAttr; if (!matchPattern(padOp.getPadding(), m_Constant(&paddingAttr))) { @@ -83,8 +85,17 @@ IntegerAttr getPadValue(PatternRewriter &rewriter, Value inputVal) { return rewriter.getI32IntegerAttr(padValue); } -IntegerAttr getThreadCount(PatternRewriter &rewriter) { - return rewriter.getI32IntegerAttr(threadCountOption); +IntegerAttr getActivationType(PatternRewriter &rewriter, Operation *op) { + // TODO: Refactor to use shared header file for enum + if (isa(op)) { + return rewriter.getI32IntegerAttr(0); + } else if (isa(op)) { + return rewriter.getI32IntegerAttr(1); + } else if (isa(op)) { + return rewriter.getI32IntegerAttr(2); + } else { + llvm_unreachable("Unsupported op!"); + } } DenseElementsAttr getLookupTable(PatternRewriter &rewriter, Operation *op) { diff --git a/xformer/Transforms/ConvPatterns.td b/xformer/Transforms/ConvPatterns.td index fcfdc18f7..9cbd0ced8 100644 --- a/xformer/Transforms/ConvPatterns.td +++ b/xformer/Transforms/ConvPatterns.td @@ -42,26 +42,26 @@ Pat<(TFL_DepthwiseConv2DOp: $output TensorOf<[QI8]>:$input, TensorOf<[QI8]>:$f, (IsConstOp $f), ]>; -// TODO: Special case, we only optimize conv with filter width 5, filter height -// 2, and stride height 3 +// Special case, we only optimize conv with filter width 3, filter height +// 2, and stride height 2 def Hasfw5fh2 : Constraint().getRank() == 4 && " - "$0.getType().cast().getDimSize(1) == 5 && " + "$0.getType().cast().getDimSize(1) == 3 && " "$0.getType().cast().getDimSize(2) == 2">>; // F32 TFL_Conv2D() -> XC_Beta_ConvF32() def : -Pat<(TFL_Conv2DOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, TensorOf<[F32]>:$b, $dh, $dw, $faf, $wf, ConstantAttr, ConstantAttr), +Pat<(TFL_Conv2DOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, TensorOf<[F32]>:$b, $dh, $dw, $faf, $wf, ConstantAttr, ConstantAttr), (XC_Beta_ConvF32Op $input, $f, $b), - [(Hasfw5fh2 $f)]>; + [(Hasfw5fh2 $f), (isBetaFloatEnabled)]>; -// F32 TFL_TransposeConv2D() -> XC_Beta_TransposeConvF32() +// // F32 TFL_TransposeConv2D() -> XC_Beta_TransposeConvF32() def : -Pat<(TFL_TransposeConvOp: $output $outshape, TensorOf<[F32]>:$f, TensorOf<[F32]>:$input, TensorOf<[F32]>:$b, $wf, ConstantAttr, ConstantAttr, $faf), +Pat<(TFL_TransposeConvOp: $output $outshape, TensorOf<[F32]>:$f, TensorOf<[F32]>:$input, TensorOf<[F32]>:$b, $wf, ConstantAttr, ConstantAttr, $faf), (XC_Beta_TransposeConvF32Op $input, $f, $b), - [(Hasfw5fh2 $f)]>; + [(Hasfw5fh2 $f), (isBetaFloatEnabled)]>; -// F32 TFL_FullyConnected() -> XC_Beta_FcF32() +// // F32 TFL_FullyConnected() -> XC_Beta_FcF32() def : Pat<(TFL_FullyConnectedOp: $output TensorOf<[F32]>:$input, TensorOf<[F32]>:$f, $b, $faf, $wf, $knd, $aqi), - (XC_Beta_FcF32Op $input, $f)>; + (XC_Beta_FcF32Op $input, $f), [(isBetaFloatEnabled)]>; diff --git a/xformer/Transforms/Options.h b/xformer/Transforms/Options.h index 5dbf94c7f..096d8ef2d 100644 --- a/xformer/Transforms/Options.h +++ b/xformer/Transforms/Options.h @@ -9,9 +9,12 @@ namespace mlir { namespace xcore { +extern llvm::cl::opt enableBetaFloatOption; extern llvm::cl::opt threadCountOption; extern llvm::cl::opt flashImageFilenameOption; extern llvm::cl::opt loadExternallyIfLargerOption; +extern llvm::cl::opt tileLoadOption; +extern llvm::cl::opt maxLoadExternalSizeOption; extern llvm::cl::opt convQuantErrorThresholdOption; extern llvm::cl::opt convForceErrorCheckOption; extern llvm::cl::opt convMultiplierFactorOption; diff --git a/xformer/Transforms/ReplaceConv2D.cpp b/xformer/Transforms/ReplaceConv2D.cpp index 66349bf7a..47ed454b0 100644 --- a/xformer/Transforms/ReplaceConv2D.cpp +++ b/xformer/Transforms/ReplaceConv2D.cpp @@ -157,6 +157,8 @@ struct ReplaceConv2D void runOnOperation() override; }; +bool isBetaFloatEnabled() { return enableBetaFloatOption; } + namespace convpatterns { #include "Transforms/GeneratedConvPatterns.inc" } diff --git a/xformer/Transforms/TranslateToCustomOp.cpp b/xformer/Transforms/TranslateToCustomOp.cpp index 2e459b3e5..12550e927 100644 --- a/xformer/Transforms/TranslateToCustomOp.cpp +++ b/xformer/Transforms/TranslateToCustomOp.cpp @@ -14,18 +14,20 @@ namespace mlir { namespace xcore { std::vector Bsign8Op::buildCustomOptions() { return {}; } -std::vector Beta_ConvF32Op::buildCustomOptions() { return {}; } -std::vector Beta_TransposeConvF32Op::buildCustomOptions() { - return {}; -} -std::vector Beta_FcF32Op::buildCustomOptions() { return {}; } -std::vector LookupOp::buildCustomOptions() { +std::vector Beta_ActivationF32Op::buildCustomOptions() { flexbuffers::Builder fbb; - fbb.Map([&]() { fbb.Int("tc", (int32_t)getThreadCount()); }); + fbb.Map([&]() { fbb.Int("type", (int32_t)getType()); }); fbb.Finish(); return fbb.GetBuffer(); } +std::vector Beta_ConcatF32Op::buildCustomOptions() { return {}; } +std::vector Beta_ConvF32Op::buildCustomOptions() { return {}; } +std::vector Beta_TransposeConvF32Op::buildCustomOptions() { + return {}; +} +std::vector Beta_FcF32Op::buildCustomOptions() { return {}; } +std::vector LookupOp::buildCustomOptions() { return {}; } std::vector AddOp::buildCustomOptions() { flexbuffers::Builder fbb; @@ -176,6 +178,8 @@ void TranslateToCustomOp::runOnOperation() { patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); + patterns.insert>(ctx); + patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); diff --git a/xformer/Transforms/WriteFlashImage.cpp b/xformer/Transforms/WriteFlashImage.cpp index cef8e616e..5fc819482 100644 --- a/xformer/Transforms/WriteFlashImage.cpp +++ b/xformer/Transforms/WriteFlashImage.cpp @@ -4,6 +4,7 @@ #include "IR/XCoreOps.h" #include "Transforms/Options.h" #include "Utils/FileIO.h" +#include "Utils/TileRamSupport.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/FileUtilities.h" @@ -94,6 +95,8 @@ struct WriteFlashImagePattern : public OpRewritePattern { for (int i = 0; i < opNums.size(); i++) { ownerOp->setOperand(opNums[i], loadFlashOp.getResult(i)); } + + loadFlashOp->moveBefore(ownerOp); loadOp.erase(); } else { std::vector loadOpData = getTensorData(loadOp); @@ -103,6 +106,18 @@ struct WriteFlashImagePattern : public OpRewritePattern { loadOp.getLoc(), loadOp.getType(), address, rewriter.getArrayAttr(dataSizes)); rewriter.replaceOp(loadOp, loadFlashOp.getOutput()); + + // Find all uses of loadFlashOp and find the first Owner op + // so that we can move the loading to just before that op. + mlir::Operation *firstOwnerOp = + loadFlashOp->getResult(0).getUses().begin()->getOwner(); + for (const mlir::OpOperand &use : loadFlashOp->getResult(0).getUses()) { + mlir::Operation *op = use.getOwner(); + if (op->isBeforeInBlock(firstOwnerOp)) { + firstOwnerOp = op; + } + } + loadFlashOp->moveBefore(firstOwnerOp); } tensorsVec_->push_back(tensorData); @@ -114,27 +129,6 @@ struct WriteFlashImagePattern : public OpRewritePattern { std::vector> *tensorsVec_; }; -struct MoveLoadOpPattern : public OpRewritePattern { - MoveLoadOpPattern(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(LoadFlashOp loadFlashOp, - PatternRewriter &rewriter) const override { - // Constants are usually allocated in the beginning of the function. - // Lowering them to load from flash op leads to loading constants from flash - // occurring in the beginning of graph execution before other ops are - // executed, thereby needing a much larger tensor arena. - // We move the op to right before the user op (user op would be conv or - // lookup op etc, any op that is using the constant). - // This is so that when we lower to flatbuffer the loadOp will be located - // in the graph close to the user op. - Operation *ownerOp = - loadFlashOp->getResult(0).getUses().begin()->getOwner(); - loadFlashOp->moveBefore(ownerOp); - return success(); - } -}; - void WriteFlashImage::runOnOperation() { func::FuncOp f = getOperation(); if (flashImageFilenameOption.empty()) { @@ -150,12 +144,19 @@ void WriteFlashImage::runOnOperation() { std::vector> tensorsVec; RewritePatternSet patterns(ctx); patterns.insert(&tensorsVec, ctx); - patterns.insert(ctx); (void)applyPatternsAndFoldGreedily(func, std::move(patterns)); + if (tileLoadOption) { + if (failed(utils::writeTileServerDataToFile(flashImageFilenameOption, + tensorsVec))) { + f.emitError("Failed to write tile data!"); + signalPassFailure(); + return; + } + } // Write tensor data to flash image file - if (failed( - utils::writeFlashImageToFile(flashImageFilenameOption, tensorsVec))) { + else if (failed(utils::writeFlashImageToFile(flashImageFilenameOption, + tensorsVec))) { f.emitError("Failed to write flash image!"); signalPassFailure(); return; diff --git a/xformer/Transforms/XCPatterns.td b/xformer/Transforms/XCPatterns.td index d3784e5b4..ec38144c1 100644 --- a/xformer/Transforms/XCPatterns.td +++ b/xformer/Transforms/XCPatterns.td @@ -14,17 +14,28 @@ include "Utils/Utils.td" def getLookupTable : NativeCodeCall<"getLookupTable($_builder, $0.getDefiningOp())">; -def getThreadCount : NativeCodeCall<"getThreadCount($_builder)">; - foreach activationOp = [TFL_ReluOp, TFL_Relu6Op, TFL_TanhOp, TFL_LogisticOp, TFL_HardSwishOp] in { def: Pat<(activationOp : $output TensorOf<[QI8]>:$input), (XC_LookupOp $input, (Arith_ConstantOp (getLookupTable - $output)), (getThreadCount))>; + $output)))>; } +def getActivationType + : NativeCodeCall<"getActivationType($_builder, $0.getDefiningOp())">; + +foreach activationOp = [TFL_EluOp, TFL_LogisticOp, TFL_TanhOp] in { +def: + Pat<(activationOp + : $output TensorOf<[F32]>:$input), + (XC_Beta_ActivationF32Op $input, (getActivationType $output)), [(isBetaFloatEnabled)]>; +} + +def : Pat<(TFL_ConcatenationOp $input, $axis, $faf), + (XC_Beta_ConcatF32Op $input), [(isBetaFloatEnabled)]>; + def getPadValue : NativeCodeCall<"getPadValue($_builder, $0)">; def getPaddingPlan diff --git a/xformer/Utils/FileIO.cpp b/xformer/Utils/FileIO.cpp index 75be7b6d4..ec462c4c8 100644 --- a/xformer/Utils/FileIO.cpp +++ b/xformer/Utils/FileIO.cpp @@ -2,6 +2,7 @@ // XMOS Public License: Version 1 #include "Utils/FileIO.h" +#include "Utils/TileRamSupport.h" #include "mlir/Support/FileUtilities.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" @@ -9,6 +10,8 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" +#include + namespace mlir { namespace xcore { namespace utils { @@ -35,6 +38,39 @@ LogicalResult writeFlashImageToFile(const std::string &filename, return utils::writeDataToFile(filename, data); } +LogicalResult +writeTileServerDataToFile(const std::string &filename, + std::vector> tensorsVec) { + // Add header + auto tileHeader = utils::tileRamHeader(); + tensorsVec.insert(tensorsVec.begin(), tileHeader); + + std::ostringstream out; + out << R"(#ifndef TILESERVERGEN_H +#define TILESERVERGEN_H + +const int8_t tile_server_weights[] = { +)"; + int lineEnding = 0; + for (auto const &tensor : tensorsVec) { + for (auto const &i : tensor) { + out << (int)i << ", "; + lineEnding++; + if (lineEnding > 80) { + out << "\n"; + lineEnding = 0; + } + } + } + + out << R"(}; + +#endif // TILESERVERGEN_H +)"; + + return utils::writeDataToFile(filename, out.str()); +} + LogicalResult getFlatBufferStringFromMLIR( mlir::ModuleOp module, std::map metadata, const bool &dontMinify, std::string &flatBufferString) { diff --git a/xformer/Utils/FileIO.h b/xformer/Utils/FileIO.h index 97353fa5e..2b2ea81f5 100644 --- a/xformer/Utils/FileIO.h +++ b/xformer/Utils/FileIO.h @@ -16,6 +16,10 @@ LogicalResult writeDataToFile(const std::string &filename, std::string data); LogicalResult writeFlashImageToFile(const std::string &filename, std::vector> tensorsVec); +LogicalResult +writeTileServerDataToFile(const std::string &filename, + std::vector> tensorsVec); + LogicalResult getFlatBufferStringFromMLIR( mlir::ModuleOp module, std::map metadata, const bool &dontMinify, std::string &flatBufferString); diff --git a/xformer/Utils/TileRamSupport.cpp b/xformer/Utils/TileRamSupport.cpp new file mode 100644 index 000000000..e5c0297cb --- /dev/null +++ b/xformer/Utils/TileRamSupport.cpp @@ -0,0 +1,27 @@ +#include "Utils/TileRamSupport.h" +#include +#include + +namespace mlir { +namespace xcore { +namespace utils { + +std::vector tileRamHeader() { + // TODO: Change flash_t struct to mem_server_header_t + // We are reusing the flash_t struct in lib_tflite_micro as the header + // The header version is stored as one integer + // There are four parameter integers in the flash_t struct + // Altogether 20 bytes + constexpr int headerSize = 20; + std::vector header(headerSize, 0); + header[0] = 1; + header[1] = 2; + header[2] = ~1; + header[3] = ~2; + header[8] = headerSize; + return header; +} + +} // namespace utils +} // namespace xcore +} // namespace mlir \ No newline at end of file diff --git a/xformer/Utils/TileRamSupport.h b/xformer/Utils/TileRamSupport.h new file mode 100644 index 000000000..0d452c08d --- /dev/null +++ b/xformer/Utils/TileRamSupport.h @@ -0,0 +1,20 @@ +#ifndef XFORMER_UTILS_TILESUPPORT_H +#define XFORMER_UTILS_TILESUPPORT_H + +#include +#include +#include + +namespace mlir { +namespace xcore { +namespace utils { + +/** Function that creates a tile_ram_header + */ +std::vector tileRamHeader(); + +} // namespace utils +} // namespace xcore +} // namespace mlir + +#endif // XFORMER_UTILS_TILESUPPORT_H \ No newline at end of file diff --git a/xformer/Utils/Utils.td b/xformer/Utils/Utils.td index a3d348770..7a56a77a6 100644 --- a/xformer/Utils/Utils.td +++ b/xformer/Utils/Utils.td @@ -3,6 +3,8 @@ // Utility predicates that are shared by multiple passes. +def isBetaFloatEnabled : Constraint>; + // Check that bytes per pixel is a multiple of n class HasMultipleOfNBytesPerPixel : Constraint enableBetaFloatOption("xcore-enable-beta-float", + cl::desc("Enable beta float support."), + cl::init(false), cl::cat(XformerCategory)); + cl::opt threadCountOption("xcore-thread-count", cl::desc("[-tc] Thread count"), cl::init(1), cl::cat(XformerCategory)); @@ -51,6 +55,10 @@ cl::alias aliasFlashImageOption("f", cl::desc("Alias for --xcore-flash-image-file"), cl::aliasopt(flashImageFilenameOption)); +cl::opt tileLoadOption("xcore-load-tile", + cl::desc("Enable loading weights from a tile."), + cl::init(false), cl::cat(XformerCategory)); + cl::opt loadExternallyIfLargerOption( "xcore-load-externally-if-larger", cl::desc("Load constants externally if larger than given limit in bytes " @@ -58,6 +66,13 @@ cl::opt loadExternallyIfLargerOption( "xcore-flash-image-file is not provided."), cl::init(96), cl::cat(XformerCategory), cl::Hidden); +cl::opt maxLoadExternalSizeOption( + "xcore-max-load-external-size", + cl::desc("The size of external load image from flash or tile will be " + "limited to the max specified bytes " + "(default = UINT_MAX bytes)."), + cl::init(UINT_MAX), cl::cat(XformerCategory), cl::Hidden); + // This option is to provide an error threshold. // The maximum average error between the reference and quantised // implementations of the output transform over each channel is used to decide @@ -277,6 +292,13 @@ int main(int argc, char **argv) { }; // Validate options + if (mlir::xcore::tileLoadOption.getNumOccurrences() > 0 && + mlir::xcore::threadCountOption < 4) { + return failedMessage("Please specify at least four threads using " + "xcore-thread-count option when using the " + "xcore-load-tile option!"); + } + if (mlir::xcore::loadExternallyIfLargerOption.getNumOccurrences() > 0 && mlir::xcore::flashImageFilenameOption.empty()) { return failedMessage( @@ -289,7 +311,7 @@ int main(int argc, char **argv) { !(mlir::xcore::opSplitTopOpsOption.empty()) || !(mlir::xcore::opSplitNumSplitsOption.empty()))) { return failedMessage( - "target size option cannot be used with start, end, and " + "Target size option cannot be used with start, end, and " "numSplits options"); } diff --git a/xformer/lib_tflite_micro.BUILD b/xformer/lib_tflite_micro.BUILD index 8d8b1948b..11a06f374 100644 --- a/xformer/lib_tflite_micro.BUILD +++ b/xformer/lib_tflite_micro.BUILD @@ -36,7 +36,9 @@ filegroup( "lib_tflite_micro/src/tflite-xcore-kernels/xcore_3_to_4.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_strided_slice.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_mul.cc", + "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_activationf32.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_convf32.cc", + "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_concatf32.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_transposeconvf32.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_beta_fcf32.cc", "lib_tflite_micro/src/tflite-xcore-kernels/conv2d_float.c",