From a2a3519ae07ea406cee0e72e5927131993dd027d Mon Sep 17 00:00:00 2001 From: panickal-xmos Date: Mon, 29 Jul 2024 15:49:02 +0100 Subject: [PATCH] Change ld flash to ld weights and add alignment for weights in DDR --- xformer/IR/XCoreOps.td | 11 ++- xformer/Test/invalid-loadconstantop.mlir | 2 +- xformer/Test/invalid-loadflashop.mlir | 4 +- .../{loadflashop.mlir => loadweightsop.mlir} | 6 +- .../ApplyLoadConstantOpPatterns.cpp | 2 +- xformer/Transforms/Passes.cpp | 2 +- xformer/Transforms/TranslateToCustomOp.cpp | 5 +- xformer/Transforms/WriteWeights.cpp | 64 ++++++------ xformer/Utils/FileIO.cpp | 97 ++++++++++--------- xformer/Utils/FileIO.h | 9 +- xformer/Utils/TileRamSupport.cpp | 2 +- xformer/Utils/TileRamSupport.h | 2 +- xformer/XCoreOptMain.cpp | 30 +++--- xformer/lib_tflite_micro.BUILD | 2 +- 14 files changed, 122 insertions(+), 116 deletions(-) rename xformer/Test/{loadflashop.mlir => loadweightsop.mlir} (98%) diff --git a/xformer/IR/XCoreOps.td b/xformer/IR/XCoreOps.td index eaa6a247d..4b302ef26 100644 --- a/xformer/IR/XCoreOps.td +++ b/xformer/IR/XCoreOps.td @@ -453,12 +453,15 @@ def XC_LoadConstantOp let results = (outs AnyTensor : $output); } -def XC_LoadFlashOp : XC_Op<"ld_flash", [Pure]> { - let summary = "Load from flash op"; +def XC_LoadWeightsOp : XC_Op<"ld_weights", [Pure]> { + let summary = "Load weights op"; - let description = [{Load from flash op.}]; + let description = [{Load weights op.}]; - let arguments = (ins I32Attr : $address, I32ArrayAttr : $sizes); + let arguments = (ins I32Attr + : $address, I32ArrayAttr + : $sizes, BoolAttr + : $in_ddr); let results = (outs Variadic : $output); } diff --git a/xformer/Test/invalid-loadconstantop.mlir b/xformer/Test/invalid-loadconstantop.mlir index a6b1535f3..782952568 100644 --- a/xformer/Test/invalid-loadconstantop.mlir +++ b/xformer/Test/invalid-loadconstantop.mlir @@ -1,6 +1,6 @@ // RUN: xcore-opt --mlir-io %s --xcore-apply-loadconstantop-patterns -verify-diagnostics -// expected-error@+1 {{Flash 
image file option should be provided to run this pass!}} +// expected-error@+1 {{Weights file option should be provided to run this pass!}} func.func @invalid(%arg0: tensor>) -> tensor> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} { %cst = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]> : tensor<10xi8> %cst_0 = arith.constant dense<[[11, 12, 13, 14, 15, 16, 17, 18, 19, 10]]> : tensor<1x10xi16> diff --git a/xformer/Test/invalid-loadflashop.mlir b/xformer/Test/invalid-loadflashop.mlir index 3ba4eb258..8495a387b 100644 --- a/xformer/Test/invalid-loadflashop.mlir +++ b/xformer/Test/invalid-loadflashop.mlir @@ -1,6 +1,6 @@ -// RUN: xcore-opt --mlir-io %s --xcore-write-flash-image -verify-diagnostics +// RUN: xcore-opt --mlir-io %s --xcore-write-weights -verify-diagnostics -// expected-error@+1 {{Flash image file option should be provided to run this pass!}} +// expected-error@+1 {{Weights file option should be provided to run this pass!}} func.func @valid(%arg0: tensor>) -> tensor> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} { %cst = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]> : tensor<10xi8> %cst_0 = arith.constant dense<[[11, 12, 13, 14, 15, 16, 17, 18, 19, 10]]> : tensor<1x10xi16> diff --git a/xformer/Test/loadflashop.mlir b/xformer/Test/loadweightsop.mlir similarity index 98% rename from xformer/Test/loadflashop.mlir rename to xformer/Test/loadweightsop.mlir index 011abbcd5..a57257912 100644 --- a/xformer/Test/loadflashop.mlir +++ b/xformer/Test/loadweightsop.mlir @@ -1,4 +1,4 @@ -// RUN: xcore-opt --mlir-io %s --xcore-write-flash-image --xcore-weights-file=/dev/null | FileCheck %s +// RUN: xcore-opt --mlir-io %s --xcore-write-weights --xcore-weights-file=/dev/null | FileCheck %s // CHECK-LABEL: valid func.func @valid(%arg0: tensor>) -> tensor> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} { @@ -10,8 +10,8 @@ func.func @valid(%arg0: tensor 
none %0 = "tfl.reshape"(%arg0, %cst_3) : (tensor>, tensor<2xi32>) -> tensor> %1 = "tfl.reshape"(%0, %cst_1) : (tensor>, tensor<4xi64>) -> tensor> - // CHECK: xc.ld_flash - // CHECK-NOT: xc.ld_flash + // CHECK: xc.ld_weights + // CHECK-NOT: xc.ld_weights // CHECK: xc.conv2d_v2 %2 = "xc.ld_constant"(%cst) : (tensor<10xi8>) -> tensor<10xi8> %3 = "xc.ld_constant"(%cst_0) : (tensor<1x10xi16>) -> tensor<1x10xi16> diff --git a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp index b203d4569..91b09748d 100644 --- a/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp +++ b/xformer/Transforms/ApplyLoadConstantOpPatterns.cpp @@ -62,7 +62,7 @@ bool isNotUsedByLoadConstantOp(Value result) { void ApplyLoadConstantOpPatterns::runOnOperation() { func::FuncOp f = getOperation(); if (weightsFilenameOption.empty()) { - f.emitError("Flash image file option should be provided to run this pass!"); + f.emitError("Weights file option should be provided to run this pass!"); signalPassFailure(); return; } diff --git a/xformer/Transforms/Passes.cpp b/xformer/Transforms/Passes.cpp index d0cad10e2..f87f1facc 100644 --- a/xformer/Transforms/Passes.cpp +++ b/xformer/Transforms/Passes.cpp @@ -46,7 +46,7 @@ void buildXCoreRemainingPassPipeline(OpPassManager &pm) { pm.addPass(createReplaceBroadcastPass()); pm.addPass(createReplaceConcatPass()); pm.addPass(createApplyXCPatternsPass()); - // Add to pipeline only if flash image file option is provided + // Add to pipeline only if weights file option is provided if (!weightsFilenameOption.empty()) { pm.addPass(createApplyLoadConstantOpPatternsPass()); pm.addPass(createWriteWeightsPass()); diff --git a/xformer/Transforms/TranslateToCustomOp.cpp b/xformer/Transforms/TranslateToCustomOp.cpp index 7672418cb..5a4ba6eef 100644 --- a/xformer/Transforms/TranslateToCustomOp.cpp +++ b/xformer/Transforms/TranslateToCustomOp.cpp @@ -121,7 +121,7 @@ std::vector ConcatOp::buildCustomOptions() { return 
fbb.GetBuffer(); } -std::vector LoadFlashOp::buildCustomOptions() { +std::vector LoadWeightsOp::buildCustomOptions() { flexbuffers::Builder fbb; auto rootMap = fbb.StartMap(); fbb.Int("addr", (int32_t)getAddress()); @@ -130,6 +130,7 @@ std::vector LoadFlashOp::buildCustomOptions() { fbb.Int(getSizes().cast()[i].cast().getInt()); } fbb.EndVector(sizesVec, false, false); + fbb.Bool("ddr", (bool)getInDdr()); fbb.EndMap(rootMap); fbb.Finish(); return fbb.GetBuffer(); @@ -244,7 +245,7 @@ void TranslateToCustomOp::runOnOperation() { patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); - patterns.insert>(ctx); + patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); patterns.insert>(ctx); diff --git a/xformer/Transforms/WriteWeights.cpp b/xformer/Transforms/WriteWeights.cpp index e832cb636..338be6389 100644 --- a/xformer/Transforms/WriteWeights.cpp +++ b/xformer/Transforms/WriteWeights.cpp @@ -15,7 +15,7 @@ namespace mlir::xcore { namespace { -// Write flash image +// Write weights to a file struct WriteWeights : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WriteWeights) @@ -23,8 +23,8 @@ struct WriteWeights void getDependentDialects(DialectRegistry &registry) const final { registry.insert(); } - StringRef getArgument() const final { return "xcore-write-flash-image"; } - StringRef getDescription() const final { return "Write flash image"; } + StringRef getArgument() const final { return "xcore-write-weights"; } + StringRef getDescription() const final { return "Write weights"; } void runOnOperation() override; }; @@ -66,7 +66,11 @@ struct WriteWeightsPattern : public OpRewritePattern { address += t.size(); } - if (loadOp.getResult().hasOneUse()) { + // We try to combine loads to one op if the load has only one use and if the + // load is not from external memory.
+ // External memory loads have to be aligned to 32 bytes/256 bits for max + // speed + if (loadOp.getResult().hasOneUse() && !weightsInExternalMemory) { auto use = loadOp->use_begin(); Operation *ownerOp = use->getOwner(); @@ -87,36 +91,43 @@ struct WriteWeightsPattern : public OpRewritePattern { } } - auto loadFlashOp = - rewriter.create(loadOp.getLoc(), outputTypes, address, - rewriter.getArrayAttr(dataSizes)); + auto loadWeightsOp = rewriter.create( + loadOp.getLoc(), outputTypes, address, + rewriter.getArrayAttr(dataSizes), /*in_ddr=*/false); for (int i = 0; i < opNums.size(); i++) { - ownerOp->setOperand(opNums[i], loadFlashOp.getResult(i)); + ownerOp->setOperand(opNums[i], loadWeightsOp.getResult(i)); } - loadFlashOp->moveBefore(ownerOp); + loadWeightsOp->moveBefore(ownerOp); loadOp.erase(); } else { std::vector loadOpData = getTensorData(loadOp); dataSizes.push_back(rewriter.getI32IntegerAttr(loadOpData.size())); tensorData.insert(tensorData.end(), loadOpData.begin(), loadOpData.end()); - auto loadFlashOp = rewriter.create( + if (weightsInExternalMemory) { + // Pad tensordata to 32 bytes alignment + auto alignedSize = ((loadOpData.size() + 31) / 32) * 32; + auto toBePaddedSize = alignedSize - loadOpData.size(); + // Pad with zeros + tensorData.insert(tensorData.end(), toBePaddedSize, 0); + } + auto loadWeightsOp = rewriter.create( loadOp.getLoc(), loadOp.getType(), address, - rewriter.getArrayAttr(dataSizes)); - rewriter.replaceOp(loadOp, loadFlashOp.getOutput()); + rewriter.getArrayAttr(dataSizes), /*in_ddr=*/weightsInExternalMemory); + rewriter.replaceOp(loadOp, loadWeightsOp.getOutput()); - // Find all uses of loadFlashOp and find the first Owner op + // Find all uses of loadWeightsOp and find the first Owner op // so that we can move the loading to just before that op. 
mlir::Operation *firstOwnerOp = - loadFlashOp->getResult(0).getUses().begin()->getOwner(); - for (const mlir::OpOperand &use : loadFlashOp->getResult(0).getUses()) { + loadWeightsOp->getResult(0).getUses().begin()->getOwner(); + for (const mlir::OpOperand &use : loadWeightsOp->getResult(0).getUses()) { mlir::Operation *op = use.getOwner(); if (op->isBeforeInBlock(firstOwnerOp)) { firstOwnerOp = op; } } - loadFlashOp->moveBefore(firstOwnerOp); + loadWeightsOp->moveBefore(firstOwnerOp); } tensorsVec_->push_back(tensorData); @@ -131,7 +142,7 @@ struct WriteWeightsPattern : public OpRewritePattern { void WriteWeights::runOnOperation() { func::FuncOp f = getOperation(); if (weightsFilenameOption.empty()) { - f.emitError("Flash image file option should be provided to run this pass!"); + f.emitError("Weights file option should be provided to run this pass!"); signalPassFailure(); return; } @@ -139,25 +150,16 @@ void WriteWeights::runOnOperation() { auto *ctx = &getContext(); func::FuncOp func = getOperation(); // For each LoadOp in the graph, save the tensor data, and replace the LoadOp - // with a LoadFlashOp + // with a LoadWeightsOp std::vector> tensorsVec; RewritePatternSet patterns(ctx); patterns.insert(&tensorsVec, ctx); (void)applyPatternsAndFoldGreedily(func, std::move(patterns)); - if (weightsAsArrayOption) { - if (failed(utils::writeTileServerDataToFile(weightsFilenameOption, - tensorsVec, - weightsInExternalMemory))) { - f.emitError("Failed to write tile data!"); - signalPassFailure(); - return; - } - } - // Write tensor data to flash image file - else if (failed( - utils::writeWeightsToFile(weightsFilenameOption, tensorsVec))) { - f.emitError("Failed to write flash image!"); + if (failed(utils::writeWeightsToFile(weightsFilenameOption, tensorsVec, + weightsAsArrayOption, + weightsInExternalMemory))) { + f.emitError("Failed to write weights to file!"); signalPassFailure(); return; } diff --git a/xformer/Utils/FileIO.cpp b/xformer/Utils/FileIO.cpp index 
b06445ef2..0918d7954 100644 --- a/xformer/Utils/FileIO.cpp +++ b/xformer/Utils/FileIO.cpp @@ -24,63 +24,68 @@ LogicalResult writeDataToFile(const std::string &filename, std::string data) { } LogicalResult writeWeightsToFile(const std::string &filename, - std::vector> tensorsVec) { - // Combine data for the tensors - std::string data; - for (auto const &tensor : tensorsVec) { - data += std::string(tensor.data(), tensor.size()); - } - - return utils::writeDataToFile(filename, data); -} - -LogicalResult -writeTileServerDataToFile(const std::string &filename, - std::vector> tensorsVec, - bool placeInExternalMemory) { - // Add header - auto tileHeader = utils::tileRamHeader(); - tensorsVec.insert(tensorsVec.begin(), tileHeader); - - std::ostringstream cOut; - cOut << R"(#include )"; - - if (placeInExternalMemory) { - cOut << "\n\n" << R"(__attribute__ ((section(".ExtMem.data"))))" << "\n"; - } + std::vector> tensorsVec, + bool writeWeightsAsArray, + bool placeInExternalMemory) { + if (writeWeightsAsArray) { + std::ostringstream cOut; + cOut << R"(#include )"; + + if (placeInExternalMemory) { + cOut << "\n\n" + << R"(__attribute__ ((section(".ExtMem.data"))))" + << "\n"; + } else { + // Weights are to be placed in SRAM tile + // Add tile ram server header + auto tileHeader = utils::tileRamServerHeader(); + tensorsVec.insert(tensorsVec.begin(), tileHeader); + } - cOut << "const int8_t tile_server_weights[] = {\n"; - int lineEnding = 0; - int weightsSize = 0; - for (auto const &tensor : tensorsVec) { - for (auto const &i : tensor) { - cOut << (int)i << ", "; - lineEnding++; - weightsSize++; - if (lineEnding > 80) { - cOut << "\n"; - lineEnding = 0; + cOut << "const int8_t weights[] = {\n"; + int lineEnding = 0; + int weightsSize = 0; + for (auto const &tensor : tensorsVec) { + for (auto const &i : tensor) { + cOut << (int)i << ", "; + lineEnding++; + weightsSize++; + if (lineEnding > 80) { + cOut << "\n"; + lineEnding = 0; + } } } - } - cOut << R"(}; + cOut << R"(}; )"; - 
if (failed(utils::writeDataToFile(filename + ".c", cOut.str()))) { - return failure(); - } + if (failed(utils::writeDataToFile(filename + ".c", cOut.str()))) { + return failure(); + } - std::ostringstream hOut; - hOut << R"(#ifndef TILESERVERGEN_H -#define TILESERVERGEN_H + std::ostringstream hOut; + hOut << R"(#ifndef WEIGHTSGEN_H +#define WEIGHTSGEN_H -#define TILE_SERVER_WEIGHTS_SIZE ()" << weightsSize << R"(U) +#define WEIGHTS_SIZE ()" + << weightsSize << R"(U) -#endif // TILESERVERGEN_H +#endif // WEIGHTSGEN_H )"; - return utils::writeDataToFile(filename + ".h", hOut.str()); + return utils::writeDataToFile(filename + ".h", hOut.str()); + + } else { + // Write data for flash image + // Combine data for the tensors + std::string data; + for (auto const &tensor : tensorsVec) { + data += std::string(tensor.data(), tensor.size()); + } + + return utils::writeDataToFile(filename, data); + } } LogicalResult getFlatBufferStringFromMLIR( diff --git a/xformer/Utils/FileIO.h b/xformer/Utils/FileIO.h index a1345d389..3224c1e59 100644 --- a/xformer/Utils/FileIO.h +++ b/xformer/Utils/FileIO.h @@ -11,12 +11,9 @@ namespace mlir::xcore::utils { LogicalResult writeDataToFile(const std::string &filename, std::string data); LogicalResult writeWeightsToFile(const std::string &filename, - std::vector> tensorsVec); - -LogicalResult -writeTileServerDataToFile(const std::string &filename, - std::vector> tensorsVec, - bool placeInExternalMemory); + std::vector> tensorsVec, + bool writeWeightsAsArray, + bool placeInExternalMemory); LogicalResult getFlatBufferStringFromMLIR( mlir::ModuleOp module, std::map metadata, diff --git a/xformer/Utils/TileRamSupport.cpp b/xformer/Utils/TileRamSupport.cpp index ef630c1fe..db02754b5 100644 --- a/xformer/Utils/TileRamSupport.cpp +++ b/xformer/Utils/TileRamSupport.cpp @@ -2,7 +2,7 @@ namespace mlir::xcore::utils { -std::vector tileRamHeader() { +std::vector tileRamServerHeader() { // TODO: Change flash_t struct to mem_server_header_t // We are reusing 
the flash_t struct in lib_tflite_micro as the header // The header version is stored as one integer diff --git a/xformer/Utils/TileRamSupport.h b/xformer/Utils/TileRamSupport.h index c049359ea..09ae1cefb 100644 --- a/xformer/Utils/TileRamSupport.h +++ b/xformer/Utils/TileRamSupport.h @@ -7,7 +7,7 @@ namespace mlir::xcore::utils { /** Function that creates a tile_ram_header */ -std::vector tileRamHeader(); +std::vector tileRamServerHeader(); } // namespace mlir::xcore::utils diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp index 682d2dad4..da9297a96 100644 --- a/xformer/XCoreOptMain.cpp +++ b/xformer/XCoreOptMain.cpp @@ -72,13 +72,18 @@ cl::alias aliasWeightsFilenameOption("f", cl::desc("Alias for --xcore-weights-file"), cl::aliasopt(weightsFilenameOption)); -cl::opt weightsAsArrayOption("xcore-write-weights-as-array", - cl::desc("Write the weights in the form of an array in a source file (creates .c/.h files with as the file name)."), - cl::init(false), cl::cat(XformerCategory)); +cl::opt weightsAsArrayOption( + "xcore-write-weights-as-array", + cl::desc( + "Write the weights in the form of an array in a source file (creates " + ".c/.h files with as the file name)."), + cl::init(false), cl::cat(XformerCategory)); -cl::opt weightsInExternalMemory("xcore-weights-in-external-memory", - cl::desc("Annotate the generated weights array with an attribute to place it in external memory."), - cl::init(false), cl::cat(XformerCategory)); +cl::opt weightsInExternalMemory( + "xcore-weights-in-external-memory", + cl::desc("Annotate the generated weights array with an attribute to place " + "it in external memory."), + cl::init(false), cl::cat(XformerCategory)); cl::opt loadExternallyIfLargerOption( "xcore-load-externally-if-larger", @@ -457,18 +462,11 @@ int main(int argc, char **argv) { "Please specify an output filename using the -o option!"); } - if (mlir::xcore::weightsAsArrayOption.getNumOccurrences() > 0 && - mlir::xcore::threadCountOption < 4) { - // TODO: 
This feels wrong considering this is just about the export format of the weights - return failedMessage("Please specify at least four threads using " "xcore-thread-count option when using the " "xcore-write-weights-as-array option!"); - } - if (mlir::xcore::weightsInExternalMemory.getNumOccurrences() > 0 && mlir::xcore::weightsAsArrayOption.getNumOccurrences() == 0) { - return failedMessage("Please specify the xcore-write-weights-as-array" - "when using the xcore-weights-in-external-memory option!"); + return failedMessage( + "Please specify the xcore-write-weights-as-array " + "when using the xcore-weights-in-external-memory option!"); } if (mlir::xcore::loadExternallyIfLargerOption.getNumOccurrences() > 0 && diff --git a/xformer/lib_tflite_micro.BUILD b/xformer/lib_tflite_micro.BUILD index bafd75ca4..23612a7b1 100644 --- a/xformer/lib_tflite_micro.BUILD +++ b/xformer/lib_tflite_micro.BUILD @@ -30,7 +30,7 @@ filegroup( "lib_tflite_micro/src/tflite-xcore-kernels/xcore_conv2d_v2.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_maxpool2d.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_detection_post.cc", - "lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_from_flash.cc", + "lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_weights.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_lookup.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_softmax.cc", "lib_tflite_micro/src/tflite-xcore-kernels/xcore_batched_softmax.cc",