Change ld flash to ld weights and add alignment for weights in DDR
panickal-xmos committed Jul 29, 2024
1 parent 5b12afb commit a2a3519
Showing 14 changed files with 122 additions and 116 deletions.
11 changes: 7 additions & 4 deletions xformer/IR/XCoreOps.td
@@ -453,12 +453,15 @@ def XC_LoadConstantOp
let results = (outs AnyTensor : $output);
}

def XC_LoadFlashOp : XC_Op<"ld_flash", [Pure]> {
let summary = "Load from flash op";
def XC_LoadWeightsOp : XC_Op<"ld_weights", [Pure]> {
let summary = "Load weights op";

let description = [{Load from flash op.}];
let description = [{Load weights op.}];

let arguments = (ins I32Attr : $address, I32ArrayAttr : $sizes);
let arguments = (ins I32Attr
: $address, I32ArrayAttr
: $sizes, BoolAttr
: $in_ddr);

let results = (outs Variadic<AnyTensor> : $output);
}
2 changes: 1 addition & 1 deletion xformer/Test/invalid-loadconstantop.mlir
@@ -1,6 +1,6 @@
// RUN: xcore-opt --mlir-io %s --xcore-apply-loadconstantop-patterns -verify-diagnostics

// expected-error@+1 {{Flash image file option should be provided to run this pass!}}
// expected-error@+1 {{Weights file option should be provided to run this pass!}}
func.func @invalid(%arg0: tensor<?x4x8x1x!quant.uniform<i8:f32, 0.0078160231932997704>>) -> tensor<?x32x!quant.uniform<i8:f32, 0.037329975515604019:-13>> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} {
%cst = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]> : tensor<10xi8>
%cst_0 = arith.constant dense<[[11, 12, 13, 14, 15, 16, 17, 18, 19, 10]]> : tensor<1x10xi16>
4 changes: 2 additions & 2 deletions xformer/Test/invalid-loadflashop.mlir
@@ -1,6 +1,6 @@
// RUN: xcore-opt --mlir-io %s --xcore-write-flash-image -verify-diagnostics
// RUN: xcore-opt --mlir-io %s --xcore-write-weights -verify-diagnostics

// expected-error@+1 {{Flash image file option should be provided to run this pass!}}
// expected-error@+1 {{Weights file option should be provided to run this pass!}}
func.func @valid(%arg0: tensor<?x4x8x1x!quant.uniform<i8:f32, 0.0078160231932997704>>) -> tensor<?x32x!quant.uniform<i8:f32, 0.037329975515604019:-13>> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} {
%cst = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]> : tensor<10xi8>
%cst_0 = arith.constant dense<[[11, 12, 13, 14, 15, 16, 17, 18, 19, 10]]> : tensor<1x10xi16>
@@ -1,4 +1,4 @@
// RUN: xcore-opt --mlir-io %s --xcore-write-flash-image --xcore-weights-file=/dev/null | FileCheck %s
// RUN: xcore-opt --mlir-io %s --xcore-write-weights --xcore-weights-file=/dev/null | FileCheck %s

// CHECK-LABEL: valid
func.func @valid(%arg0: tensor<?x4x8x1x!quant.uniform<i8:f32, 0.0078160231932997704>>) -> tensor<?x32x!quant.uniform<i8:f32, 0.037329975515604019:-13>> attributes {tf.entry_function = {inputs = "flatten_input", outputs = "Identity"}} {
@@ -10,8 +10,8 @@ func.func @valid(%arg0: tensor<?x4x8x1x!quant.uniform<i8:f32, 0.0078160231932997
%cst_4 = "tfl.no_value"() {value} : () -> none
%0 = "tfl.reshape"(%arg0, %cst_3) : (tensor<?x4x8x1x!quant.uniform<i8:f32, 0.0078160231932997704>>, tensor<2xi32>) -> tensor<?x32x!quant.uniform<i8:f32, 0.0078160231932997704>>
%1 = "tfl.reshape"(%0, %cst_1) : (tensor<?x32x!quant.uniform<i8:f32, 0.0078160231932997704>>, tensor<4xi64>) -> tensor<?x1x1x32x!quant.uniform<i8:f32, 0.0078160231932997704>>
// CHECK: xc.ld_flash
// CHECK-NOT: xc.ld_flash
// CHECK: xc.ld_weights
// CHECK-NOT: xc.ld_weights
// CHECK: xc.conv2d_v2
%2 = "xc.ld_constant"(%cst) : (tensor<10xi8>) -> tensor<10xi8>
%3 = "xc.ld_constant"(%cst_0) : (tensor<1x10xi16>) -> tensor<1x10xi16>
2 changes: 1 addition & 1 deletion xformer/Transforms/ApplyLoadConstantOpPatterns.cpp
@@ -62,7 +62,7 @@ bool isNotUsedByLoadConstantOp(Value result) {
void ApplyLoadConstantOpPatterns::runOnOperation() {
func::FuncOp f = getOperation();
if (weightsFilenameOption.empty()) {
f.emitError("Flash image file option should be provided to run this pass!");
f.emitError("Weights file option should be provided to run this pass!");
signalPassFailure();
return;
}
2 changes: 1 addition & 1 deletion xformer/Transforms/Passes.cpp
@@ -46,7 +46,7 @@ void buildXCoreRemainingPassPipeline(OpPassManager &pm) {
pm.addPass(createReplaceBroadcastPass());
pm.addPass(createReplaceConcatPass());
pm.addPass(createApplyXCPatternsPass());
// Add to pipeline only if flash image file option is provided
// Add to pipeline only if weights file option is provided
if (!weightsFilenameOption.empty()) {
pm.addPass(createApplyLoadConstantOpPatternsPass());
pm.addPass(createWriteWeightsPass());
5 changes: 3 additions & 2 deletions xformer/Transforms/TranslateToCustomOp.cpp
@@ -121,7 +121,7 @@ std::vector<uint8_t> ConcatOp::buildCustomOptions() {
return fbb.GetBuffer();
}

std::vector<uint8_t> LoadFlashOp::buildCustomOptions() {
std::vector<uint8_t> LoadWeightsOp::buildCustomOptions() {
flexbuffers::Builder fbb;
auto rootMap = fbb.StartMap();
fbb.Int("addr", (int32_t)getAddress());
@@ -130,6 +130,7 @@ std::vector<uint8_t> LoadFlashOp::buildCustomOptions() {
fbb.Int(getSizes().cast<ArrayAttr>()[i].cast<IntegerAttr>().getInt());
}
fbb.EndVector(sizesVec, false, false);
fbb.Bool("ddr", (bool)getInDdr());
fbb.EndMap(rootMap);
fbb.Finish();
return fbb.GetBuffer();
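
On the consumer side, the serialized custom options can be read back with the flexbuffers reader. The following is a minimal sketch; the buffer/length parameters and the surrounding runtime integration are assumptions for illustration, not part of this commit:

#include "flatbuffers/flexbuffers.h"
#include <cstddef>
#include <cstdint>

// Sketch: decoding the custom options written by buildCustomOptions() above.
// `buffer` and `length` are assumed to hold the custom-options blob.
void decodeLoadWeightsOptions(const uint8_t *buffer, size_t length) {
  auto map = flexbuffers::GetRoot(buffer, length).AsMap();
  int32_t address = map["addr"].AsInt32(); // offset into the weights blob
  bool inDdr = map["ddr"].AsBool();        // flag added in this commit
  auto sizes = map["sizes"].AsVector();    // per-output byte sizes
  for (size_t i = 0; i < sizes.size(); ++i) {
    int32_t size = sizes[i].AsInt32();
    (void)size; // the runtime would use this to drive each load
  }
  (void)address;
  (void)inDdr;
}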
@@ -244,7 +245,7 @@ void TranslateToCustomOp::runOnOperation() {
patterns.insert<RewriteToCustomOp<Bsign8Op>>(ctx);
patterns.insert<RewriteToCustomOp<Conv2DV2Op>>(ctx);
patterns.insert<RewriteToCustomOp<MaxPool2DOp>>(ctx);
patterns.insert<RewriteToCustomOp<LoadFlashOp>>(ctx);
patterns.insert<RewriteToCustomOp<LoadWeightsOp>>(ctx);
patterns.insert<RewriteToCustomOp<LookupOp>>(ctx);
patterns.insert<RewriteToCustomOp<SoftmaxOp>>(ctx);
patterns.insert<RewriteToCustomOp<BatchedSoftmaxOp>>(ctx);
64 changes: 33 additions & 31 deletions xformer/Transforms/WriteWeights.cpp
@@ -15,16 +15,16 @@
namespace mlir::xcore {

namespace {
// Write flash image
// Write weights to a file
struct WriteWeights
: public PassWrapper<WriteWeights, OperationPass<func::FuncOp>> {
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WriteWeights)

void getDependentDialects(DialectRegistry &registry) const final {
registry.insert<XCoreDialect>();
}
StringRef getArgument() const final { return "xcore-write-flash-image"; }
StringRef getDescription() const final { return "Write flash image"; }
StringRef getArgument() const final { return "xcore-write-weights"; }
StringRef getDescription() const final { return "Write weights"; }
void runOnOperation() override;
};

@@ -66,7 +66,11 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
address += t.size();
}

if (loadOp.getResult().hasOneUse()) {
// Combine loads into one op only when the load has a single use and the
// weights are not in external memory.
// External memory loads have to be aligned to 32 bytes/256 bits for
// maximum speed.
if (loadOp.getResult().hasOneUse() && !weightsInExternalMemory) {
auto use = loadOp->use_begin();
Operation *ownerOp = use->getOwner();

@@ -87,36 +91,43 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
}
}

auto loadFlashOp =
rewriter.create<LoadFlashOp>(loadOp.getLoc(), outputTypes, address,
rewriter.getArrayAttr(dataSizes));
auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), outputTypes, address,
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/false);

for (int i = 0; i < opNums.size(); i++) {
ownerOp->setOperand(opNums[i], loadFlashOp.getResult(i));
ownerOp->setOperand(opNums[i], loadWeightsOp.getResult(i));
}

loadFlashOp->moveBefore(ownerOp);
loadWeightsOp->moveBefore(ownerOp);
loadOp.erase();
} else {
std::vector<char> loadOpData = getTensorData(loadOp);
dataSizes.push_back(rewriter.getI32IntegerAttr(loadOpData.size()));
tensorData.insert(tensorData.end(), loadOpData.begin(), loadOpData.end());
auto loadFlashOp = rewriter.create<LoadFlashOp>(
if (weightsInExternalMemory) {
// Pad tensor data to 32-byte alignment
auto alignedSize = ((loadOpData.size() + 31) / 32) * 32;
auto toBePaddedSize = alignedSize - loadOpData.size();
// Pad with zeros
tensorData.insert(tensorData.end(), toBePaddedSize, 0);
}
auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), loadOp.getType(), address,
rewriter.getArrayAttr(dataSizes));
rewriter.replaceOp(loadOp, loadFlashOp.getOutput());
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/weightsInExternalMemory);
rewriter.replaceOp(loadOp, loadWeightsOp.getOutput());

// Find all uses of loadFlashOp and find the first Owner op
// Find all uses of loadWeightsOp and find the first Owner op
// so that we can move the loading to just before that op.
mlir::Operation *firstOwnerOp =
loadFlashOp->getResult(0).getUses().begin()->getOwner();
for (const mlir::OpOperand &use : loadFlashOp->getResult(0).getUses()) {
loadWeightsOp->getResult(0).getUses().begin()->getOwner();
for (const mlir::OpOperand &use : loadWeightsOp->getResult(0).getUses()) {
mlir::Operation *op = use.getOwner();
if (op->isBeforeInBlock(firstOwnerOp)) {
firstOwnerOp = op;
}
}
loadFlashOp->moveBefore(firstOwnerOp);
loadWeightsOp->moveBefore(firstOwnerOp);
}

tensorsVec_->push_back(tensorData);
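
The padding above rounds each DDR-resident tensor up to the next multiple of 32 bytes. A quick self-contained check of that arithmetic, with illustrative sizes only:

#include <cassert>
#include <cstddef>

// Sketch: round a tensor's byte size up to a 32-byte boundary, as done
// for weights placed in external memory (DDR) above.
static size_t alignTo32(size_t size) { return ((size + 31) / 32) * 32; }

int main() {
  assert(alignTo32(100) == 128); // 28 zero bytes of padding appended
  assert(alignTo32(64) == 64);   // already aligned, nothing appended
  assert(alignTo32(1) == 32);
  return 0;
}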
@@ -131,33 +142,24 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
void WriteWeights::runOnOperation() {
func::FuncOp f = getOperation();
if (weightsFilenameOption.empty()) {
f.emitError("Flash image file option should be provided to run this pass!");
f.emitError("Weights file option should be provided to run this pass!");
signalPassFailure();
return;
}

auto *ctx = &getContext();
func::FuncOp func = getOperation();
// For each LoadOp in the graph, save the tensor data, and replace the LoadOp
// with a LoadFlashOp
// with a LoadWeightsOp
std::vector<std::vector<char>> tensorsVec;
RewritePatternSet patterns(ctx);
patterns.insert<WriteWeightsPattern>(&tensorsVec, ctx);
(void)applyPatternsAndFoldGreedily(func, std::move(patterns));

if (weightsAsArrayOption) {
if (failed(utils::writeTileServerDataToFile(weightsFilenameOption,
tensorsVec,
weightsInExternalMemory))) {
f.emitError("Failed to write tile data!");
signalPassFailure();
return;
}
}
// Write tensor data to flash image file
else if (failed(
utils::writeWeightsToFile(weightsFilenameOption, tensorsVec))) {
f.emitError("Failed to write flash image!");
if (failed(utils::writeWeightsToFile(weightsFilenameOption, tensorsVec,
weightsAsArrayOption,
weightsInExternalMemory))) {
f.emitError("Failed to write weights to file!");
signalPassFailure();
return;
}
97 changes: 51 additions & 46 deletions xformer/Utils/FileIO.cpp
@@ -24,63 +24,68 @@ LogicalResult writeDataToFile(const std::string &filename, std::string data) {
}

LogicalResult writeWeightsToFile(const std::string &filename,
std::vector<std::vector<char>> tensorsVec) {
// Combine data for the tensors
std::string data;
for (auto const &tensor : tensorsVec) {
data += std::string(tensor.data(), tensor.size());
}

return utils::writeDataToFile(filename, data);
}

LogicalResult
writeTileServerDataToFile(const std::string &filename,
std::vector<std::vector<char>> tensorsVec,
bool placeInExternalMemory) {
// Add header
auto tileHeader = utils::tileRamHeader();
tensorsVec.insert(tensorsVec.begin(), tileHeader);

std::ostringstream cOut;
cOut << R"(#include <stdint.h>)";

if (placeInExternalMemory) {
cOut << "\n\n" << R"(__attribute__ ((section(".ExtMem.data"))))" << "\n";
}
std::vector<std::vector<char>> tensorsVec,
bool writeWeightsAsArray,
bool placeInExternalMemory) {
if (writeWeightsAsArray) {
std::ostringstream cOut;
cOut << R"(#include <stdint.h>)";

if (placeInExternalMemory) {
cOut << "\n\n"
<< R"(__attribute__ ((section(".ExtMem.data"))))"
<< "\n";
} else {
// Weights are to be placed in SRAM tile
// Add tile ram server header
auto tileHeader = utils::tileRamServerHeader();
tensorsVec.insert(tensorsVec.begin(), tileHeader);
}

cOut << "const int8_t tile_server_weights[] = {\n";
int lineEnding = 0;
int weightsSize = 0;
for (auto const &tensor : tensorsVec) {
for (auto const &i : tensor) {
cOut << (int)i << ", ";
lineEnding++;
weightsSize++;
if (lineEnding > 80) {
cOut << "\n";
lineEnding = 0;
cOut << "const int8_t weights[] = {\n";
int lineEnding = 0;
int weightsSize = 0;
for (auto const &tensor : tensorsVec) {
for (auto const &i : tensor) {
cOut << (int)i << ", ";
lineEnding++;
weightsSize++;
if (lineEnding > 80) {
cOut << "\n";
lineEnding = 0;
}
}
}
}

cOut << R"(};
cOut << R"(};
)";

if (failed(utils::writeDataToFile(filename + ".c", cOut.str()))) {
return failure();
}
if (failed(utils::writeDataToFile(filename + ".c", cOut.str()))) {
return failure();
}

std::ostringstream hOut;
hOut << R"(#ifndef TILESERVERGEN_H
#define TILESERVERGEN_H
std::ostringstream hOut;
hOut << R"(#ifndef WEIGHTSGEN_H
#define WEIGHTSGEN_H
#define TILE_SERVER_WEIGHTS_SIZE ()" << weightsSize << R"(U)
#define WEIGHTS_SIZE ()"
<< weightsSize << R"(U)
#endif // TILESERVERGEN_H
#endif // WEIGHTSGEN_H
)";

return utils::writeDataToFile(filename + ".h", hOut.str());
return utils::writeDataToFile(filename + ".h", hOut.str());

} else {
// Write data for flash image
// Combine data for the tensors
std::string data;
for (auto const &tensor : tensorsVec) {
data += std::string(tensor.data(), tensor.size());
}

return utils::writeDataToFile(filename, data);
}
}

LogicalResult getFlatBufferStringFromMLIR(
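
For reference, when weightsAsArrayOption is set the function above emits a C source/header pair, and with placeInExternalMemory the array is tagged with the .ExtMem.data section. Below is an abridged sketch of what the generated <filename>.c and <filename>.h could look like; the byte values and total size are made up for illustration:

/* <filename>.c (sketch, DDR case) */
#include <stdint.h>

__attribute__ ((section(".ExtMem.data")))
const int8_t weights[] = {
12, -3, 0, 127, /* ... remaining tensor bytes, each tensor padded to 32 bytes ... */
};

/* <filename>.h (sketch) */
#ifndef WEIGHTSGEN_H
#define WEIGHTSGEN_H
#define WEIGHTS_SIZE (4096U) /* number of bytes emitted into weights[] */
#endif // WEIGHTSGEN_H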
9 changes: 3 additions & 6 deletions xformer/Utils/FileIO.h
@@ -11,12 +11,9 @@ namespace mlir::xcore::utils {
LogicalResult writeDataToFile(const std::string &filename, std::string data);

LogicalResult writeWeightsToFile(const std::string &filename,
std::vector<std::vector<char>> tensorsVec);

LogicalResult
writeTileServerDataToFile(const std::string &filename,
std::vector<std::vector<char>> tensorsVec,
bool placeInExternalMemory);
std::vector<std::vector<char>> tensorsVec,
bool writeWeightsAsArray,
bool placeInExternalMemory);

LogicalResult getFlatBufferStringFromMLIR(
mlir::ModuleOp module, std::map<std::string, std::string> metadata,
2 changes: 1 addition & 1 deletion xformer/Utils/TileRamSupport.cpp
@@ -2,7 +2,7 @@

namespace mlir::xcore::utils {

std::vector<char> tileRamHeader() {
std::vector<char> tileRamServerHeader() {
// TODO: Change flash_t struct to mem_server_header_t
// We are reusing the flash_t struct in lib_tflite_micro as the header
// The header version is stored as one integer
2 changes: 1 addition & 1 deletion xformer/Utils/TileRamSupport.h
@@ -7,7 +7,7 @@ namespace mlir::xcore::utils {

/** Function that creates a tile_ram_header
*/
std::vector<char> tileRamHeader();
std::vector<char> tileRamServerHeader();

} // namespace mlir::xcore::utils
