Sub op #905

Merged 6 commits on Jul 18, 2024

2 changes: 1 addition & 1 deletion .github/workflows/release-beta.yml
@@ -112,7 +112,7 @@ jobs:
     if: github.event.pull_request.merged == true
     name: Build release wheels for macOS arm64
     needs: [build-release-archive]
-    runs-on: macos-11
+    runs-on: macos-14
     strategy:
       matrix:
         python-version: [3.9]
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -122,7 +122,7 @@ jobs:
   macos-arm-release-wheel:
     name: Build release wheels for macOS arm64
     needs: [build-release-archive]
-    runs-on: macos-11
+    runs-on: macos-14
     strategy:
       matrix:
         python-version: [3.9]
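(Both release workflows move the macOS arm64 wheel build off the macos-11 image, which GitHub Actions retired in 2024, onto macos-14, whose runners are Apple silicon and so match the arm64 wheels being built.)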
6 changes: 6 additions & 0 deletions integration_tests/models/8x8/test_sub/1.sh
@@ -0,0 +1,6 @@
+cp $1 /tmp/
+xcore-opt /tmp/$1 --lce-translate-tfl --mlir-print-ir-after-all -o /tmp/1.tflite >/tmp/1.mlir 2>&1
+cat /tmp/1.mlir | grep -v Tensor > /tmp/2.mlir
+sed -i -e 's/tfl.add/tfl.sub/g' /tmp/2.mlir
+xcore-opt --mlir-io --lce-translate-tfl /tmp/2.mlir -o /tmp/t.tflite
+cp /tmp/t.tflite $1
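The new test script regenerates the SUB test model from an ADD one: it translates the .tflite file passed as $1 to MLIR, drops lines containing Tensor from the dump, rewrites every tfl.add to tfl.sub with sed, then translates the patched MLIR back to .tflite and overwrites the original model file.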
28 binary files not shown.
94 changes: 60 additions & 34 deletions xformer/Analysis/MemoryPlan.cpp
@@ -161,43 +161,62 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
   llvm::DenseSet<Operation *> alreadyVisited;
   if (overlapOps) {
     for (auto o : operations) {
+      // We iterate through overlappable ops which have not been visited yet
       if (o->hasTrait<OpTrait::xcore::MemoryOverlappable>() &&
-          !alreadyVisited.contains(o) && o->getOperand(0).hasOneUse()) {
-        alreadyVisited.insert(o);
-
-        llvm::SmallVector<Value> inputVals;
+          !alreadyVisited.contains(o)) {
         auto inVal = o->getOperand(0);
-        inputVals.push_back(inVal);
-
-        auto outVal = o->getResult(0);
-        auto nextOp = *outVal.getUsers().begin();
-        // Identify chain of overlappable Ops
-        while (outVal.hasOneUse() && !alreadyVisited.contains(nextOp) &&
-               nextOp->hasTrait<OpTrait::xcore::MemoryOverlappable>()) {
-          inVal = outVal;
+
+        // We have binary and unary ops as overlappable
+        // For binary ops, we might have to overlap with the second operand
+        // The complicated if condition below is to check for valid one operand
+        // or two operand cases
+        if ((o->getNumOperands() == 1 && inVal.hasOneUse() &&
+             !vInfo[inVal].isConstant) ||
+            (o->getNumOperands() == 2 &&
+             (inVal.hasOneUse() && !vInfo[inVal].isConstant ||
+              o->getOperand(1).hasOneUse() &&
+                  !vInfo[o->getOperand(1)].isConstant))) {
+          // In case of two operands and first operand is invalid, use the
+          // second one
+          if (o->getNumOperands() == 2 &&
+              (!inVal.hasOneUse() || vInfo[inVal].isConstant)) {
+            inVal = o->getOperand(1);
+          }
+
+          alreadyVisited.insert(o);
+          llvm::SmallVector<Value> inputVals;
           inputVals.push_back(inVal);
-          alreadyVisited.insert(nextOp);
-          outVal = nextOp->getResult(0);
-          nextOp = *outVal.getUsers().begin();
-        }
 
-        // Set first Used of output Val to the first input Val
-        vInfo[outVal].firstUsed = vInfo[inputVals[0]].firstUsed;
-        auto unalignedSizeOutVal =
-            utils::getShapedTypeSize(outVal.getType().dyn_cast<ShapedType>());
-        size_t maxSizeNeeded = 0;
-        for (auto inV : inputVals) {
-          auto unalignedSizeInV =
-              utils::getShapedTypeSize(inV.getType().dyn_cast<ShapedType>());
-          auto unalignedOffset = unalignedSizeOutVal - unalignedSizeInV;
-          // Align offset up to double word = 8 bytes
-          auto offset = ((unalignedOffset + 7) / 8) * 8;
-          maxSizeNeeded = std::max(vInfo[inV].size + offset, maxSizeNeeded);
-          inOutMap[inV] = {outVal, offset};
+          auto outVal = o->getResult(0);
+          auto nextOp = *outVal.getUsers().begin();
+          // Identify chain of overlappable Ops
+          while (outVal.hasOneUse() && !alreadyVisited.contains(nextOp) &&
+                 nextOp->hasTrait<OpTrait::xcore::MemoryOverlappable>()) {
+            inVal = outVal;
+            inputVals.push_back(inVal);
+            alreadyVisited.insert(nextOp);
+            outVal = nextOp->getResult(0);
+            nextOp = *outVal.getUsers().begin();
+          }
+
+          // Set first Used of output Val to the first input Val
+          vInfo[outVal].firstUsed = vInfo[inputVals[0]].firstUsed;
+          auto unalignedSizeOutVal =
+              utils::getShapedTypeSize(outVal.getType().dyn_cast<ShapedType>());
+          size_t maxSizeNeeded = 0;
+          for (auto inV : inputVals) {
+            auto unalignedSizeInV =
+                utils::getShapedTypeSize(inV.getType().dyn_cast<ShapedType>());
+            auto unalignedOffset = unalignedSizeOutVal - unalignedSizeInV;
+            // Align offset up to double word = 8 bytes
+            auto offset = ((unalignedOffset + 7) / 8) * 8;
+            maxSizeNeeded = std::max(vInfo[inV].size + offset, maxSizeNeeded);
+            inOutMap[inV] = {outVal, offset};
+          }
+          // The aligned input val size plus aligned offset might be larger than
+          // aligned output val size
+          vInfo[outVal].size = std::max(vInfo[outVal].size, maxSizeNeeded);
         }
-        // The aligned input val size plus aligned offset might be larger than
-        // aligned output val size
-        vInfo[outVal].size = std::max(vInfo[outVal].size, maxSizeNeeded);
       }
     }
   }
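The substance of the change above: overlappable ops may now be binary (such as ADD/SUB), so before chaining the pass picks whichever operand is a valid overlap candidate (single-use and not a constant), falling back to the second operand when the first fails. The offset arithmetic is unchanged: the input buffer is tail-aligned with the output buffer and the offset is rounded up to a double word. A minimal standalone sketch of that computation (illustrative names, not part of the PR):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Offset at which an input buffer can overlap its output buffer: the input
// is placed so it ends where the output ends, then the offset is rounded up
// to a double word (8 bytes). Sizes are in bytes.
size_t overlapOffset(size_t outSize, size_t inSize) {
  size_t unaligned = outSize - inSize;
  return ((unaligned + 7) / 8) * 8;
}

int main() {
  // A 1000-byte output overlapping a 397-byte input: the unaligned offset
  // 603 rounds up to 608, so the combined buffer needs
  // max(1000, 397 + 608) = 1005 bytes, slightly more than the output alone,
  // which is what the vInfo[outVal].size update accounts for.
  size_t off = overlapOffset(1000, 397);
  std::printf("offset=%zu combined=%zu\n", off,
              std::max<size_t>(1000, 397 + off));
}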
@@ -353,6 +372,7 @@ void MemoryPlan::printMemoryPlan() {
       line[c] = '.';
     }
     int memory_use = 0;
+    int peakSize = 0;
     for (int i = 0; i < nonConstantAllocatedValues.size(); ++i) {
       if ((t < valueInfo[nonConstantAllocatedValues[i]].firstUsed) ||
           (t > valueInfo[nonConstantAllocatedValues[i]].lastUsed)) {
@@ -362,7 +382,12 @@
       if (offset == -1) {
         continue;
       }
+
       const int size = valueInfo[nonConstantAllocatedValues[i]].size;
+      if (peakSize < offset + size) {
+        peakSize = offset + size;
+      }
+
       memory_use += size;
       const int line_start = (offset * kLineWidth) / max_size;
       const int line_end = ((offset + size) * kLineWidth) / max_size;
@@ -377,9 +402,10 @@
     line[kLineWidth] = 0;
 
     llvm::outs() << llvm::format(
-        "\n%-20s %s%d: %s (%dk)",
+        "\n%-20s %s%d: %s (%dk), (%dk)",
         operations[t]->getName().stripDialect().str().c_str(),
-        t < 10 ? " " : "", t, (const char *)line, (memory_use + 1023) / 1024);
+        t < 10 ? " " : "", t, (const char *)line, (memory_use + 1023) / 1024,
+        (peakSize + 1023) / 1024);
   }
   llvm::outs() << "\n";
 }
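The printout now carries two kilobyte figures per row: memory_use sums the sizes of the tensors live at that step, while peakSize is the largest offset + size among them, i.e. the arena extent at that step including any gaps the allocator left. A hypothetical row would read: add 7: ...====... (12k), (15k), meaning 12 KB of live tensors within a 15 KB footprint.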
2 changes: 1 addition & 1 deletion xformer/Test/add_broadcast.mlir
@@ -1,4 +1,4 @@
-// RUN: xcore-opt --mlir-io %s --xcore-replace-add | FileCheck %s
+// RUN: xcore-opt --mlir-io %s --xcore-replace-addsub | FileCheck %s
 
 // CHECK-LABEL: add_broadcast
 func.func @add_broadcast(%arg0: tensor<1x15x1x1x!quant.uniform<i8:f32, 0.0078378040343523026:-1>> {tf_saved_model.index_path = ["input_1"]}) -> (tensor<?x15x5x4x!quant.uniform<i8:f32, 0.033033743500709534:-6>> {tf_saved_model.index_path = ["add"]}) attributes {tf.entry_function = {inputs = "serving_default_input_1:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
8 changes: 4 additions & 4 deletions xformer/Transforms/Passes.cpp
@@ -17,13 +17,13 @@ void buildXCorePreOpSplitPassPipeline(OpPassManager &pm) {
   pm.addPass(mlir::TFL::CreateTranslateToLCEPass());
   // Convert dynamic shapes in batch dimension to static
   pm.addPass(createRemoveDynamicShapePass());
-}
-
-void buildXCoreRemainingPassPipeline(OpPassManager &pm) {
   // TFL passes
   pm.addPass(createOptimizeTransposePass());
   pm.addPass(createReplaceAvgPoolWithConv2DPass());
   pm.addPass(createReplaceFCWithConv2DPass());
+}
+
+void buildXCoreRemainingPassPipeline(OpPassManager &pm) {
   if (opSplitTensorArenaOption) {
     pm.addPass(createOpSplitPass());
   }
@@ -36,7 +36,7 @@ void buildXCoreRemainingPassPipeline(OpPassManager &pm) {
   pm.addPass(mlir::createCanonicalizerPass());
 
   // XC passes
-  pm.addPass(createReplaceAddPass());
+  pm.addPass(createReplaceAddSubPass());
   pm.addPass(createReplaceMaxPoolPass());
   pm.addPass(createReplaceMulPass());
   pm.addPass(createReplaceTransposeConvPass());
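Two things happen here: the three TFL-level passes (OptimizeTranspose, ReplaceAvgPoolWithConv2D, ReplaceFCWithConv2D) move from buildXCoreRemainingPassPipeline into buildXCorePreOpSplitPassPipeline, so they now run before the optional op-split stage, and the XC lowering sequence swaps in the renamed ReplaceAddSub pass.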
2 changes: 1 addition & 1 deletion xformer/Transforms/Passes.h
@@ -31,7 +31,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createOptimizeConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createOpSplitPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createApplyTFLPatternsPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveDynamicShapePass();
-std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddPass();
+std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddSubPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMulPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMaxPoolPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceStridedSlicePass();
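Together with the deletion of ReplaceAdd.cpp below, the rename suggests the former ADD-only lowering was generalized into a single ReplaceAddSub pass that handles both TFL ADD and SUB, in line with the PR title.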
95 changes: 0 additions & 95 deletions xformer/Transforms/ReplaceAdd.cpp

This file was deleted.
