diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite
new file mode 100644
index 000000000..3010e369a
Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite differ
diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite
new file mode 100644
index 000000000..89049ba0f
Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite differ
diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite
new file mode 100644
index 000000000..767712268
Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite differ
diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite
new file mode 100644
index 000000000..a163e4143
Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite differ
diff --git a/xformer/Analysis/MemoryPlan.cpp b/xformer/Analysis/MemoryPlan.cpp
index ec7ad9447..003a93b17 100644
--- a/xformer/Analysis/MemoryPlan.cpp
+++ b/xformer/Analysis/MemoryPlan.cpp
@@ -152,59 +152,35 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
   auto vInfo = valueInfo;
 
   // Overlap buffers
-  llvm::DenseMap<Value, std::pair<Value, int>> outInVals;
-  // outInInVals are only used when overlapping conv and pad together
-  llvm::DenseMap<Value, std::pair<std::pair<Value, int>, std::pair<Value, int>>>
-      outInInVals;
-
-  int maxOpId = -1;
-  if (overlapConvOption) {
-    // TODO: Try overlap conv
-    // Need to revert conv to run single-threaded which is not implemented yet
-    auto maxOp = getOpWithMaxMemoryUsed();
-    // max op is usually pad or conv
-    // if max op is pad, we choose the next one which should be conv
-    if (llvm::isa<Conv2DV2Op>(maxOp)) {
-      maxOpId = operationIds[maxOp];
-    } else if (llvm::isa<PadOp>(maxOp) &&
-               llvm::isa<Conv2DV2Op>(operations[operationIds[maxOp] + 1])) {
-      maxOpId = operationIds[maxOp] + 1;
-    }
-  }
-
+  llvm::DenseMap<Value, std::pair<Value, int>> inOutMap;
+  llvm::DenseSet<Operation *> alreadyVisited;
   if (overlapOps) {
     for (auto o : operations) {
-      if (llvm::isa<PadOp>(o)) {
-        auto in = o->getOperand(0);
-        if (in.hasOneUse()) {
-          auto out = o->getResult(0);
-          int offset = vInfo[out].size - vInfo[in].size;
-          outInVals[out] = {in, offset};
-          vInfo[in].size += offset;
-          vInfo[in].lastUsed = vInfo[out].lastUsed;
+      // We are only overlapping Pad op as of now
+      if (llvm::isa<PadOp>(o) && !alreadyVisited.contains(o) &&
+          o->getOperand(0).hasOneUse()) {
+        alreadyVisited.insert(o);
+
+        llvm::SmallVector<Value> inputVals;
+        auto inVal = o->getOperand(0);
+        inputVals.push_back(inVal);
+
+        auto outVal = o->getResult(0);
+        // Identify chain of Pad Ops. The user is only dereferenced once
+        // hasOneUse() holds, so a result with no users is never dereferenced
+        while (outVal.hasOneUse() && llvm::isa<PadOp>(*outVal.user_begin())) {
+          auto nextOp = *outVal.user_begin();
+          inVal = nextOp->getOperand(0);
+          inputVals.push_back(inVal);
+          alreadyVisited.insert(nextOp);
+          outVal = nextOp->getResult(0);
         }
-      }
 
-      if (llvm::isa<Conv2DV2Op>(o)) {
-        if (operationIds[o] == maxOpId) {
-          auto convOp = dyn_cast<Conv2DV2Op>(o);
-          auto in = o->getOperand(0);
-          auto out = o->getResult(0);
-          int offset = out.getType().dyn_cast<RankedTensorType>().getDimSize(
-              3); // pixel size
-
-          // since pad is input to this conv and already overlapped
-          if (outInVals.count(in)) {
-            // find the original input op
-            auto firstVal = outInVals[in].first;
-            auto firstOffset = outInVals[in].second;
-
-            offset += vInfo[out].size - vInfo[firstVal].size;
-
-            outInInVals[out] = {{in, offset}, {firstVal, firstOffset}};
-            vInfo[firstVal].size += offset;
-            vInfo[firstVal].lastUsed = vInfo[out].lastUsed;
-          }
+        // Set firstUsed of the output Val to that of the first input Val
+        vInfo[outVal].firstUsed = vInfo[inputVals[0]].firstUsed;
+        for (auto inV : inputVals) {
+          int offset = vInfo[outVal].size - vInfo[inV].size;
+          inOutMap[inV] = {outVal, offset};
         }
       }
     }
@@ -224,7 +200,7 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
 
   // Insert values and their sizes into priority queue
   for (auto v : values) {
-    if (!outInVals.count(v) && !outInInVals.count(v) && !vInfo[v].isConstant) {
+    if (!inOutMap.count(v) && !vInfo[v].isConstant) {
       queue.push({v, vInfo[v].size});
     }
   }
@@ -245,54 +221,23 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
   }
 
   // Patch up overlapped buffers
-  for (auto val : outInInVals) {
-    auto out = val.first;
-    auto inPair = val.second.first;
-    auto firstValPair = val.second.second;
-
-    auto in = inPair.first;
-    auto offset = inPair.second;
-    // We allocate here itself
-    if (outInVals.count(in)) {
-      outInVals.erase(in);
-    }
-
-    auto firstVal = firstValPair.first;
-    auto firstOffset = firstValPair.second;
-
-    auto it =
-        std::find_if(allocatedValues.begin(), allocatedValues.end(),
-                     [&](const QueueItem &p) { return p.first == firstVal; });
-
-    if (it != allocatedValues.end()) {
-      int currentOffset = it->second;
-      allocatedValues.erase(it);
-      allocatedValues.insert({firstVal, currentOffset + offset + firstOffset});
-      allocatedValues.insert({in, currentOffset + offset});
-      allocatedValues.insert({out, currentOffset});
-    } else {
-      assert(false);
-    }
-  }
-
-  for (auto val : outInVals) {
-    auto out = val.first;
-    auto in = val.second.first;
+  for (auto val : inOutMap) {
+    auto in = val.first;
+    auto out = val.second.first;
     auto offset = val.second.second;
 
     auto it = std::find_if(allocatedValues.begin(), allocatedValues.end(),
-                           [&](const QueueItem &p) { return p.first == in; });
+                           [&](const QueueItem &p) { return p.first == out; });
 
     if (it != allocatedValues.end()) {
       int currentOffset = it->second;
-      allocatedValues.erase(it);
       allocatedValues.insert({in, currentOffset + offset});
-      allocatedValues.insert({out, currentOffset});
     } else {
       assert(false);
     }
   }
 
+  // Insert -1 offset for constant values
   for (auto v : values) {
     if (vInfo[v].isConstant) {
       allocatedValues.insert({v, -1});
diff --git a/xformer/Test/memory-plan_1.mlir b/xformer/Test/memory-plan_1.mlir
new file mode 100644
index 000000000..7ab084e99
--- /dev/null
+++ b/xformer/Test/memory-plan_1.mlir
@@ -0,0 +1,7 @@
+// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s
+
+// CHECK: xc.offsets = dense<[384, 0]> : vector<2xi32>
+func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>> {tf_saved_model.index_path = ["zero_padding2d_input"]}) -> (tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>> {tf_saved_model.index_path = ["zero_padding2d"]}) attributes {tf.entry_function = {inputs = "serving_default_zero_padding2d_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
+  %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  return %0 : tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+}
\ No newline at end of file
diff --git a/xformer/Test/memory-plan_2.mlir b/xformer/Test/memory-plan_2.mlir
new file mode 100644
index 000000000..20d6788fe
--- /dev/null
+++ b/xformer/Test/memory-plan_2.mlir
@@ -0,0 +1,9 @@
+// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s
+
+// CHECK: xc.offsets = dense<[1152, 768, 384, 0]> : vector<4xi32>
+func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>> {tf_saved_model.index_path = ["zero_padding2d_input"]}) -> (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>> {tf_saved_model.index_path = ["zero_padding2d"]}) attributes {tf.entry_function = {inputs = "serving_default_zero_padding2d_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
+  %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  return %2 : tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+}
\ No newline at end of file
diff --git a/xformer/Test/memory-plan_3.mlir b/xformer/Test/memory-plan_3.mlir
new file mode 100644
index 000000000..a9b1787fb
--- /dev/null
+++ b/xformer/Test/memory-plan_3.mlir
@@ -0,0 +1,11 @@
+// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s
+
+// CHECK: xc.offsets = dense<[4608, 4224, 3840, 3456, 1728, 0]> : vector<6xi32>
+func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> (tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>, tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) attributes {tf.entry_function = {inputs = "arg0", outputs = "4,5"}} {
+  %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %4 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  return %3, %4 : tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>, tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+}
\ No newline at end of file
diff --git a/xformer/Test/memory-plan_4.mlir b/xformer/Test/memory-plan_4.mlir
new file mode 100644
index 000000000..00c0022d7
--- /dev/null
+++ b/xformer/Test/memory-plan_4.mlir
@@ -0,0 +1,15 @@
+// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s
+
+// CHECK: xc.offsets = dense<[6144, 5760, 5376, 4992, 3264, 2880, 2496, 768, 384, 0]> : vector<10xi32>
+func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> (tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>, tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) attributes {tf.entry_function = {inputs = "arg0", outputs = "6,9"}} {
+  %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %4 = "xc.pad"(%3) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %5 = "xc.pad"(%4) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %6 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %7 = "xc.pad"(%6) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %8 = "xc.pad"(%7) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  return %5, %8 : tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>, tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+}
\ No newline at end of file
diff --git a/xformer/Test/memory-plan_5.mlir b/xformer/Test/memory-plan_5.mlir
new file mode 100644
index 000000000..5c1865480
--- /dev/null
+++ b/xformer/Test/memory-plan_5.mlir
@@ -0,0 +1,17 @@
+// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s
+
+// CHECK: xc.offsets = dense<[1152, 768, 384, 0, 9792, 9408, 9024, 7296, 6528, 1536, 768, 0]> : vector<12xi32>
+func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x17x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>> attributes {tf.entry_function = {inputs = "arg0", outputs = "C"}} {
+  %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %4 = "xc.pad"(%3) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %5 = "xc.pad"(%4) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %6 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %7 = "xc.pad"(%6) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 432 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %8 = "xc.concat"(%5, %7) <{num_copies = 52 : i32, size1 = 48 : i32, size2 = 48 : i32, use_vpu = true}> : (tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>, tensor<1x4x13x48x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x13x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %9 = "xc.pad"(%8) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 1248 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x13x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x15x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  %10 = "xc.pad"(%9) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 1440 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x15x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>) -> tensor<1x4x17x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+  return %10 : tensor<1x4x17x96x!quant.uniform<i8:f32, 0.0078384801745414734:-1>>
+}
\ No newline at end of file
diff --git a/xformer/Transforms/PlanMemory.cpp b/xformer/Transforms/PlanMemory.cpp
index 2d248c35e..684f46c61 100644
--- a/xformer/Transforms/PlanMemory.cpp
+++ b/xformer/Transforms/PlanMemory.cpp
@@ -48,7 +48,7 @@ void PlanMemory::runOnOperation() {
       auto offlineOffsetsWithoutOverlap = m.getAllocatedOffsets(
           /*overlapOps=*/false, peakMemoryUsedWithoutOverlap);
 
-      if (peakMemoryUsedWithOverlap < peakMemoryUsedWithoutOverlap) {
+      if (peakMemoryUsedWithOverlap <= peakMemoryUsedWithoutOverlap) {
         module->setAttr("xc.offsets",
                         builder.getI32VectorAttr(offlineOffsetsWithOverlap));
       } else {
diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp
index 865d6417f..2338bd893 100644
--- a/xformer/XCoreOptMain.cpp
+++ b/xformer/XCoreOptMain.cpp
@@ -148,10 +148,6 @@ cl::opt<bool> convDebugOption("xcore-conv-debug",
                               cl::init(false), cl::cat(XformerCategory),
                               cl::Hidden);
 
-cl::opt<bool> overlapConvOption("xcore-overlap-conv",
-                                cl::desc("Overlap conv also."), cl::init(false),
-                                cl::cat(XformerCategory), cl::Hidden);
-
 cl::opt<bool> offlineOffsetsOption("xcore-offline-offsets",
                                    cl::desc("Offline offsets"), cl::init(true),
                                    cl::cat(XformerCategory), cl::Hidden);