diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite new file mode 100644 index 000000000..3010e369a Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_0.tflite differ diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite new file mode 100644 index 000000000..89049ba0f Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_1.tflite differ diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite new file mode 100644 index 000000000..767712268 Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_2.tflite differ diff --git a/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite new file mode 100644 index 000000000..a163e4143 Binary files /dev/null and b/integration_tests/models/8x8/test_pad_overlap/test_pad_overlap_3.tflite differ diff --git a/xformer/Analysis/MemoryPlan.cpp b/xformer/Analysis/MemoryPlan.cpp index ec7ad9447..003a93b17 100644 --- a/xformer/Analysis/MemoryPlan.cpp +++ b/xformer/Analysis/MemoryPlan.cpp @@ -152,59 +152,35 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps, auto vInfo = valueInfo; // Overlap buffers - llvm::DenseMap> outInVals; - // outInInVals are only used when overlapping conv and pad together - llvm::DenseMap, std::pair>> - outInInVals; - - int maxOpId = -1; - if (overlapConvOption) { - // TODO: Try overlap conv - // Need to revert conv to run single-threaded which is not implemented yet - auto maxOp = getOpWithMaxMemoryUsed(); - // max op is usually pad or conv - // if max op is pad, we choose the next one which should be conv - if (llvm::isa(maxOp)) { - maxOpId = operationIds[maxOp]; - } else if (llvm::isa(maxOp) && - llvm::isa(operations[operationIds[maxOp] + 1])) { - maxOpId = operationIds[maxOp] + 1; - } - } - + llvm::DenseMap> inOutMap; + llvm::DenseSet alreadyVisited; if (overlapOps) { for (auto o : operations) { - if (llvm::isa(o)) { - auto in = o->getOperand(0); - if (in.hasOneUse()) { - auto out = o->getResult(0); - int offset = vInfo[out].size - vInfo[in].size; - outInVals[out] = {in, offset}; - vInfo[in].size += offset; - vInfo[in].lastUsed = vInfo[out].lastUsed; + // We are only overlapping Pad op as of now + if (llvm::isa(o) && !alreadyVisited.contains(o) && + o->getOperand(0).hasOneUse()) { + alreadyVisited.insert(o); + + llvm::SmallVector inputVals; + auto inVal = o->getOperand(0); + inputVals.push_back(inVal); + + auto outVal = o->getResult(0); + auto nextOp = *outVal.getUsers().begin(); + // Identify chain of Pad Ops + while (outVal.hasOneUse() && llvm::isa(nextOp)) { + inVal = nextOp->getOperand(0); + inputVals.push_back(inVal); + alreadyVisited.insert(nextOp); + outVal = nextOp->getResult(0); + nextOp = *outVal.getUsers().begin(); } - } - if (llvm::isa(o)) { - if (operationIds[o] == maxOpId) { - auto convOp = dyn_cast(o); - auto in = o->getOperand(0); - auto out = o->getResult(0); - int offset = out.getType().dyn_cast().getDimSize( - 3); // pixel size - - // since pad is input to this conv and already overlapped - if (outInVals.count(in)) { - // find the original input op - auto firstVal = outInVals[in].first; - auto firstOffset = outInVals[in].second; - - offset += vInfo[out].size - vInfo[firstVal].size; - - outInInVals[out] = {{in, offset}, {firstVal, firstOffset}}; - vInfo[firstVal].size += offset; - vInfo[firstVal].lastUsed = vInfo[out].lastUsed; - } + // Set first Used of output Val to the first input Val + vInfo[outVal].firstUsed = vInfo[inputVals[0]].firstUsed; + for (auto inV : inputVals) { + int offset = vInfo[outVal].size - vInfo[inV].size; + inOutMap[inV] = {outVal, offset}; } } } @@ -224,7 +200,7 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps, // Insert values and their sizes into priority queue for (auto v : values) { - if (!outInVals.count(v) && !outInInVals.count(v) && !vInfo[v].isConstant) { + if (!inOutMap.count(v) && !vInfo[v].isConstant) { queue.push({v, vInfo[v].size}); } } @@ -245,54 +221,23 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps, } // Patch up overlapped buffers - for (auto val : outInInVals) { - auto out = val.first; - auto inPair = val.second.first; - auto firstValPair = val.second.second; - - auto in = inPair.first; - auto offset = inPair.second; - // We allocate here itself - if (outInVals.count(in)) { - outInVals.erase(in); - } - - auto firstVal = firstValPair.first; - auto firstOffset = firstValPair.second; - - auto it = - std::find_if(allocatedValues.begin(), allocatedValues.end(), - [&](const QueueItem &p) { return p.first == firstVal; }); - - if (it != allocatedValues.end()) { - int currentOffset = it->second; - allocatedValues.erase(it); - allocatedValues.insert({firstVal, currentOffset + offset + firstOffset}); - allocatedValues.insert({in, currentOffset + offset}); - allocatedValues.insert({out, currentOffset}); - } else { - assert(false); - } - } - - for (auto val : outInVals) { - auto out = val.first; - auto in = val.second.first; + for (auto val : inOutMap) { + auto in = val.first; + auto out = val.second.first; auto offset = val.second.second; auto it = std::find_if(allocatedValues.begin(), allocatedValues.end(), - [&](const QueueItem &p) { return p.first == in; }); + [&](const QueueItem &p) { return p.first == out; }); if (it != allocatedValues.end()) { int currentOffset = it->second; - allocatedValues.erase(it); allocatedValues.insert({in, currentOffset + offset}); - allocatedValues.insert({out, currentOffset}); } else { assert(false); } } + // Insert -1 offset for constant values for (auto v : values) { if (vInfo[v].isConstant) { allocatedValues.insert({v, -1}); diff --git a/xformer/Test/memory-plan_1.mlir b/xformer/Test/memory-plan_1.mlir new file mode 100644 index 000000000..7ab084e99 --- /dev/null +++ b/xformer/Test/memory-plan_1.mlir @@ -0,0 +1,7 @@ +// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s + +// CHECK: xc.offsets = dense<[384, 0]> : vector<2xi32> +func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform> {tf_saved_model.index_path = ["zero_padding2d_input"]}) -> (tensor<1x4x3x48x!quant.uniform> {tf_saved_model.index_path = ["zero_padding2d"]}) attributes {tf.entry_function = {inputs = "serving_default_zero_padding2d_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x3x48x!quant.uniform> + return %0 : tensor<1x4x3x48x!quant.uniform> +} \ No newline at end of file diff --git a/xformer/Test/memory-plan_2.mlir b/xformer/Test/memory-plan_2.mlir new file mode 100644 index 000000000..20d6788fe --- /dev/null +++ b/xformer/Test/memory-plan_2.mlir @@ -0,0 +1,9 @@ +// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s + +// CHECK: xc.offsets = dense<[1152, 768, 384, 0]> : vector<4xi32> +func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform> {tf_saved_model.index_path = ["zero_padding2d_input"]}) -> (tensor<1x4x7x48x!quant.uniform> {tf_saved_model.index_path = ["zero_padding2d"]}) attributes {tf.entry_function = {inputs = "serving_default_zero_padding2d_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x3x48x!quant.uniform> + %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform>) -> tensor<1x4x5x48x!quant.uniform> + %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform>) -> tensor<1x4x7x48x!quant.uniform> + return %2 : tensor<1x4x7x48x!quant.uniform> +} \ No newline at end of file diff --git a/xformer/Test/memory-plan_3.mlir b/xformer/Test/memory-plan_3.mlir new file mode 100644 index 000000000..a9b1787fb --- /dev/null +++ b/xformer/Test/memory-plan_3.mlir @@ -0,0 +1,11 @@ +// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s + +// CHECK: xc.offsets = dense<[4608, 4224, 3840, 3456, 1728, 0]> : vector<6xi32> +func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform>) -> (tensor<1x4x9x48x!quant.uniform>, tensor<1x4x9x48x!quant.uniform>) attributes {tf.entry_function = {inputs = "arg0", outputs = "4,5"}} { + %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x3x48x!quant.uniform> + %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform>) -> tensor<1x4x5x48x!quant.uniform> + %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform>) -> tensor<1x4x7x48x!quant.uniform> + %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + %4 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + return %3, %4 : tensor<1x4x9x48x!quant.uniform>, tensor<1x4x9x48x!quant.uniform> +} \ No newline at end of file diff --git a/xformer/Test/memory-plan_4.mlir b/xformer/Test/memory-plan_4.mlir new file mode 100644 index 000000000..00c0022d7 --- /dev/null +++ b/xformer/Test/memory-plan_4.mlir @@ -0,0 +1,15 @@ +// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s + +// CHECK: xc.offsets = dense<[6144, 5760, 5376, 4992, 3264, 2880, 2496, 768, 384, 0]> : vector<10xi32> +func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform>) -> (tensor<1x4x13x48x!quant.uniform>, tensor<1x4x13x48x!quant.uniform>) attributes {tf.entry_function = {inputs = "arg0", outputs = "6,9"}} { + %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x3x48x!quant.uniform> + %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform>) -> tensor<1x4x5x48x!quant.uniform> + %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform>) -> tensor<1x4x7x48x!quant.uniform> + %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + %4 = "xc.pad"(%3) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform>) -> tensor<1x4x11x48x!quant.uniform> + %5 = "xc.pad"(%4) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform>) -> tensor<1x4x13x48x!quant.uniform> + %6 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + %7 = "xc.pad"(%6) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform>) -> tensor<1x4x11x48x!quant.uniform> + %8 = "xc.pad"(%7) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform>) -> tensor<1x4x13x48x!quant.uniform> + return %5, %8 : tensor<1x4x13x48x!quant.uniform>, tensor<1x4x13x48x!quant.uniform> +} \ No newline at end of file diff --git a/xformer/Test/memory-plan_5.mlir b/xformer/Test/memory-plan_5.mlir new file mode 100644 index 000000000..5c1865480 --- /dev/null +++ b/xformer/Test/memory-plan_5.mlir @@ -0,0 +1,17 @@ +// RUN: xcore-opt --mlir-io %s --xcore-plan-memory -mlir-print-ir-module-scope -mlir-disable-threading | FileCheck %s + +// CHECK: xc.offsets = dense<[1152, 768, 384, 0, 9792, 9408, 9024, 7296, 6528, 1536, 768, 0]> : vector<12xi32> +func.func @main(%arg0: tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x17x96x!quant.uniform> attributes {tf.entry_function = {inputs = "arg0", outputs = "C"}} { + %0 = "xc.pad"(%arg0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 48 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x1x48x!quant.uniform>) -> tensor<1x4x3x48x!quant.uniform> + %1 = "xc.pad"(%0) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 144 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x3x48x!quant.uniform>) -> tensor<1x4x5x48x!quant.uniform> + %2 = "xc.pad"(%1) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 240 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x5x48x!quant.uniform>) -> tensor<1x4x7x48x!quant.uniform> + %3 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + %4 = "xc.pad"(%3) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 432 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform>) -> tensor<1x4x11x48x!quant.uniform> + %5 = "xc.pad"(%4) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 528 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x11x48x!quant.uniform>) -> tensor<1x4x13x48x!quant.uniform> + %6 = "xc.pad"(%2) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 96 : i32, size = 336 : i32, start = 96 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x7x48x!quant.uniform>) -> tensor<1x4x9x48x!quant.uniform> + %7 = "xc.pad"(%6) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 432 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x9x48x!quant.uniform>) -> tensor<1x4x13x48x!quant.uniform> + %8 = "xc.concat"(%5, %7) <{num_copies = 52 : i32, size1 = 48 : i32, size2 = 48 : i32, use_vpu = true}> : (tensor<1x4x13x48x!quant.uniform>, tensor<1x4x13x48x!quant.uniform>) -> tensor<1x4x13x96x!quant.uniform> + %9 = "xc.pad"(%8) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 1248 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x13x96x!quant.uniform>) -> tensor<1x4x15x96x!quant.uniform> + %10 = "xc.pad"(%9) <{end = 0 : i32, num_copies = 3 : i32, pad_size = 192 : i32, size = 1440 : i32, start = 192 : i32, use_vpu = true, zero_point = -1 : i32}> : (tensor<1x4x15x96x!quant.uniform>) -> tensor<1x4x17x96x!quant.uniform> + return %10 : tensor<1x4x17x96x!quant.uniform> +} \ No newline at end of file diff --git a/xformer/Transforms/PlanMemory.cpp b/xformer/Transforms/PlanMemory.cpp index 2d248c35e..684f46c61 100644 --- a/xformer/Transforms/PlanMemory.cpp +++ b/xformer/Transforms/PlanMemory.cpp @@ -48,7 +48,7 @@ void PlanMemory::runOnOperation() { auto offlineOffsetsWithoutOverlap = m.getAllocatedOffsets( /*overlapOps=*/false, peakMemoryUsedWithoutOverlap); - if (peakMemoryUsedWithOverlap < peakMemoryUsedWithoutOverlap) { + if (peakMemoryUsedWithOverlap <= peakMemoryUsedWithoutOverlap) { module->setAttr("xc.offsets", builder.getI32VectorAttr(offlineOffsetsWithOverlap)); } else { diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp index 865d6417f..2338bd893 100644 --- a/xformer/XCoreOptMain.cpp +++ b/xformer/XCoreOptMain.cpp @@ -148,10 +148,6 @@ cl::opt convDebugOption("xcore-conv-debug", cl::init(false), cl::cat(XformerCategory), cl::Hidden); -cl::opt overlapConvOption("xcore-overlap-conv", - cl::desc("Overlap conv also."), cl::init(false), - cl::cat(XformerCategory), cl::Hidden); - cl::opt offlineOffsetsOption("xcore-offline-offsets", cl::desc("Offline offsets"), cl::init(true), cl::cat(XformerCategory), cl::Hidden);