From 74d1b738aabbb132cc26f9eba560dae5876e7e7c Mon Sep 17 00:00:00 2001
From: Akash Kothari <134331715+akkothar@users.noreply.github.com>
Date: Thu, 15 Aug 2024 23:18:37 -0500
Subject: [PATCH] Correctly generate offsets for DMA memcpys for AIEs (#713)

* No need to insert an add instruction when loop bounds are correct

* Fix offset computation when folding loops

* clang-format

* fix format and syntax errors

* Add a test that demonstrates correctly splitting dma memcpys

* Add a test for folding offsets into strides in dma memcpys

* Remove debug print
---
 mlir/lib/Conversion/AIRRtToNpuPass.cpp        |   7 +-
 mlir/lib/Util/Util.cpp                        | 109 +--
 .../AIRRtToNpu/dma_memcpy_split.mlir          | 714 ++++++++++++++++++
 .../AIRRtToNpu/dma_offset_folding.mlir        | 233 ++++++
 4 files changed, 1011 insertions(+), 52 deletions(-)
 create mode 100644 mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir
 create mode 100644 mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir
diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
index 889e93acc..365317a5d 100644
--- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp
+++ b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -742,11 +742,8 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
     // Innermost tiled affine.for loop induction variable as lowest offset, if
     // original rank exceeds hw limit.
     new_opers.insert(new_opers.end(), offsets.begin(), offsets.end() - 1);
-    auto new_inner_offset = builder.create<arith::AddIOp>(
-        loc,
-        builder.create<arith::IndexCastOp>(loc, IntegerType::get(ctx, 64),
-                                           inner_affine_for_iv),
-        offsets.back());
+    auto new_inner_offset = builder.create<arith::IndexCastOp>(
+        loc, IntegerType::get(ctx, 64), inner_affine_for_iv);
     new_opers.push_back(new_inner_offset);
   } else
     new_opers.insert(new_opers.end(), offsets.begin(), offsets.end());
diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp
index 1647a1062..5a31589ef 100644
--- a/mlir/lib/Util/Util.cpp
+++ b/mlir/lib/Util/Util.cpp
@@ -907,6 +907,7 @@ LogicalResult eraseWrapNStrideDim(OpBuilder builder,
       offset_expr = offset_expr.replaceDimsAndSymbols({}, symReplacements);
       auto next_offset_map = AffineMap::get(0, 1, offset_expr);
       affine_apply.setMap(next_offset_map);
+      offsets[i] = affine_apply;
       offsets[i + 1] = offsets[i];
     }
     erased |= multiplyAdjWraps(builder, i, sizes);
@@ -927,7 +928,6 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder,
                                                  SmallVector<Value> &sizes,
                                                  SmallVector<Value> &strides,
                                                  int memref_volume) {
-
   bool listsHaveChanged = false;
   // Match offsets size with sizes and strides
   auto max_dim_size =
@@ -1004,33 +1004,85 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
 
   // Fold for loops int channel op's wrap and stride fields
   SmallVector<Operation *> for_loops;
+  SmallVector<Value> ivs;
   Operation *parent = channel_op;
   while (parent != for_op) {
     parent = parent->getParentOp();
-    if (isa<scf::ForOp>(parent))
+    if (auto sfo = dyn_cast<scf::ForOp>(parent)) {
       for_loops.push_back(parent);
-    else if (isa<affine::AffineForOp>(parent))
+      ivs.push_back(sfo.getInductionVar());
+    } else if (auto afo = dyn_cast<affine::AffineForOp>(parent)) {
       for_loops.push_back(parent);
+      ivs.push_back(afo.getInductionVar());
+    }
   }
 
-  // First traversal inserting new dimensions from loops
+  std::map<Operation *, int> op_to_count;
   for (auto o : for_loops) {
-    uint64_t ind_var_factor = 0;
+    int64_t stepSize = -1;
+    int loop_lower_bound = 0;
+    Value iv = nullptr;
+    if (auto afo = dyn_cast<affine::AffineForOp>(o)) {
+      iv = afo.getInductionVar();
+      loop_lower_bound = afo.getConstantLowerBound();
+      stepSize = afo.getStepAsInt();
+    } else if (auto sfo = dyn_cast<scf::ForOp>(o)) {
+      iv = sfo.getInductionVar();
+      if (auto cst_lower_bound = mlir::getConstantIntValue(sfo.getLowerBound()))
+        loop_lower_bound = *cst_lower_bound;
+      stepSize = *mlir::getConstantIntValue(sfo.getStep());
+    }
+    int64_t ind_var_factor = 0;
     for (int i = offsets.size() - 1; i >= 0; i--) {
-      Value iv = nullptr;
-      if (auto afo = dyn_cast<affine::AffineForOp>(o))
-        iv = afo.getInductionVar();
-      else if (auto sfo = dyn_cast<scf::ForOp>(o))
-        iv = sfo.getInductionVar();
       if (iv && offsets[i] == iv) {
         ind_var_factor = *getConstantIntValue(strides[i]);
+        offsets[i] = builder.template create<arith::ConstantIndexOp>(
+            loc, loop_lower_bound);
         break;
       } else if (iv && offsets[i].getDefiningOp()) {
         Operation *iv_consumer = offsets[i].getDefiningOp();
         if (auto exec = dyn_cast<air::ExecuteOp>(iv_consumer))
           iv_consumer = exec.getChildOp();
+        if (auto affop = dyn_cast<affine::AffineApplyOp>(iv_consumer)) {
+          // The IV must be the affine op's only symbol operand
+          if (affop.getSymbolOperands().size() == 1) {
+            bool iv_is_symbol = false;
+            for (auto val : affop.getSymbolOperands()) {
+              if (val == iv) {
+                iv_is_symbol = true;
+                break;
+              }
+            }
+            if (iv_is_symbol) {
+              auto map = affop.getAffineMap();
+              ind_var_factor = air::evaluateConstantsInMap(
+                                   map,
+                                   SmallVector<std::optional<int64_t>>{
+                                       std::optional<int64_t>{stepSize}},
+                                   for_op->getContext())
+                                   .value();
+              offsets[i] = builder.template create<arith::ConstantIndexOp>(
+                  loc, loop_lower_bound);
+              break;
+            }
+          }
+        }
         if (llvm::is_contained(iv_consumer->getOperands(), iv)) {
+          if (op_to_count.find(iv_consumer) == op_to_count.end()) {
+            op_to_count[iv_consumer] = 0;
+            for (auto operand : iv_consumer->getOperands()) {
+              for (auto iv_val : ivs) {
+                if (iv_val == operand)
+                  op_to_count[iv_consumer]++;
+              }
+            }
+          }
+          op_to_count[iv_consumer]--;
           ind_var_factor = *getConstantIntValue(strides[i]);
+          if (!op_to_count[iv_consumer]) {
+            offsets[i] = builder.template create<arith::ConstantIndexOp>(
+                loc, loop_lower_bound);
+          }
           break;
         }
       }
@@ -1042,11 +1094,6 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
       trip_count = *getStaticScfForTripCountAsInt(sfo);
     Value new_wrap =
         builder.template create<arith::ConstantIndexOp>(loc, trip_count);
-    int stepSize = -1;
-    if (auto afo = dyn_cast<affine::AffineForOp>(o))
-      stepSize = afo.getStepAsInt();
-    else if (auto sfo = dyn_cast<scf::ForOp>(o))
-      stepSize = *mlir::getConstantIntValue(sfo.getStep());
     int64_t new_stride_value =
         (stepSize * ind_var_factor) % getTensorVolume(memref.getType());
     Value new_stride =
@@ -1069,38 +1116,6 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
     wraps.insert(wraps.begin(), new_wrap);
     strides.insert(strides.begin(), new_stride);
   }
-
-  // Second traversal updating existing offsets
-  for (auto o : for_loops) {
-    for (int i = offsets.size() - 1; i >= 0; i--) {
-      Value iv = nullptr;
-      int loop_lower_bound = 0;
-      if (auto afo = dyn_cast<affine::AffineForOp>(o)) {
-        iv = afo.getInductionVar();
-        loop_lower_bound = afo.getConstantLowerBound();
-      } else if (auto sfo = dyn_cast<scf::ForOp>(o)) {
-        iv = sfo.getInductionVar();
-        if (auto cst_lower_bound =
-                mlir::getConstantIntValue(sfo.getLowerBound()))
-          loop_lower_bound = *cst_lower_bound;
-      }
-      if (iv && offsets[i] == iv) {
-        // Replace offset with for loop lower bound
-        offsets[i] = builder.template create<arith::ConstantIndexOp>(
-            loc, loop_lower_bound);
-        break;
-      } else if (iv && offsets[i].getDefiningOp()) {
-        Operation *iv_consumer = offsets[i].getDefiningOp();
-        if (auto exec = dyn_cast<air::ExecuteOp>(iv_consumer))
-          iv_consumer = exec.getChildOp();
-        if (llvm::is_contained(iv_consumer->getOperands(), iv)) {
-          offsets[i] = builder.template create<arith::ConstantIndexOp>(
-              loc, loop_lower_bound);
-          break;
-        }
-      }
-    }
-  }
   return success();
 }
 
diff --git a/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir b/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir
new file mode 100644
index 000000000..fd95aed61
--- /dev/null
+++ b/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir
@@ -0,0 +1,714 @@
+//===- dma_memcpy_split.mlir ---------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+
+// RUN: air-opt -airrt-to-npu --split-input-file %s | FileCheck %s
+
+
+// CHECK-LABEL: aie.device(npu1_4col)
+// CHECK: aie.shim_dma_allocation @airMemcpyId29(S2MM, 0, 0)
+// CHECK: memref.global "public" @airMemcpyId29 : memref<128x128xf32, 1>
+// CHECK: aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0)
+// CHECK: memref.global "public" @airMemcpyId4 : memref<128x256xbf16, 1>
+// CHECK: aie.shim_dma_allocation @airMemcpyId10(MM2S, 1, 0)
+// CHECK: memref.global "public" @airMemcpyId10 : memref<32x8x8x16xbf16, 1>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg2[0, 0, 0, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg2[0, 0, 0, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg2[0, 0, 0, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg2[0, 0, 128, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg2[0, 0, 128, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg2[0, 0, 128, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg2[0, 0, 128, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg2[0, 0, 256, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg2[0, 0, 256, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg2[0, 0, 256, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg2[0, 0, 256, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg2[0, 0, 384, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg2[0, 0, 384, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg2[0, 0, 384, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg2[0, 0, 384, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+
+module {
+  aie.device(npu1_4col) {
+    aie.shim_dma_allocation @airMemcpyId29(S2MM, 0, 0)
+    memref.global "public" @airMemcpyId29 : memref<128x128xf32, 1>
+    aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0)
+    memref.global "public" @airMemcpyId4 : memref<128x256xbf16, 1>
+    aie.shim_dma_allocation @airMemcpyId10(MM2S, 1, 0)
+    memref.global "public" @airMemcpyId10 : memref<32x8x8x16xbf16, 1>
+  } {sym_name = "forward_0"}
+  airrt.module_metadata{
+  }
+  func.func @forward(%arg0: memref<512x1024xbf16>, %arg1: memref<128x8x8x64xbf16>, %arg2: memref<512x512xf32>) -> memref<512x512xf32> {
+    %c384_i64 = arith.constant 384 : i64
+    %c48_i64 = arith.constant 48 : i64
+    %c3_i64 = arith.constant 3 : i64
+    %c32_i64 = arith.constant 32 : i64
+    %c2_i64 = arith.constant 2 : i64
+    %c0 = arith.constant 0 : index
+    %c16_i64 = arith.constant 16 : i64
+    %c8_i64 = arith.constant 8 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c128_i64 = arith.constant 128 : i64
+    %c4_i64 = arith.constant 4 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c1024_i64 = arith.constant 1024 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %c29_i32 = arith.constant 29 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c128 = arith.constant 128 : index
+    %c1024 = arith.constant 1024 : index
+    %c512 = arith.constant 512 : index
+    %c64 = arith.constant 64 : index
+    %p = airrt.segment_load "forward_0" : i64
+    %0 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %1 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %2 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_0 = airrt.segment_load "forward_0" : i64
+    %3 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %4 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %5 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_1 = airrt.segment_load "forward_0" : i64
+    %6 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %7 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %8 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_2 = airrt.segment_load "forward_0" : i64
+    %9 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %10 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %11 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_3 = airrt.segment_load "forward_0" : i64
+    %12 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %13 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %14 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_4 = airrt.segment_load "forward_0" : i64
+    %15 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %16 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %17 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_5 = airrt.segment_load "forward_0" : i64
+    %18 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %19 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %20 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_6 = airrt.segment_load "forward_0" : i64
+    %21 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %22 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %23 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_7 = airrt.segment_load "forward_0" : i64
+    %24 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %25 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %26 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_8 = airrt.segment_load "forward_0" : i64
+    %27 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %28 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %29 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_9 = airrt.segment_load "forward_0" : i64
+    %30 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %31 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %32 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_10 = airrt.segment_load "forward_0" : i64
+    %33 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %34 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %35 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_11 = airrt.segment_load "forward_0" : i64
+    %36 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %37 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %38 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_12 = airrt.segment_load "forward_0" : i64
+    %39 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %40 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %41 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_13 = airrt.segment_load "forward_0" : i64
+    %42 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %43 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %44 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    %p_14 = airrt.segment_load "forward_0" : i64
+    %45 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %46 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %47 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+      %48 = airrt.wait_all : !airrt.event
+      %49 = airrt.wait_all : !airrt.event
+      %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %51 = airrt.wait_all : !airrt.event
+      %52 = airrt.wait_all : !airrt.event
+      %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) {
+        %55 = airrt.wait_all %arg8, %arg5 : !airrt.event
+        %56 = airrt.wait_all %arg7 : !airrt.event
+        %57 = airrt.wait_all %arg7 : !airrt.event
+        airrt.wait_all %arg8, %arg5
+        %58 = airrt.wait_all : !airrt.event
+        %59 = airrt.wait_all %arg6 : !airrt.event
+        airrt.wait_all %arg6
+        %60 = airrt.wait_all : !airrt.event
+        scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event
+      }
+      %54 = airrt.wait_all %50#1, %53#1 : !airrt.event
+    }
+    return %arg2 : memref<512x512xf32>
+  }
+}
diff --git a/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir b/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir
new file mode 100644
index 000000000..75961ce13
--- /dev/null
+++ b/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir
@@ -0,0 +1,233 @@
+//===- dma_offset_folding.mlir ---------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------------===//
+
+
+// RUN: air-opt -airrt-to-npu --split-input-file %s | FileCheck %s
+
+//
+// Test correctness of the offsets, wraps and strides generated when
+// airrt.dma_memcpy_nd ops are lowered to aiex.npu.dma_memcpy_nd.
+//
+// CHECK-LABEL: aie.device(npu1_4col)
+// CHECK: aie.shim_dma_allocation @airMemcpyId19(S2MM, 0, 0)
+// CHECK: memref.global "public" @airMemcpyId19 : memref<128x128xf32, 1>
+// CHECK: aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0)
+// CHECK: memref.global "public" @airMemcpyId4 : memref<128x128xbf16, 1>
+// CHECK: aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0)
+// CHECK: memref.global "public" @airMemcpyId5 : memref<16x8x8x16xbf16, 1>
+
+
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg2[0, 0, 0, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg2[0, 0, 0, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg2[0, 0, 0, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg2[0, 0, 128, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg2[0, 0, 128, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg2[0, 0, 128, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg2[0, 0, 128, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg2[0, 0, 256, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg2[0, 0, 256, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg2[0, 0, 256, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg2[0, 0, 256, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg2[0, 0, 384, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg2[0, 0, 384, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg2[0, 0, 384, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32>
+// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg2[0, 0, 384, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32>
+// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+
+module {
+  aie.device(npu1_4col) { // declares the symbols that the memcpys in @forward name via their `metadata` attribute
+    aie.shim_dma_allocation @airMemcpyId19(S2MM, 0, 0) // named by the %arg2 (f32) memcpys
+    memref.global "public" @airMemcpyId19 : memref<128x128xf32, 1>
+    aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) // named by the %arg0 (bf16) memcpys
+    memref.global "public" @airMemcpyId4 : memref<128x128xbf16, 1>
+    aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) // named by the %arg1 (bf16) memcpys
+    memref.global "public" @airMemcpyId5 : memref<16x8x8x16xbf16, 1>
+  } {sym_name = "forward_0"} // same name the airrt.segment_load ops in @forward use
+  airrt.module_metadata{ // metadata region is empty in this test
+  }
+  func.func @forward(%arg0: memref<512x128xbf16>, %arg1: memref<16x8x8x64xbf16>, %arg2: memref<512x512xf32>) -> memref<512x512xf32> {
+    %c384_i64 = arith.constant 384 : i64
+    %c48_i64 = arith.constant 48 : i64
+    %c3_i64 = arith.constant 3 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %c2_i64 = arith.constant 2 : i64
+    %c8_i64 = arith.constant 8 : i64
+    %c16_i64 = arith.constant 16 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c4_i64 = arith.constant 4 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c128_i64 = arith.constant 128 : i64
+    %c32_i64 = arith.constant 32 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %c19_i32 = arith.constant 19 : i32
+    %c5_i32 = arith.constant 5 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %p = airrt.segment_load "forward_0" : i64
+    %0 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %1 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %2 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_0 = airrt.segment_load "forward_0" : i64
+    %3 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %4 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %5 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_1 = airrt.segment_load "forward_0" : i64
+    %6 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %7 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %8 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_2 = airrt.segment_load "forward_0" : i64
+    %9 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %10 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %11 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_3 = airrt.segment_load "forward_0" : i64
+    %12 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %13 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %14 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_4 = airrt.segment_load "forward_0" : i64
+    %15 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %16 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %17 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_5 = airrt.segment_load "forward_0" : i64
+    %18 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %19 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %20 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_6 = airrt.segment_load "forward_0" : i64
+    %21 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %22 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %23 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_7 = airrt.segment_load "forward_0" : i64
+    %24 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %25 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %26 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_8 = airrt.segment_load "forward_0" : i64
+    %27 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %28 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %29 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_9 = airrt.segment_load "forward_0" : i64
+    %30 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %31 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %32 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_10 = airrt.segment_load "forward_0" : i64
+    %33 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %34 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %35 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_11 = airrt.segment_load "forward_0" : i64
+    %36 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %37 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %38 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_12 = airrt.segment_load "forward_0" : i64
+    %39 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %40 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %41 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_13 = airrt.segment_load "forward_0" : i64
+    %42 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %43 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %44 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    %p_14 = airrt.segment_load "forward_0" : i64
+    %45 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %46 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    %47 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+    affine.for %arg3 = 0 to 1 {
+      %h = airrt.herd_load "herd_0" : i64
+    }
+    return %arg2 : memref<512x512xf32>
+  }
+}