From 74d1b738aabbb132cc26f9eba560dae5876e7e7c Mon Sep 17 00:00:00 2001 From: Akash Kothari <134331715+akkothar@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:18:37 -0500 Subject: [PATCH] Correctly generate offsets for DMA memcpys for AIEs (#713) * No need to insert an add instruction when loop bounds are correct * Fix offset computation when folding loops * clang-format * fix format and syntax errors * Add a test that demonstrates correctly splitting dma memcpys * Add a test that tests folding of offsets into strides in dma memcpys * Remove debug print --- mlir/lib/Conversion/AIRRtToNpuPass.cpp | 7 +- mlir/lib/Util/Util.cpp | 109 +-- .../AIRRtToNpu/dma_memcpy_split.mlir | 714 ++++++++++++++++++ .../AIRRtToNpu/dma_offset_folding.mlir | 233 ++++++ 4 files changed, 1011 insertions(+), 52 deletions(-) create mode 100644 mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir create mode 100644 mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp index 889e93acc..365317a5d 100644 --- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp +++ b/mlir/lib/Conversion/AIRRtToNpuPass.cpp @@ -742,11 +742,8 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) { // Innermost tiled affine.for loop induction variable as lowest offset, if // original rank exceeds hw limit. 
new_opers.insert(new_opers.end(), offsets.begin(), offsets.end() - 1); - auto new_inner_offset = builder.create( - loc, - builder.create(loc, IntegerType::get(ctx, 64), - inner_affine_for_iv), - offsets.back()); + auto new_inner_offset = builder.create( + loc, IntegerType::get(ctx, 64), inner_affine_for_iv); new_opers.push_back(new_inner_offset); } else new_opers.insert(new_opers.end(), offsets.begin(), offsets.end()); diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp index 1647a1062..5a31589ef 100644 --- a/mlir/lib/Util/Util.cpp +++ b/mlir/lib/Util/Util.cpp @@ -907,6 +907,7 @@ LogicalResult eraseWrapNStrideDim(OpBuilder builder, offset_expr = offset_expr.replaceDimsAndSymbols({}, symReplacements); auto next_offset_map = AffineMap::get(0, 1, offset_expr); affine_apply.setMap(next_offset_map); + offsets[i] = affine_apply; offsets[i + 1] = offsets[i]; } erased |= multiplyAdjWraps(builder, i, sizes); @@ -927,7 +928,6 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder, SmallVector &sizes, SmallVector &strides, int memref_volume) { - bool listsHaveChanged = false; // Match offsets size with sizes and strides auto max_dim_size = @@ -1004,33 +1004,85 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides( // Fold for loops int channel op's wrap and stride fields SmallVector for_loops; + SmallVector ivs; Operation *parent = channel_op; while (parent != for_op) { parent = parent->getParentOp(); - if (isa(parent)) + if (auto sfo = dyn_cast(parent)) { for_loops.push_back(parent); - else if (isa(parent)) + ivs.push_back(sfo.getInductionVar()); + } else if (auto afo = dyn_cast(parent)) { for_loops.push_back(parent); + ivs.push_back(afo.getInductionVar()); + } } - // First traversal inserting new dimensions from loops + std::map op_to_count; for (auto o : for_loops) { - uint64_t ind_var_factor = 0; + int64_t stepSize = -1; + int loop_lower_bound = 0; + Value iv = nullptr; + if (auto afo = dyn_cast(o)) { + iv = afo.getInductionVar(); + 
loop_lower_bound = afo.getConstantLowerBound(); + stepSize = afo.getStepAsInt(); + } else if (auto sfo = dyn_cast(o)) { + iv = sfo.getInductionVar(); + if (auto cst_lower_bound = mlir::getConstantIntValue(sfo.getLowerBound())) + loop_lower_bound = *cst_lower_bound; + stepSize = *mlir::getConstantIntValue(sfo.getStep()); + } + int64_t ind_var_factor = 0; for (int i = offsets.size() - 1; i >= 0; i--) { - Value iv = nullptr; - if (auto afo = dyn_cast(o)) - iv = afo.getInductionVar(); - else if (auto sfo = dyn_cast(o)) - iv = sfo.getInductionVar(); if (iv && offsets[i] == iv) { ind_var_factor = *getConstantIntValue(strides[i]); + offsets[i] = builder.template create( + loc, loop_lower_bound); break; } else if (iv && offsets[i].getDefiningOp()) { Operation *iv_consumer = offsets[i].getDefiningOp(); if (auto exec = dyn_cast(iv_consumer)) iv_consumer = exec.getChildOp(); + if (auto affop = dyn_cast(iv_consumer)) { + // The induction variable must be the input to the affine op + if (affop.getSymbolOperands().size() == 1) { + bool iv_is_symbol = false; + for (auto val : affop.getSymbolOperands()) { + if (val == iv) { + iv_is_symbol = true; + break; + } + } + if (iv_is_symbol) { + auto map = affop.getAffineMap(); + ind_var_factor = air::evaluateConstantsInMap( + map, + SmallVector>{ + std::optional{stepSize}}, + for_op->getContext()) + .value(); + offsets[i] = builder.template create( + loc, loop_lower_bound); + break; + } + } + } if (llvm::is_contained(iv_consumer->getOperands(), iv)) { + if (op_to_count.find(iv_consumer) == op_to_count.end()) { + op_to_count[iv_consumer] = 0; + for (auto operand : iv_consumer->getOperands()) { + for (auto iv_val : ivs) { + if (iv_val == operand) + op_to_count[iv_consumer]++; + } + } + } + op_to_count[iv_consumer]--; ind_var_factor = *getConstantIntValue(strides[i]); + if (!op_to_count[iv_consumer]) { + offsets[i] = builder.template create( + loc, loop_lower_bound); + } break; } } @@ -1042,11 +1094,6 @@ LogicalResult 
air::foldForLoopNestAsExtendedSizesAndStrides( trip_count = *getStaticScfForTripCountAsInt(sfo); Value new_wrap = builder.template create(loc, trip_count); - int stepSize = -1; - if (auto afo = dyn_cast(o)) - stepSize = afo.getStepAsInt(); - else if (auto sfo = dyn_cast(o)) - stepSize = *mlir::getConstantIntValue(sfo.getStep()); int64_t new_stride_value = (stepSize * ind_var_factor) % getTensorVolume(memref.getType()); Value new_stride = @@ -1069,38 +1116,6 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides( wraps.insert(wraps.begin(), new_wrap); strides.insert(strides.begin(), new_stride); } - - // Second traversal updating existing offsets - for (auto o : for_loops) { - for (int i = offsets.size() - 1; i >= 0; i--) { - Value iv = nullptr; - int loop_lower_bound = 0; - if (auto afo = dyn_cast(o)) { - iv = afo.getInductionVar(); - loop_lower_bound = afo.getConstantLowerBound(); - } else if (auto sfo = dyn_cast(o)) { - iv = sfo.getInductionVar(); - if (auto cst_lower_bound = - mlir::getConstantIntValue(sfo.getLowerBound())) - loop_lower_bound = *cst_lower_bound; - } - if (iv && offsets[i] == iv) { - // Replace offset with for loop lower bound - offsets[i] = builder.template create( - loc, loop_lower_bound); - break; - } else if (iv && offsets[i].getDefiningOp()) { - Operation *iv_consumer = offsets[i].getDefiningOp(); - if (auto exec = dyn_cast(iv_consumer)) - iv_consumer = exec.getChildOp(); - if (llvm::is_contained(iv_consumer->getOperands(), iv)) { - offsets[i] = builder.template create( - loc, loop_lower_bound); - break; - } - } - } - } return success(); } diff --git a/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir b/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir new file mode 100644 index 000000000..fd95aed61 --- /dev/null +++ b/mlir/test/Conversion/AIRRtToNpu/dma_memcpy_split.mlir @@ -0,0 +1,714 @@ +//===- dma_memcpy_split.mlir ---------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2023, Advanced Micro Devices, 
Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + + +// RUN: air-opt -airrt-to-npu --split-input-file %s | FileCheck %s + + +// CHECK-LABEL: aie.device(npu1_4col) +// CHECK: aie.shim_dma_allocation @airMemcpyId29(S2MM, 0, 0) +// CHECK: memref.global "public" @airMemcpyId29 : memref<128x128xf32, 1> +// CHECK: aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) +// CHECK: memref.global "public" @airMemcpyId4 : memref<128x256xbf16, 1> +// CHECK: aie.shim_dma_allocation @airMemcpyId10(MM2S, 1, 0) +// CHECK: memref.global "public" @airMemcpyId10 : memref<32x8x8x16xbf16, 1> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, 
metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg2[0, 0, 0, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg2[0, 0, 0, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg0[0, 0, 0, 0][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, 
metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg2[0, 0, 0, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg2[0, 0, 128, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg2[0, 0, 128, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg2[0, 0, 128, 256][1, 1, 128, 128][0, 
0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg0[0, 0, 0, 65536][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg2[0, 0, 128, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 
8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg2[0, 0, 256, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg2[0, 0, 256, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 131088][32, 8, 
8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg2[0, 0, 256, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg0[0, 0, 0, 131072][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg2[0, 0, 256, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 0][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 
65536][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 131072][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 196608][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg2[0, 0, 384, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 8][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 65544][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 131080][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 196616][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg2[0, 0, 384, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, 
%arg1[0, 0, 0, 16][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 65552][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 131088][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 196624][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg2[0, 0, 384, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg0[0, 0, 0, 196608][1, 4, 128, 128][0, 128, 512, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 24][32, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 65560][32, 8, 8, 8][2048, 32, 256, 1]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 131096][32, 8, 8, 8][2048, 32, 256, 1]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 196632][32, 8, 8, 8][2048, 32, 256, 1]) {id = 4 : i64, metadata = @airMemcpyId10} : memref<262144xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg2[0, 0, 384, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 5 : i64, metadata = @airMemcpyId29} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + +module { + 
aie.device(npu1_4col) { + aie.shim_dma_allocation @airMemcpyId29(S2MM, 0, 0) + memref.global "public" @airMemcpyId29 : memref<128x128xf32, 1> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<128x256xbf16, 1> + aie.shim_dma_allocation @airMemcpyId10(MM2S, 1, 0) + memref.global "public" @airMemcpyId10 : memref<32x8x8x16xbf16, 1> + } {sym_name = "forward_0"} + airrt.module_metadata{ + } + func.func @forward(%arg0: memref<512x1024xbf16>, %arg1: memref<128x8x8x64xbf16>, %arg2: memref<512x512xf32>) -> memref<512x512xf32> { + %c384_i64 = arith.constant 384 : i64 + %c48_i64 = arith.constant 48 : i64 + %c3_i64 = arith.constant 3 : i64 + %c32_i64 = arith.constant 32 : i64 + %c2_i64 = arith.constant 2 : i64 + %c0 = arith.constant 0 : index + %c16_i64 = arith.constant 16 : i64 + %c8_i64 = arith.constant 8 : i64 + %c512_i64 = arith.constant 512 : i64 + %c64_i64 = arith.constant 64 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c128_i64 = arith.constant 128 : i64 + %c4_i64 = arith.constant 4 : i64 + %c1_i64 = arith.constant 1 : i64 + %c1024_i64 = arith.constant 1024 : i64 + %c256_i64 = arith.constant 256 : i64 + %c0_i64 = arith.constant 0 : i64 + %c29_i32 = arith.constant 29 : i32 + %c10_i32 = arith.constant 10 : i32 + %c4_i32 = arith.constant 4 : i32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c512 = arith.constant 512 : index + %c64 = arith.constant 64 : index + %p = airrt.segment_load "forward_0" : i64 + %0 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %1 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) 
{metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %2 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_0 = airrt.segment_load "forward_0" : i64 + %3 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c1_i64, 
%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %4 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %5 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : 
!airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_1 = airrt.segment_load "forward_0" : i64 + %6 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %7 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %8 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : 
!airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_2 = airrt.segment_load "forward_0" : i64 + %9 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %10 = airrt.dma_memcpy_nd(%c10_i32, %c0_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %11 = airrt.dma_memcpy_nd(%c29_i32, %c0_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = 
%49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_3 = airrt.segment_load "forward_0" : i64 + %12 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %13 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %14 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c0_i64], 
[%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_4 = airrt.segment_load "forward_0" : i64 + %15 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : 
!airrt.event + %16 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %17 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, 
!airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_5 = airrt.segment_load "forward_0" : i64 + %18 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %19 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %20 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = 
%52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_6 = airrt.segment_load "forward_0" : i64 + %21 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %22 = airrt.dma_memcpy_nd(%c10_i32, %c1_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %23 = airrt.dma_memcpy_nd(%c29_i32, %c1_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + 
airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_7 = airrt.segment_load "forward_0" : i64 + %24 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %25 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %26 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 
to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_8 = airrt.segment_load "forward_0" : i64 + %27 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %28 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, 
memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %29 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_9 = airrt.segment_load "forward_0" : i64 + %30 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], 
[%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %31 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %32 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, 
%arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_10 = airrt.segment_load "forward_0" : i64 + %33 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %34 = airrt.dma_memcpy_nd(%c10_i32, %c2_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %35 = airrt.dma_memcpy_nd(%c29_i32, %c2_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, 
!airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_11 = airrt.segment_load "forward_0" : i64 + %36 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %37 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %38 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = 
%49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_12 = airrt.segment_load "forward_0" : i64 + %39 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %40 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %41 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c128_i64], [%c1_i64, %c1_i64, 
%c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_13 = airrt.segment_load "forward_0" : i64 + %42 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + 
%43 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %44 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, 
!airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + %p_14 = airrt.segment_load "forward_0" : i64 + %45 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c256_i64], [%c0_i64, %c256_i64, %c1024_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x1024xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %46 = airrt.dma_memcpy_nd(%c10_i32, %c3_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c128_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId10} : (i32, i64, i64, memref<128x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %47 = airrt.dma_memcpy_nd(%c29_i32, %c3_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId29} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + %48 = airrt.wait_all : !airrt.event + %49 = airrt.wait_all : !airrt.event + %50:4 = scf.for %arg4 = %c0 to %c1024 step %c512 iter_args(%arg5 = %48, %arg6 = %49, %arg7 = %49, %arg8 = %49) -> (!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %51 = airrt.wait_all : !airrt.event + %52 = airrt.wait_all : !airrt.event + %53:4 = scf.for %arg4 = %c0 to %c128 step %c64 iter_args(%arg5 = %51, %arg6 = %52, %arg7 = %52, %arg8 = %52) -> 
(!airrt.event, !airrt.event, !airrt.event, !airrt.event) { + %55 = airrt.wait_all %arg8, %arg5 : !airrt.event + %56 = airrt.wait_all %arg7 : !airrt.event + %57 = airrt.wait_all %arg7 : !airrt.event + airrt.wait_all %arg8, %arg5 + %58 = airrt.wait_all : !airrt.event + %59 = airrt.wait_all %arg6 : !airrt.event + airrt.wait_all %arg6 + %60 = airrt.wait_all : !airrt.event + scf.yield %58, %60, %60, %59 : !airrt.event, !airrt.event, !airrt.event, !airrt.event + } + %54 = airrt.wait_all %50#1, %53#1 : !airrt.event + } + return %arg2 : memref<512x512xf32> + } +} diff --git a/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir b/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir new file mode 100644 index 000000000..75961ce13 --- /dev/null +++ b/mlir/test/Conversion/AIRRtToNpu/dma_offset_folding.mlir @@ -0,0 +1,233 @@ +//===- dma_offset_folding.mlir ---------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------------===// + + +// RUN: air-opt -airrt-to-npu --split-input-file %s | FileCheck %s + +// +// Test correctness of generated offsets, wraps and strides +// +// +// CHECK-LABEL: aie.device(npu1_4col) +// CHECK: aie.shim_dma_allocation @airMemcpyId19(S2MM, 0, 0) +// CHECK: memref.global "public" @airMemcpyId19 : memref<128x128xf32, 1> +// CHECK: aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) +// CHECK: memref.global "public" @airMemcpyId4 : memref<128x128xbf16, 1> +// CHECK: aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) +// CHECK: memref.global "public" @airMemcpyId5 : memref<16x8x8x16xbf16, 1> + + +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} 
: memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 1, %arg2[0, 0, 0, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 2, %arg2[0, 0, 0, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg0[0, 0, 0, 0][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 3, %arg2[0, 0, 0, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : 
i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 0, %arg2[0, 0, 128, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 1, %arg2[0, 0, 128, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 2, %arg2[0, 0, 128, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg0[0, 0, 0, 8192][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : 
i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(1, 3, %arg2[0, 0, 128, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 0, %arg2[0, 0, 256, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 1, %arg2[0, 0, 256, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : 
memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 2, %arg2[0, 0, 256, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg0[0, 0, 0, 16384][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(2, 3, %arg2[0, 0, 256, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg1[0, 0, 0, 0][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 0, %arg2[0, 0, 384, 0][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg1[0, 0, 0, 8][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 1, %arg2[0, 0, 384, 128][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg1[0, 0, 0, 16][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 2, %arg2[0, 0, 384, 256][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg0[0, 0, 0, 24576][1, 4, 128, 16][0, 16, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg1[0, 0, 0, 24][16, 8, 8, 8][2048, 32, 256, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(3, 3, %arg2[0, 0, 384, 384][1, 1, 128, 128][0, 0, 512, 1]) {id = 2 : i64, metadata = @airMemcpyId19} : memref<512x512xf32> +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + +module { + aie.device(npu1_4col) { + aie.shim_dma_allocation @airMemcpyId19(S2MM, 0, 0) + memref.global "public" @airMemcpyId19 : memref<128x128xf32, 1> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<128x128xbf16, 1> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x8x8x16xbf16, 1> + } {sym_name = "forward_0"} + airrt.module_metadata{ + } + func.func @forward(%arg0: memref<512x128xbf16>, %arg1: memref<16x8x8x64xbf16>, %arg2: memref<512x512xf32>) -> memref<512x512xf32> { + %c384_i64 = arith.constant 384 : i64 + %c48_i64 = arith.constant 48 : i64 + 
%c3_i64 = arith.constant 3 : i64 + %c256_i64 = arith.constant 256 : i64 + %c2_i64 = arith.constant 2 : i64 + %c8_i64 = arith.constant 8 : i64 + %c16_i64 = arith.constant 16 : i64 + %c512_i64 = arith.constant 512 : i64 + %c64_i64 = arith.constant 64 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c4_i64 = arith.constant 4 : i64 + %c1_i64 = arith.constant 1 : i64 + %c128_i64 = arith.constant 128 : i64 + %c32_i64 = arith.constant 32 : i64 + %c0_i64 = arith.constant 0 : i64 + %c19_i32 = arith.constant 19 : i32 + %c5_i32 = arith.constant 5 : i32 + %c4_i32 = arith.constant 4 : i32 + %p = airrt.segment_load "forward_0" : i64 + %0 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %1 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %2 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_0 = airrt.segment_load "forward_0" : i64 + %3 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : 
!airrt.event + %4 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %5 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_1 = airrt.segment_load "forward_0" : i64 + %6 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %7 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %8 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_2 = airrt.segment_load "forward_0" : i64 + %9 = airrt.dma_memcpy_nd(%c4_i32, %c0_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) 
{metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %10 = airrt.dma_memcpy_nd(%c5_i32, %c0_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %11 = airrt.dma_memcpy_nd(%c19_i32, %c0_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c0_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_3 = airrt.segment_load "forward_0" : i64 + %12 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %13 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %14 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_4 = airrt.segment_load "forward_0" : i64 + %15 = 
airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %16 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %17 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_5 = airrt.segment_load "forward_0" : i64 + %18 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %19 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %20 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, 
i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_6 = airrt.segment_load "forward_0" : i64 + %21 = airrt.dma_memcpy_nd(%c4_i32, %c1_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c128_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %22 = airrt.dma_memcpy_nd(%c5_i32, %c1_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %23 = airrt.dma_memcpy_nd(%c19_i32, %c1_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c128_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_7 = airrt.segment_load "forward_0" : i64 + %24 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %25 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %26 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, 
%c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_8 = airrt.segment_load "forward_0" : i64 + %27 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %28 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %29 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_9 = airrt.segment_load "forward_0" : i64 + %30 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %31 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %32 = 
airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_10 = airrt.segment_load "forward_0" : i64 + %33 = airrt.dma_memcpy_nd(%c4_i32, %c2_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c256_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %34 = airrt.dma_memcpy_nd(%c5_i32, %c2_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %35 = airrt.dma_memcpy_nd(%c19_i32, %c2_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c256_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_11 = airrt.segment_load "forward_0" : i64 + %36 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %37 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) 
{metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %38 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c0_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_12 = airrt.segment_load "forward_0" : i64 + %39 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c1_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %40 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c1_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c16_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %41 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c1_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c128_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_13 = airrt.segment_load "forward_0" : i64 + %42 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c2_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %43 = 
airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c2_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c32_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %44 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c2_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c256_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + %p_14 = airrt.segment_load "forward_0" : i64 + %45 = airrt.dma_memcpy_nd(%c4_i32, %c3_i64, %c3_i64, %arg0[%c0_i64, %c0_i64, %c384_i64, %c0_i64], [%c1_i64, %c4_i64, %c128_i64, %c32_i64], [%c0_i64, %c32_i64, %c128_i64]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<512x128xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %46 = airrt.dma_memcpy_nd(%c5_i32, %c3_i64, %c3_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c48_i64], [%c16_i64, %c8_i64, %c8_i64, %c16_i64], [%c4096_i64, %c64_i64, %c512_i64]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<16x8x8x64xbf16>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + %47 = airrt.dma_memcpy_nd(%c19_i32, %c3_i64, %c3_i64, %arg2[%c0_i64, %c0_i64, %c384_i64, %c384_i64], [%c1_i64, %c1_i64, %c128_i64, %c128_i64], [%c0_i64, %c0_i64, %c512_i64]) {metadata = @airMemcpyId19} : (i32, i64, i64, memref<512x512xf32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + affine.for %arg3 = 0 to 1 { + %h = airrt.herd_load "herd_0" : i64 + } + return %arg2 : memref<512x512xf32> + } +}