Avoid over-optimization of SHIM DMA BDs, if the outcome is stride>1M (Xilinx#553)

* Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead

* Fixup clang format
erwei-xilinx authored Apr 25, 2024
1 parent 4faaa09 commit 191873f
Showing 5 changed files with 64 additions and 7 deletions.
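
For context, the legality check added in mlir/lib/Util/Util.cpp below measures strides in 4-byte words and rejects anything above 0x100000 (1M) words. The following is a minimal standalone C++ sketch of that test using the numbers from the new test case; the helper name and structure are illustrative, not repository code.

#include <cstdint>
#include <iostream>

// SHIM DMA BD strides are counted here in 4-byte words; the pass rejects any
// folded stride above 0x100000 (1M) words.
constexpr int64_t kMaxBdStrideWords = 0x100000;

bool foldedStrideIsLegal(int64_t strideInElements, int64_t elementSizeBytes) {
  // Mirrors ceilDiv(stride_bytes, 4) from the patch.
  int64_t strideWords = (strideInElements * elementSizeBytes + 3) / 4;
  return strideWords <= kMaxBdStrideWords;
}

int main() {
  // Numbers from the new func18 test on memref<8192x32768xi32> (i32 = 4 bytes):
  std::cout << foldedStrideIsLegal(64 * 32768, 4) << "\n"; // 0: exceeds 1M, unroll BDs
  std::cout << foldedStrideIsLegal(64, 4) << "\n";         // 1: folds into a BD dimension
}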

mlir/include/air/Util/Util.h (2 changes: 1 addition & 1 deletion)
@@ -140,7 +140,7 @@ std::vector<unsigned> getMDVectorFromIterator(std::vector<unsigned> dims,
 void getDefiningOpsToOperands(Operation *op, SmallVector<Operation *> &def_ops);
 
 // Fold perfectly nested parent loops into wraps and strides list
-void foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult foldForLoopNestAsExtendedSizesAndStrides(
     OpBuilder builder, Operation *for_op, Operation *channel_op,
     SmallVector<Value> &offsets, SmallVector<Value> &wraps,
     SmallVector<Value> &strides, Value memref);

mlir/lib/Conversion/AIRRtToNpuPass.cpp (4 changes: 3 additions & 1 deletion)
@@ -740,9 +740,11 @@ specializeAffineForInAIRRtDmaWrapAndStride(OpBuilder builder,
          builder.create<arith::ConstantIndexOp>(loc, current_stride));
    }
  }
-  xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
+  auto res = xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
      builder, for_op.getOperation(), memcpy_ops[0].getOperation(), offsets,
      wraps, strides, memcpy_ops[0]->getOperand(3));
+  if (res.failed())
+    return failure();
 
  if (offsets.size() > 4 || wraps.size() > 4 || strides.size() > 4)
    return failure();

mlir/lib/Transform/AIRDependencyScheduleOpt.cpp (8 changes: 6 additions & 2 deletions)
@@ -1722,9 +1722,11 @@ struct AIRSpecializeChannelWrapAndStrideInScfFor
    populateDefaultWrapsAndStrides(rewriter, channel_ops[0].getMemref(),
                                   offsets, wraps, strides);
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
        rewriter, for_op.getOperation(), channel_ops[0].getOperation(), offsets,
        wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
    (void)canonicalizeWrapAndStrideList(
        rewriter, offsets, wraps, strides,
@@ -1823,9 +1825,11 @@ struct AIRSpecializeChannelWrapAndStrideInAffineFor
        rewriter, offsets, wraps, strides,
        air::getTensorVolume(channel_ops[0].getMemref().getType()));
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
        rewriter, for_op.getOperation(), channel_ops[0].getOperation(), offsets,
        wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
    (void)canonicalizeWrapAndStrideList(
        rewriter, offsets, wraps, strides,

mlir/lib/Util/Util.cpp (15 changes: 12 additions & 3 deletions)
@@ -903,7 +903,7 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder,
 }
 
 // Fold perfectly nested for loops as extra entries in wraps and strides
-void air::foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
    OpBuilder builder, Operation *for_op, Operation *channel_op,
    SmallVector<Value> &offsets, SmallVector<Value> &wraps,
    SmallVector<Value> &strides, Value memref) {
@@ -972,13 +972,22 @@ void air::foldForLoopNestAsExtendedSizesAndStrides(
      stepSize = afo.getStepAsInt();
    else if (auto sfo = dyn_cast<scf::ForOp>(o))
      stepSize = *mlir::getConstantIntValue(sfo.getStep());
-    Value new_stride = builder.template create<arith::ConstantIndexOp>(
-        loc, (stepSize * ind_var_factor) % getTensorVolume(memref.getType()));
+    int new_stride_value =
+        (stepSize * ind_var_factor) % getTensorVolume(memref.getType());
+    Value new_stride =
+        builder.template create<arith::ConstantIndexOp>(loc, new_stride_value);
+
+    // Check for compliance with DMA BD hardware limitation (<= 1M).
+    if (mlir::ceilDiv(
+            new_stride_value * getElementSizeInBytes(memref.getType()), 4) >
+        0x100000)
+      return failure();
 
    // Insert new dimension into the wraps and strides list.
    wraps.insert(wraps.begin(), new_wrap);
    strides.insert(strides.begin(), new_stride);
  }
+  return success();
 }
 
 // If wrap-and-stride lists are empty, populate them with default data access
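
To make the folded-stride computation concrete: each perfectly nested loop contributes a leading (wrap, stride) pair, where the stride is the loop step times the element offset contributed per unit of induction variable (ind_var_factor), taken modulo the memref volume; with this commit the fold now fails outright when that stride exceeds the 1M-word limit, letting callers unroll BDs instead. A minimal sketch under those assumed semantics follows (names, types, and loop ordering are illustrative, not the repository's).

#include <cstdint>
#include <optional>
#include <tuple>
#include <vector>

struct WrapStride {
  int64_t wrap;   // trip count of the folded loop
  int64_t stride; // elements advanced per iteration
};

// One (tripCount, stepSize, indVarFactor) triple per loop, innermost first.
std::optional<std::vector<WrapStride>>
foldLoopNest(const std::vector<std::tuple<int64_t, int64_t, int64_t>> &loops,
             int64_t volume, int64_t elementSizeBytes) {
  std::vector<WrapStride> dims;
  for (const auto &[trip, step, factor] : loops) {
    int64_t stride = (step * factor) % volume;
    // New in this commit: bail out rather than emit an illegal BD dimension.
    if ((stride * elementSizeBytes + 3) / 4 > 0x100000)
      return std::nullopt;
    dims.insert(dims.begin(), WrapStride{trip, stride});
  }
  return dims;
}

With the test's inner loop, foldLoopNest({{4, 1, 64}}, 8192LL * 32768, 4) yields wrap 4, stride 64; substituting the outer loop's factor of 64 * 32768 trips the check and the fold is abandoned.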

mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir (42 changes: 42 additions & 0 deletions)
@@ -808,3 +808,45 @@ module {
    return
  }
 }
+
+// -----
+
+// Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead.
+
+// CHECK-LABEL: aie.device(npu)
+// CHECK: func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>)
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+
+#map = affine_map<()[s0] -> (s0 * 64)>
+module {
+  aie.device(npu) {
+    %tile_0_0 = aie.tile(0, 0)
+    aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
+    memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
+  } {sym_name = "segment_0"}
+  func.func @func18() {
+    %c32768_i64 = arith.constant 32768 : i64
+    %c8_i64 = arith.constant 8 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c26_i32 = arith.constant 26 : i32
+    %c15_i32 = arith.constant 15 : i32
+    %c14_i32 = arith.constant 14 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %alloc = memref.alloc() : memref<8192x32768xi32>
+    affine.for %arg3 = 0 to 4 {
+      affine.for %arg4 = 0 to 4 {
+        %10 = affine.apply #map()[%arg3]
+        %11 = affine.apply #map()[%arg4]
+        %12 = arith.index_cast %arg3 : index to i64
+        %13 = arith.index_cast %arg4 : index to i64
+        %14 = arith.index_cast %10 : index to i64
+        %15 = arith.index_cast %11 : index to i64
+        %16 = airrt.dma_memcpy_nd(%c26_i32, %12, %13, %alloc[%c0_i64, %c0_i64, %14, %15], [%c1_i64, %c1_i64, %c64_i64, %c64_i64], [%c0_i64, %c0_i64, %c32768_i64]) {metadata = @airMemcpyId26} : (i32, i64, i64, memref<8192x32768xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+        %p = airrt.segment_load "segment_0" : i64
+      }
+    }
+    return
+  }
+}
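
Walking the arithmetic in this test: folding the outer %arg3 loop would add a stride of 64 * 32768 = 2,097,152 i32 elements, i.e. 2,097,152 four-byte words, which exceeds the 0x100000 (1,048,576) word limit, so that loop is unrolled into separate BDs; the inner %arg4 loop advances only 64 elements per iteration and folds into the [wrap 4, stride 64] dimension visible in the CHECK line above.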
