From 191873fc7b7fae34f3c63aa814742cad6afbeb29 Mon Sep 17 00:00:00 2001
From: erwei-xilinx
Date: Thu, 25 Apr 2024 15:28:12 -0700
Subject: [PATCH] Avoid over-optimization of SHIM DMA BDs, if the outcome is
 stride>1M (#553)

* Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead

* Fixup clang format
---
 mlir/include/air/Util/Util.h                      |  2 +-
 mlir/lib/Conversion/AIRRtToNpuPass.cpp            |  4 +-
 .../Transform/AIRDependencyScheduleOpt.cpp        |  8 +++-
 mlir/lib/Util/Util.cpp                            | 15 +++++--
 .../Conversion/AIRRtToNpu/airrt_to_npu.mlir       | 42 +++++++++++++++++++
 5 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h
index 1377e95ee..a2b51b151 100644
--- a/mlir/include/air/Util/Util.h
+++ b/mlir/include/air/Util/Util.h
@@ -140,7 +140,7 @@ std::vector<unsigned> getMDVectorFromIterator(std::vector<unsigned> dims,
 void getDefiningOpsToOperands(Operation *op, SmallVector<Operation *> &def_ops);
 
 // Fold perfectly nested parent loops into wraps and strides list
-void foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult foldForLoopNestAsExtendedSizesAndStrides(
     OpBuilder builder, Operation *for_op, Operation *channel_op,
     SmallVector<Value> &offsets, SmallVector<Value> &wraps,
     SmallVector<Value> &strides, Value memref);
diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
index 7f69f3585..a85c64b08 100644
--- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp
+++ b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -740,9 +740,11 @@ specializeAffineForInAIRRtDmaWrapAndStride(OpBuilder builder,
           builder.create<arith::ConstantIndexOp>(loc, current_stride));
     }
   }
-  xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
+  auto res = xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
       builder, for_op.getOperation(), memcpy_ops[0].getOperation(), offsets,
       wraps, strides, memcpy_ops[0]->getOperand(3));
+  if (res.failed())
+    return failure();
 
   if (offsets.size() > 4 || wraps.size() > 4 || strides.size() > 4)
     return failure();
diff --git a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
index 5adce0e19..707427c91 100644
--- a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
+++ b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
@@ -1722,9 +1722,11 @@ struct AIRSpecializeChannelWrapAndStrideInScfFor
     populateDefaultWrapsAndStrides(rewriter, channel_ops[0].getMemref(),
                                    offsets, wraps, strides);
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
         rewriter, for_op.getOperation(), channel_ops[0].getOperation(),
         offsets, wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
     (void)canonicalizeWrapAndStrideList(
         rewriter, offsets, wraps, strides,
@@ -1823,9 +1825,11 @@ struct AIRSpecializeChannelWrapAndStrideInAffineFor
         rewriter, offsets, wraps, strides,
         air::getTensorVolume(channel_ops[0].getMemref().getType()));
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
         rewriter, for_op.getOperation(), channel_ops[0].getOperation(),
         offsets, wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
     (void)canonicalizeWrapAndStrideList(
         rewriter, offsets, wraps, strides,
diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp
index 5bcba9a02..cbdcf2a8d 100644
--- a/mlir/lib/Util/Util.cpp
+++ b/mlir/lib/Util/Util.cpp
@@ -903,7 +903,7 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder,
 }
 
 // Fold perfectly nested for loops as extra entries in wraps and strides
-void air::foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
     OpBuilder builder, Operation *for_op, Operation *channel_op,
     SmallVector<Value> &offsets, SmallVector<Value> &wraps,
     SmallVector<Value> &strides, Value memref) {
@@ -972,13 +972,22 @@ void air::foldForLoopNestAsExtendedSizesAndStrides(
       stepSize = afo.getStepAsInt();
     else if (auto sfo = dyn_cast<scf::ForOp>(o))
       stepSize = *mlir::getConstantIntValue(sfo.getStep());
-    Value new_stride = builder.template create<arith::ConstantIndexOp>(
-        loc, (stepSize * ind_var_factor) % getTensorVolume(memref.getType()));
+    int new_stride_value =
+        (stepSize * ind_var_factor) % getTensorVolume(memref.getType());
+    Value new_stride =
+        builder.template create<arith::ConstantIndexOp>(loc, new_stride_value);
+
+    // Check for compliance with DMA BD hardware limitation (<= 1M).
+    if (mlir::ceilDiv(
+            new_stride_value * getElementSizeInBytes(memref.getType()), 4) >
+        0x100000)
+      return failure();
 
     // Insert new dimension into the wraps and strides list.
     wraps.insert(wraps.begin(), new_wrap);
     strides.insert(strides.begin(), new_stride);
   }
+  return success();
 }
 
 // If wrap-and-stride lists are empty, populate them with default data access
diff --git a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
index 30c6b9f00..f8d39f423 100644
--- a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
+++ b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
@@ -808,3 +808,45 @@ module {
     return
   }
 }
+
+// -----
+
+// Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead.
+
+// CHECK-LABEL: aie.device(npu)
+// CHECK: func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>)
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+
+#map = affine_map<()[s0] -> (s0 * 64)>
+module {
+  aie.device(npu) {
+    %tile_0_0 = aie.tile(0, 0)
+    aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
+    memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
+  } {sym_name = "segment_0"}
+  func.func @func18() {
+    %c32768_i64 = arith.constant 32768 : i64
+    %c8_i64 = arith.constant 8 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c26_i32 = arith.constant 26 : i32
+    %c15_i32 = arith.constant 15 : i32
+    %c14_i32 = arith.constant 14 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %alloc = memref.alloc() : memref<8192x32768xi32>
+    affine.for %arg3 = 0 to 4 {
+      affine.for %arg4 = 0 to 4 {
+        %10 = affine.apply #map()[%arg3]
+        %11 = affine.apply #map()[%arg4]
+        %12 = arith.index_cast %arg3 : index to i64
+        %13 = arith.index_cast %arg4 : index to i64
+        %14 = arith.index_cast %10 : index to i64
+        %15 = arith.index_cast %11 : index to i64
+        %16 = airrt.dma_memcpy_nd(%c26_i32, %12, %13, %alloc[%c0_i64, %c0_i64, %14, %15], [%c1_i64, %c1_i64, %c64_i64, %c64_i64], [%c0_i64, %c0_i64, %c32768_i64]) {metadata = @airMemcpyId26} : (i32, i64, i64, memref<8192x32768xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+        %p = airrt.segment_load "segment_0" : i64
+      }
+    }
+    return
+  }
+}
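
Reviewer note (not part of the patch): the new check rejects a fold whenever the
resulting stride, converted to 4-byte words, would exceed 0x100000 (1M), i.e. when
ceilDiv(stride_in_elements * element_size_in_bytes, 4) > 0x100000. The standalone
C++ sketch below simply replays that arithmetic for the new test case; the helper
name strideFitsInShimBd and the constant kMaxStrideWords are illustrative only and
do not come from the patch.

#include <cstdint>
#include <iostream>

// Sketch of the SHIM DMA BD stride bound enforced by the patch: strides are
// measured in 4-byte words and must not exceed 1M (0x100000).
static bool strideFitsInShimBd(int64_t strideElements, int64_t elementBytes) {
  constexpr int64_t kMaxStrideWords = 0x100000; // 1M words
  int64_t strideWords =
      (strideElements * elementBytes + 3) / 4; // ceilDiv(bytes, 4)
  return strideWords <= kMaxStrideWords;
}

int main() {
  // memref<8192x32768xi32>: folding the inner affine.for (%arg4, column tiles)
  // adds a stride of 64 elements -> 64 words, well within the limit.
  std::cout << strideFitsInShimBd(64, 4) << "\n"; // 1
  // The row dimension already uses stride 32768 elements -> 32768 words, fine.
  std::cout << strideFitsInShimBd(32768, 4) << "\n"; // 1
  // Folding the outer affine.for (%arg3, row tiles) would require a stride of
  // 64 * 32768 = 2097152 words > 1M, so foldForLoopNestAsExtendedSizesAndStrides
  // now returns failure and that loop is left to be unrolled into separate BDs.
  std::cout << strideFitsInShimBd(64 * 32768, 4) << "\n"; // 0
  return 0;
}

This is consistent with the expected aiex.npu.dma_memcpy_nd above: sizes
[1, 4, 64, 64] with strides [0, 64, 32768] cover only the foldable dimensions,
while the loop over %arg3 is handled by unrolling (the CHECK line matches the BD
with id = 0).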