Avoid over-optimization of SHIM DMA BDs, if the outcome is stride>1M (Xilinx#553)

* Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead

* Fixup clang format
erwei-xilinx authored Apr 25, 2024
1 parent 4faaa09 commit 191873f
Showing 5 changed files with 64 additions and 7 deletions.
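
For context, the legality check added in mlir/lib/Util/Util.cpp below measures strides in 4-byte words and rejects anything above 0x100000 (1M) words. The following is a minimal standalone C++ sketch of that test using the numbers from the new test case; the helper name and structure are illustrative, not repository code.

#include <cstdint>
#include <iostream>

// SHIM DMA BD strides are counted here in 4-byte words; the pass rejects any
// folded stride above 0x100000 (1M) words.
constexpr int64_t kMaxBdStrideWords = 0x100000;

bool foldedStrideIsLegal(int64_t strideInElements, int64_t elementSizeBytes) {
  // Mirrors ceilDiv(stride_bytes, 4) from the patch.
  int64_t strideWords = (strideInElements * elementSizeBytes + 3) / 4;
  return strideWords <= kMaxBdStrideWords;
}

int main() {
  // Numbers from the new func18 test on memref<8192x32768xi32> (i32 = 4 bytes):
  std::cout << foldedStrideIsLegal(64 * 32768, 4) << "\n"; // 0: exceeds 1M, unroll BDs
  std::cout << foldedStrideIsLegal(64, 4) << "\n";         // 1: folds into a BD dimension
}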

mlir/include/air/Util/Util.h (2 changes: 1 addition & 1 deletion)
@@ -140,7 +140,7 @@ std::vector<unsigned> getMDVectorFromIterator(std::vector<unsigned> dims,
 void getDefiningOpsToOperands(Operation *op, SmallVector<Operation *> &def_ops);
 
 // Fold perfectly nested parent loops into wraps and strides list
-void foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult foldForLoopNestAsExtendedSizesAndStrides(
     OpBuilder builder, Operation *for_op, Operation *channel_op,
     SmallVector<Value> &offsets, SmallVector<Value> &wraps,
     SmallVector<Value> &strides, Value memref);

mlir/lib/Conversion/AIRRtToNpuPass.cpp (4 changes: 3 additions & 1 deletion)
@@ -740,9 +740,11 @@ specializeAffineForInAIRRtDmaWrapAndStride(OpBuilder builder,
          builder.create<arith::ConstantIndexOp>(loc, current_stride));
    }
  }
-  xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
+  auto res = xilinx::air::foldForLoopNestAsExtendedSizesAndStrides(
      builder, for_op.getOperation(), memcpy_ops[0].getOperation(), offsets,
      wraps, strides, memcpy_ops[0]->getOperand(3));
+  if (res.failed())
+    return failure();
 
  if (offsets.size() > 4 || wraps.size() > 4 || strides.size() > 4)
    return failure();

mlir/lib/Transform/AIRDependencyScheduleOpt.cpp (8 changes: 6 additions & 2 deletions)
@@ -1722,9 +1722,11 @@ struct AIRSpecializeChannelWrapAndStrideInScfFor
    populateDefaultWrapsAndStrides(rewriter, channel_ops[0].getMemref(),
                                   offsets, wraps, strides);
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
        rewriter, for_op.getOperation(), channel_ops[0].getOperation(), offsets,
        wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
    (void)canonicalizeWrapAndStrideList(
        rewriter, offsets, wraps, strides,
@@ -1823,9 +1825,11 @@ struct AIRSpecializeChannelWrapAndStrideInAffineFor
        rewriter, offsets, wraps, strides,
        air::getTensorVolume(channel_ops[0].getMemref().getType()));
 
-    foldForLoopNestAsExtendedSizesAndStrides(
+    auto res = foldForLoopNestAsExtendedSizesAndStrides(
        rewriter, for_op.getOperation(), channel_ops[0].getOperation(), offsets,
        wraps, strides, channel_ops[0].getMemref());
+    if (res.failed())
+      return failure();
 
    (void)canonicalizeWrapAndStrideList(
        rewriter, offsets, wraps, strides,

mlir/lib/Util/Util.cpp (15 changes: 12 additions & 3 deletions)
@@ -903,7 +903,7 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder,
 }
 
 // Fold perfectly nested for loops as extra entries in wraps and strides
-void air::foldForLoopNestAsExtendedSizesAndStrides(
+LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
    OpBuilder builder, Operation *for_op, Operation *channel_op,
    SmallVector<Value> &offsets, SmallVector<Value> &wraps,
    SmallVector<Value> &strides, Value memref) {
@@ -972,13 +972,22 @@ void air::foldForLoopNestAsExtendedSizesAndStrides(
      stepSize = afo.getStepAsInt();
    else if (auto sfo = dyn_cast<scf::ForOp>(o))
      stepSize = *mlir::getConstantIntValue(sfo.getStep());
-    Value new_stride = builder.template create<arith::ConstantIndexOp>(
-        loc, (stepSize * ind_var_factor) % getTensorVolume(memref.getType()));
+    int new_stride_value =
+        (stepSize * ind_var_factor) % getTensorVolume(memref.getType());
+    Value new_stride =
+        builder.template create<arith::ConstantIndexOp>(loc, new_stride_value);
+
+    // Check for compliance with DMA BD hardware limitation (<= 1M).
+    if (mlir::ceilDiv(
+            new_stride_value * getElementSizeInBytes(memref.getType()), 4) >
+        0x100000)
+      return failure();
 
    // Insert new dimension into the wraps and strides list.
    wraps.insert(wraps.begin(), new_wrap);
    strides.insert(strides.begin(), new_stride);
  }
+  return success();
 }
 
 // If wrap-and-stride lists are empty, populate them with default data access
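
To make the folded-stride computation concrete: each perfectly nested loop contributes a leading (wrap, stride) pair, where the stride is the loop step times the element offset contributed per unit of induction variable (ind_var_factor), taken modulo the memref volume; with this commit the fold now fails outright when that stride exceeds the 1M-word limit, letting callers unroll BDs instead. A minimal sketch under those assumed semantics follows (names, types, and loop ordering are illustrative, not the repository's).

#include <cstdint>
#include <optional>
#include <tuple>
#include <vector>

struct WrapStride {
  int64_t wrap;   // trip count of the folded loop
  int64_t stride; // elements advanced per iteration
};

// One (tripCount, stepSize, indVarFactor) triple per loop, innermost first.
std::optional<std::vector<WrapStride>>
foldLoopNest(const std::vector<std::tuple<int64_t, int64_t, int64_t>> &loops,
             int64_t volume, int64_t elementSizeBytes) {
  std::vector<WrapStride> dims;
  for (const auto &[trip, step, factor] : loops) {
    int64_t stride = (step * factor) % volume;
    // New in this commit: bail out rather than emit an illegal BD dimension.
    if ((stride * elementSizeBytes + 3) / 4 > 0x100000)
      return std::nullopt;
    dims.insert(dims.begin(), WrapStride{trip, stride});
  }
  return dims;
}

With the test's inner loop, foldLoopNest({{4, 1, 64}}, 8192LL * 32768, 4) yields wrap 4, stride 64; substituting the outer loop's factor of 64 * 32768 trips the check and the fold is abandoned.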

mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir (42 changes: 42 additions & 0 deletions)
@@ -808,3 +808,45 @@ module {
    return
  }
 }
+
+// -----
+
+// Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead.
+
+// CHECK-LABEL: aie.device(npu)
+// CHECK: func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>)
+// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+
+#map = affine_map<()[s0] -> (s0 * 64)>
+module {
+  aie.device(npu) {
+    %tile_0_0 = aie.tile(0, 0)
+    aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
+    memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
+  } {sym_name = "segment_0"}
+  func.func @func18() {
+    %c32768_i64 = arith.constant 32768 : i64
+    %c8_i64 = arith.constant 8 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c26_i32 = arith.constant 26 : i32
+    %c15_i32 = arith.constant 15 : i32
+    %c14_i32 = arith.constant 14 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %alloc = memref.alloc() : memref<8192x32768xi32>
+    affine.for %arg3 = 0 to 4 {
+      affine.for %arg4 = 0 to 4 {
+        %10 = affine.apply #map()[%arg3]
+        %11 = affine.apply #map()[%arg4]
+        %12 = arith.index_cast %arg3 : index to i64
+        %13 = arith.index_cast %arg4 : index to i64
+        %14 = arith.index_cast %10 : index to i64
+        %15 = arith.index_cast %11 : index to i64
+        %16 = airrt.dma_memcpy_nd(%c26_i32, %12, %13, %alloc[%c0_i64, %c0_i64, %14, %15], [%c1_i64, %c1_i64, %c64_i64, %c64_i64], [%c0_i64, %c0_i64, %c32768_i64]) {metadata = @airMemcpyId26} : (i32, i64, i64, memref<8192x32768xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+        %p = airrt.segment_load "segment_0" : i64
+      }
+    }
+    return
+  }
+}
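
Walking the arithmetic in this test: folding the outer %arg3 loop would add a stride of 64 * 32768 = 2,097,152 i32 elements, i.e. 2,097,152 four-byte words, which exceeds the 0x100000 (1,048,576) word limit, so that loop is unrolled into separate BDs; the inner %arg4 loop advances only 64 elements per iteration and folds into the [wrap 4, stride 64] dimension visible in the CHECK line above.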
