From e8f5f8c6139884dfb7f4079253e776ac7701fdd2 Mon Sep 17 00:00:00 2001 From: Zhewen Yu <42230979+Yu-Zhewen@users.noreply.github.com> Date: Fri, 7 Jun 2024 18:58:55 +0100 Subject: [PATCH] trace generation (#592) * trace generation * fix format and test * revert change on Passes.td format * redo Passes.td format * fix format * remove experimental bit * add trace.md * update trace.md * add unit tests * increase default buffer size * fix test * update CHECK --------- Co-authored-by: erwei-xilinx --- docs/trace.md | 37 +++ mlir/include/air/Conversion/Passes.td | 12 +- mlir/lib/Conversion/AIRRtToNpuPass.cpp | 148 +++++++++++- mlir/lib/Conversion/AIRToAIEPass.cpp | 60 ++++- .../AIRRtToNpu/generate_trace_write32.mlir | 63 ++++++ .../AIRToAIE/insert_trace_packet_flow.mlir | 31 +++ python/air/compiler/aircc/cl_arguments.py | 12 + python/air/compiler/aircc/main.py | 13 +- test/xrt/01_air_to_npu/aie.py | 212 ++++++++++++------ test/xrt/01_air_to_npu/run.lit | 6 +- test/xrt/01_air_to_npu/test.cpp | 31 ++- 11 files changed, 545 insertions(+), 80 deletions(-) create mode 100644 docs/trace.md create mode 100644 mlir/test/Conversion/AIRRtToNpu/generate_trace_write32.mlir create mode 100644 mlir/test/Conversion/AIRToAIE/insert_trace_packet_flow.mlir diff --git a/docs/trace.md b/docs/trace.md new file mode 100644 index 000000000..ce363a26c --- /dev/null +++ b/docs/trace.md @@ -0,0 +1,37 @@ +# Auto Trace Generation in AIR + +## Usage + +To enable this feature, + +* provide the `insert-trace-packet-flow=true` option to the `air-to-aie` pass, and +* specify the `trace-size`, `trace-offset` options to the `airrt-to-npu` pass. + +Trace can then be generated for all compute tiles (cores) and memtiles, unless there is a routing congestion when the build might fail. + +`trace-size` defines the buffer size allocated to hold the trace data, represented in bytes. 
Currently, this value is chosen by the user empirically, depending on the number of cores traced and how frequently the events are triggered. + +`trace-offset` defines the offset at which the trace data are appended to the output. It might be inferred from the code in the future. In addition, it is for now hard coded that the trace data are dumped to `ddr_id = 2`. + +One such example is provided in `test/xrt/01_air_to_npu`, and the generated trace file can be further processed through [parse_trace.py](https://github.com/Xilinx/mlir-aie/blob/main/programming_examples/utils/parse_trace.py). + + +Currently, in this particular example and when trace is enabled, the entire column of core tiles is shifted to the right by one and all trace data comes out via the second column's shim tile. This is a workaround for routing congestion: the `South` ports run out, so the bottom row of core tiles (i.e. the 2nd row of the whole array) cannot be routed as `Trace->South->West/East` once the route hits the switchbox of the memtile. + +## air-to-aie +Inside this pass, the packet flows are inserted when `insert-trace-packet-flow=true`. The source of the flow is `channel = 0` of the trace port and the destination is `channel = 1` of the shim tile in the same column. + +One possible future improvement is to allow the user to specify which channel/shim tile to use, or to have an allocation algorithm in place. In addition, the current assumption is that everything apart from the trace uses circuit-switched connections, so potential conflicts in the packet id are not detected. + +## airrt-to-npu +This pass is responsible for inserting trace-related `NpuWrite32Op` to `func.func`. The details of these operations have already been documented in [MLIR-AIE](https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md), except the extra support for timestamp synchronization across multiple traces. 
+ +To achieve this synchronization, the following steps are required: + +* make each tile reset its internal timer when the event `BROADCAST_15` is detected. The address is `0x34000` for the NPU compute tile and `0x94000` for the memtile. The event id is `122` and `157` respectively, according to this [header file](https://github.com/Xilinx/aie-rt/blob/main-aie/driver/src/events/xaie_events_aieml.h). +* configure the trace to start on `BROADCAST_15` as well, by writing to address `0x340D0` (compute tile) or `0x940D0` (memtile). +* for the bottom left tile (0, 0), reset the timer when `USER_EVENT_1` is detected. The address to write is `0x34000` and the event id is `127`. +* use `USER_EVENT_1` to trigger `BROADCAST_15`. This is done by writing `127` to address `0x3404C`. +* actually trigger `USER_EVENT_1` by writing `127` to address `0x34008`. + +So far, the values of these operations (such as which events or ports to monitor) and the addresses are all hard coded. In the future, they might be exposed as user options and made to depend on the `TargetModel` as well. 
diff --git a/mlir/include/air/Conversion/Passes.td b/mlir/include/air/Conversion/Passes.td index 07bf4960d..020c39db6 100644 --- a/mlir/include/air/Conversion/Passes.td +++ b/mlir/include/air/Conversion/Passes.td @@ -254,6 +254,9 @@ def AIRToAIE : Pass<"air-to-aie", "ModuleOp"> { /*default=*/"false", "Choose whether to schedule shim data movement via generating AIE " " shim DMA program, or AIR runtime.">, + Option<"clInsertTracePacketFlow", "insert-trace-packet-flow", "bool", + /*default=*/"false", + "Create packet routed traces for cores and memtiles">, ]; let description = [{ This pass converts AIR dialect `herd` and `segment` operations into AIE @@ -452,7 +455,14 @@ def AIRRtToNpu : Pass<"airrt-to-npu", "ModuleOp"> { ``` }]; - let options = []; + let options = [ + Option<"clTraceSize", "trace-size", "unsigned", + /*default=*/"0", + "Trace buffer size for cores and memtiles (in bytes)">, + Option<"clTraceOffset", "trace-offset", "unsigned", + /*default=*/"0", + "Trace buffer offset appended to ddr_id=2"> + ]; let dependentDialects = ["xilinx::AIEX::AIEXDialect"]; } diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp index 2f985e78e..1762c5bf2 100644 --- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp +++ b/mlir/lib/Conversion/AIRRtToNpuPass.cpp @@ -36,6 +36,7 @@ using namespace xilinx; using namespace xilinx::airrt; namespace { +#define GEN_PASS_DECL_AIRRTTONPU #define GEN_PASS_DEF_AIRRTTONPU #include "air/Conversion/Passes.h.inc" @@ -53,6 +54,7 @@ namespace { // %1 = unrealized_conversion_cast %0 // %2 = memref.assume_alignment %1 // + struct RelocateAssumeAlignmentOp : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -452,7 +454,8 @@ void hoistTargetOpsToNewAffineFor(OpBuilder builder, affine::AffineForOp for_op, } } -template void push_back_if_unique(SmallVector &vec, T entry) { +template +void push_back_if_unique(SmallVector &vec, T entry) { if (std::find(vec.begin(), vec.end(), entry) == 
vec.end()) { vec.push_back(entry); } @@ -967,6 +970,10 @@ struct AIRRtToNpuPass : public impl::AIRRtToNpuBase { // Renumber npu dma ops renumberNpuDmaOps(module.getBody()); + + // Configure the tile trace units and the shimDMA + if (clTraceSize > 0) + insertNpuWrite32ForTrace(module, clTraceSize, clTraceOffset); } void moveFuncOpToEndOfDeviceOp(ModuleOp module) { @@ -1251,6 +1258,145 @@ struct AIRRtToNpuPass : public impl::AIRRtToNpuBase { } } + void insertNpuWrite32ForTrace(ModuleOp module, int64_t trace_size, + int64_t trace_offset) { + SmallVector funcOps; + module.walk([&](mlir::func::FuncOp f) { funcOps.push_back(f); }); + + for (auto f : funcOps) { + OpBuilder builder(f); + auto d = f->getParentOfType(); + if (!d) + continue; + + auto &target_model = d.getTargetModel(); + std::map chanToIdMap; + builder.setInsertionPointToStart(&f.front()); + for (auto pktFlow : d.getOps()) { + Region &r = pktFlow.getPorts(); + Block &b = r.front(); + int flowID = pktFlow.IDInt(); + AIE::Port sourcePort, destPort; + AIE::TileOp srcTile, destTile; + + // find all packet flow with trace port as source + for (Operation &Op : b.getOperations()) { + if (auto pktSrc = dyn_cast(Op)) { + srcTile = dyn_cast(pktSrc.getTile().getDefiningOp()); + sourcePort = pktSrc.port(); + } else if (auto pktDest = dyn_cast(Op)) { + destTile = dyn_cast(pktDest.getTile().getDefiningOp()); + destPort = pktDest.port(); + } + } + if (sourcePort.bundle != AIE::WireBundle::Trace) + continue; + + int srcColIndex = srcTile.colIndex(); + int srcRowIndex = srcTile.rowIndex(); + int dstColIndex = destTile.colIndex(); + int dstRowIndex = destTile.rowIndex(); + assert((target_model.isCoreTile(srcColIndex, srcRowIndex) || + target_model.isMemTile(srcColIndex, srcRowIndex)) && + "unsupported trace src"); + assert(target_model.isShimNOCTile(dstColIndex, dstRowIndex) && + "unsupported trace dest"); + int pkt_type = 0; + if (target_model.isMemTile(srcColIndex, srcRowIndex)) + pkt_type = 3; + else if (sourcePort.channel 
== 1) + pkt_type = 1; + int buff_size = trace_size / target_model.columns(); + int buff_offset = trace_offset; // todo: get from func args? + buff_offset += dstColIndex * buff_size; + + // configure tile trace + if (target_model.isCoreTile(srcColIndex, srcRowIndex)) { + // event boardcast to sync timer + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x34000, + 122 << 8); + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x340D0, + 122 << 16); + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x340D4, + pkt_type << 12 | flowID); + // configure events to monitor + // todo: allow user to specify? + builder.create( + builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0x340E0, + (1 << 24) | (33 << 16) | (34 << 8) | 37); + builder.create( + builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0x340E4, + (44 << 24) | (45 << 16) | (75 << 8) | 79); + // configure ports to monitor + // todo: allow user to specify? + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x3FF00, + (1 << 8) | ((1 << 5) | 1)); + // builder.create( + // builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0x3FF04, 0); + } else if (target_model.isMemTile(srcColIndex, srcRowIndex)) { + // event boardcast to sync timer + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x94000, + 157 << 8); + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x940D0, + 157 << 16); + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0x940D4, + pkt_type << 12 | flowID); + // configure events to monitor + // todo: allow user to specify? + builder.create( + builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0x940E0, + (1 << 24) | (80 << 16) | (84 << 8) | 88); + builder.create( + builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0x940E4, + (92 << 24) | (96 << 16) | (100 << 8) | 104); + // configure ports to monitor + // todo: allow user to specify? 
+ builder.create( + builder.getUnknownLoc(), srcColIndex, srcRowIndex, 0xB0F00, + ((1 << 21) | (2 << 16)) | ((1 << 13) | (1 << 8)) | (1 << 5)); + builder.create(builder.getUnknownLoc(), + srcColIndex, srcRowIndex, 0xB0F04, + (3 << 16) | (2 << 8) | 1); + } + + // configure shim tile + if (chanToIdMap.count(dstColIndex) == 0) + chanToIdMap[dstColIndex] = 15; + int bdID = chanToIdMap[dstColIndex]; + int ddr_id = 2; // todo: let user specify + assert(bdID >= 4 && "run out of bd_id"); + builder.create( + builder.getUnknownLoc(), dstColIndex, 1, ddr_id, bdID, buff_size, + buff_offset, 1, 0, flowID, pkt_type, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0); + int address; + if (destPort.channel == 0) + address = 0x1D204; + else if (destPort.channel == 1) + address = 0x1D20C; + else + assert(false && "unknown trace dest"); + builder.create(builder.getUnknownLoc(), dstColIndex, + dstRowIndex, address, bdID--); + } + + // broadcast event to sync timer + builder.create(builder.getUnknownLoc(), 0, 0, 0x34000, + 127 << 8); + builder.create(builder.getUnknownLoc(), 0, 0, 0x3404C, + 127); + builder.create(builder.getUnknownLoc(), 0, 0, 0x34008, + 127); + } + } + // Renumber aiex.npu.dma_memcpy_nd ops per column of AIEs. 
void renumberNpuDmaOps(Block *blk) { std::map chanToIdMap; diff --git a/mlir/lib/Conversion/AIRToAIEPass.cpp b/mlir/lib/Conversion/AIRToAIEPass.cpp index 5764536eb..f8d12aa42 100644 --- a/mlir/lib/Conversion/AIRToAIEPass.cpp +++ b/mlir/lib/Conversion/AIRToAIEPass.cpp @@ -55,6 +55,7 @@ struct AIRToAIEConversionOptions { bool emit_while; bool emit_herd_lock; bool generate_shim_dma; + bool insert_trace_packet_flow; AIE::AIEDevice device; }; @@ -431,7 +432,8 @@ void outlineAIEMemtiles(OpBuilder &builder, AIE::DeviceOp aie_device, } } -template void push_back_if_unique(std::vector &vec, T entry) { +template +void push_back_if_unique(std::vector &vec, T entry) { if (std::find(vec.begin(), vec.end(), entry) == vec.end()) vec.push_back(entry); } @@ -3013,6 +3015,55 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { } } + void createTracePacketFlow(AIE::DeviceOp device) { + OpBuilder builder(device); + const auto &target_model = device.getTargetModel(); + + // Collect existing TileOps + DenseMap tiles; + for (auto tile : device.getOps()) { + int colIndex = tile.colIndex(); + int rowIndex = tile.rowIndex(); + tiles[{colIndex, rowIndex}] = tile; + } + + // Create packet flows + int flowID = 0; // todo: check any existing? + for (auto srcTile : device.getOps()) { + int srcColIndex = srcTile.colIndex(); + int srcRowIndex = srcTile.rowIndex(); + AIE::TileOp destTile; + + if (target_model.isCoreTile(srcColIndex, srcRowIndex) || + target_model.isMemTile(srcColIndex, srcRowIndex)) { + int destColIndex = srcColIndex; // todo: allocation? + int destRowIndex = 0; + if (!tiles[{destColIndex, destRowIndex}]) { + builder.setInsertionPointToStart(device.getBody()); + destTile = builder.create(builder.getUnknownLoc(), + destColIndex, destRowIndex); + tiles[{destColIndex, destRowIndex}] = destTile; + } else { + destTile = tiles[{destColIndex, destRowIndex}]; + } + int destChan = 1; // todo: allocation? 
+ + builder.setInsertionPointToEnd(device.getBody()); + auto keep_pkt_header = builder.getBoolAttr(true); + AIE::PacketFlowOp pktFlow = builder.create( + builder.getUnknownLoc(), flowID++, keep_pkt_header); + Region &r_pktFlow = pktFlow.getPorts(); + Block *b_pktFlow = builder.createBlock(&r_pktFlow); + builder.setInsertionPointToStart(b_pktFlow); + builder.create(builder.getUnknownLoc(), srcTile, + AIE::WireBundle::Trace, 0); + builder.create(builder.getUnknownLoc(), destTile, + AIE::WireBundle::DMA, destChan); + builder.create(builder.getUnknownLoc()); + } + } + } + void runTestPatterns() { auto m = getOperation(); @@ -3038,6 +3089,7 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { /*.emit_while = */ clEmitWhileLoop, /*.emit_herd_lock = */ clEmitHerdLock, /*.generate_shim_dma = */ clGenerateShimDMA, + /*.insert_trace_packet_flow = */ clInsertTracePacketFlow, /*.device = */ *device}; createAIEModulesAndOutlineCores(m, aie_modules, tileToHerdMap, options); std::set seen; @@ -3064,6 +3116,8 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { renumberChannelOps(&d.getBodyRegion().front(), chan_renumber_reverse_map); } + if (options.insert_trace_packet_flow) + createTracePacketFlow(d); } } @@ -3135,6 +3189,7 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { /* .emit_while = */ clEmitWhileLoop, /* .emit_herd_lock = */ clEmitHerdLock, /* .generate_shim_dma = */ clGenerateShimDMA, + /*.insert_trace_packet_flow = */ clInsertTracePacketFlow, /* .device = */ *device}; createAIEModulesAndOutlineCores(module, aie_devices, tileToHerdMap, options); @@ -3200,6 +3255,8 @@ class AIRToAIEPass : public air::impl::AIRToAIEBase { lowerAIRMemcpyOp(device, shimDmaAlloc, options); // lowerPipelineGetPut(device, tileToHerdMap); + if (options.insert_trace_packet_flow) + createTracePacketFlow(device); SmallVector herds; SmallVector segs; @@ -3486,6 +3543,7 @@ FailureOr convertAIRToAIE(mlir::RewriterBase &rewriter, /* .emit_while = */ false, /* .emit_herd_lock = */ false, 
/* .generate_shim_dma = */ false, + /*.trace_size = */ 0, /* .device = */ *device}; std::vector> aie_modules; p.walk([&](xilinx::air::HerdOp h) { diff --git a/mlir/test/Conversion/AIRRtToNpu/generate_trace_write32.mlir b/mlir/test/Conversion/AIRRtToNpu/generate_trace_write32.mlir new file mode 100644 index 000000000..a6948909e --- /dev/null +++ b/mlir/test/Conversion/AIRRtToNpu/generate_trace_write32.mlir @@ -0,0 +1,63 @@ +//===- generate_trace_write32.mlir ------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +// RUN: air-opt %s -airrt-to-npu='trace-offset=65536 trace-size=65536' | FileCheck %s +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + aie.packet_flow(0) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0) + memref.global "public" @airMemcpyId7 : memref<64xi32, 1> + aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0) + memref.global "public" @airMemcpyId2 : memref<64xi32, 1> + } {sym_name = "segment0"} + air.channel @channel_0 [1, 1] + air.channel @channel_1 [1, 1] + air.channel @channel_2 [1, 1] + air.channel @channel_3 [1, 1] + func.func @func0(%arg0: memref<64xi32>, %arg1: memref<64xi32>) { +// CHECK: aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} +// CHECK: aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} +// CHECK: aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12288 : ui32} +// CHECK: aiex.npu.write32 {address = 606432 : 
ui32, column = 0 : i32, row = 1 : i32, value = 22041688 : ui32} +// CHECK: aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} +// CHECK: aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 2236704 : ui32} +// CHECK: aiex.npu.write32 {address = 724740 : ui32, column = 0 : i32, row = 1 : i32, value = 197121 : ui32} +// CHECK: aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 16384 : i32, buffer_offset = 65536 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 15 : ui32} +// CHECK: aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} +// CHECK: aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} +// CHECK: aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 1 : ui32} +// CHECK: aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 18948645 : ui32} +// CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 741165903 : ui32} +// CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} +// CHECK: aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 16384 : i32, buffer_offset = 65536 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size 
= 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 15 : ui32} +// CHECK: aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} +// CHECK: aiex.npu.write32 {address = 213068 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} +// CHECK: aiex.npu.write32 {address = 213000 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} + + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c2_i32 = arith.constant 2 : i32 + %c64_i64 = arith.constant 64 : i64 + airrt.dma_memcpy_nd(%c2_i32, %c0_i64, %c0_i64, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c1_i64, %c64_i64], [%c0_i64, %c0_i64, %c0_i64]) {metadata = @airMemcpyId2} : (i32, i64, i64, memref<64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) + %p = airrt.segment_load "segment0" : i64 + %c7_i32 = arith.constant 7 : i32 + airrt.dma_memcpy_nd(%c7_i32, %c0_i64, %c0_i64, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64], [%c1_i64, %c1_i64, %c1_i64, %c64_i64], [%c0_i64, %c0_i64, %c0_i64]) {metadata = @airMemcpyId7} : (i32, i64, i64, memref<64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) + return + } +} \ No newline at end of file diff --git a/mlir/test/Conversion/AIRToAIE/insert_trace_packet_flow.mlir b/mlir/test/Conversion/AIRToAIE/insert_trace_packet_flow.mlir new file mode 100644 index 000000000..941218f0c --- /dev/null +++ b/mlir/test/Conversion/AIRToAIE/insert_trace_packet_flow.mlir @@ -0,0 +1,31 @@ +//===- 
insert_trace_packet_flow.mlir ------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +// RUN: air-opt %s -air-to-aie='test-patterns=to-aie-mlir insert-trace-packet-flow=true' | FileCheck %s +// CHECK: aie.packet_flow(0) { +// CHECK: aie.packet_source<%tile_1_1, Trace : 0> +// CHECK: aie.packet_dest<%tile_1_0, DMA : 1> +// CHECK: } {keep_pkt_header = true} +module { + +func.func @foo(%arg0: i32) { + %cst1 = arith.constant 1 : index + air.herd tile(%tx, %ty) in (%size_x = %cst1, %size_y = %cst1) { + %src0 = memref.alloc() : memref<1xi32, 2> + %src1 = memref.alloc() : memref<1xi32, 2> + %zero = arith.constant 0 : index + %0 = memref.load %src0[%zero] : memref<1xi32, 2> + %1 = memref.load %src1[%zero] : memref<1xi32, 2> + %2 = arith.addi %0, %1 : i32 + %dst0 = memref.alloc() : memref<1xi32, 2> + memref.store %2, %dst0[%zero] : memref<1xi32, 2> + air.herd_terminator + } + return +} + +} diff --git a/python/air/compiler/aircc/cl_arguments.py b/python/air/compiler/aircc/cl_arguments.py index d268e13e8..c36e4294c 100644 --- a/python/air/compiler/aircc/cl_arguments.py +++ b/python/air/compiler/aircc/cl_arguments.py @@ -54,6 +54,18 @@ def parse_args(args=None): dest="num_cols", help="Default number of rows for generated segments", ) + parser.add_argument( + "-trace-size", + dest="trace_size", + default=0, + help="Create packet routed traces for cores and memtiles", + ) + parser.add_argument( + "-trace-offset", + dest="trace_offset", + default=0, + help="Trace buffer offset appended to output", + ) parser.add_argument("-cc", dest="cc", default="clang", help="Compiler to use") parser.add_argument( "--sysroot", metavar="sysroot", default="", help="sysroot for cross-compilation" diff --git a/python/air/compiler/aircc/main.py b/python/air/compiler/aircc/main.py index 31b44ff32..5ccb79ccc 100644 --- 
a/python/air/compiler/aircc/main.py +++ b/python/air/compiler/aircc/main.py @@ -399,7 +399,10 @@ def run(mlir_module, args=None): air_to_aie_pass = "air-to-aie{emit-while-loop=true" air_to_aie_pass = air_to_aie_pass + f" row-offset={opts.row_offset}" air_to_aie_pass = air_to_aie_pass + f" col-offset={opts.col_offset}" - air_to_aie_pass = air_to_aie_pass + f" device={opts.device}" + "}" + air_to_aie_pass = air_to_aie_pass + f" device={opts.device}" + if opts.trace_size > 0: + air_to_aie_pass = air_to_aie_pass + " insert-trace-packet-flow=true" + air_to_aie_pass = air_to_aie_pass + "}" pass_pipeline = ",".join([air_to_aie_pass]) air_to_aie_file = opts.tmpdir + "/aie." + air_mlir_filename @@ -412,6 +415,12 @@ def run(mlir_module, args=None): ) if "npu" in opts.device: + airrt_to_npu_pass = "airrt-to-npu{" + airrt_to_npu_pass = airrt_to_npu_pass + f" trace-size={opts.trace_size}" + airrt_to_npu_pass = ( + airrt_to_npu_pass + f" trace-offset={opts.trace_offset}" + "}" + ) + air_to_npu_file = opts.tmpdir + "/npu." 
+ air_mlir_filename air_to_npu_module = Module.parse(str(air_to_aie_module)) air_to_npu_passes = ( @@ -425,7 +434,7 @@ def run(mlir_module, args=None): "affine-expand-index-ops", "canonicalize", "cse", - "airrt-to-npu", + airrt_to_npu_pass, "canonicalize", "cse", ] diff --git a/test/xrt/01_air_to_npu/aie.py b/test/xrt/01_air_to_npu/aie.py index a9df78f39..95e75c096 100644 --- a/test/xrt/01_air_to_npu/aie.py +++ b/test/xrt/01_air_to_npu/aie.py @@ -5,32 +5,59 @@ import air.passmanager from air.dialects import air as airdialect from air.compiler.util import run_transform +import argparse import sys + + def matmul_on_tensors(m, n, k, dtype): module = Module.create() with InsertionPoint(module.body): + @func.FuncOp.from_py_func( - MemRefType.get((m, k), dtype), MemRefType.get((k, n), dtype)) + MemRefType.get((m, k), dtype), MemRefType.get((k, n), dtype) + ) def forward(lhs, rhs): out = memref.AllocOp(MemRefType.get((m, n), dtype), [], []) zero = arith.ConstantOp(dtype, 0) zero_fill = linalg.fill(zero, outs=[out]) linalg.matmul(lhs, rhs, outs=[out]) return out + return module + +parser = argparse.ArgumentParser(prog="aie.py") +parser.add_argument( + "--trace-size", + dest="trace_size", + default=65536, + type=int, + help="Create packet routed traces for cores and memtiles", +) +parser.add_argument( + "--trace-offset", + dest="trace_offset", + default=65536, + type=int, + help="Trace buffer offset appended to output", +) + +opts = parser.parse_args() + with air.ir.Context() as ctx, Location.unknown(): - air_module = matmul_on_tensors(128, 128, 256, IntegerType.get_signless(width = 32)) - + air_module = matmul_on_tensors(128, 128, 256, IntegerType.get_signless(width=32)) + ################################################ ## Tiling ################################################ - pm = air.passmanager.PassManager.parse(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE) + pm = air.passmanager.PassManager.parse( + air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE + ) 
pm.run(air_module.operation) - with open('air_input.mlir', 'w') as f: + with open("air_input.mlir", "w") as f: f.write(str(air_module)) - + transform_ir_string = """ transform.with_pdl_patterns { ^bb0(%arg0: !pdl.operation): @@ -54,99 +81,154 @@ def forward(lhs, rhs): """ transform_ir = Module.parse(transform_ir_string) run_transform(transform_ir, air_module) - - with open('air_tiled.mlir', 'w') as f: + + with open("air_tiled.mlir", "w") as f: f.write(str(air_module)) - + ################################################ ## Binding scf.paralell to air hierarchies ################################################ - pipeline = "builtin.module("+",".join([ - "buffer-results-to-out-params", - "air-par-to-herd{depth=1}", - "air-par-to-launch{has-air-segment=true}", - "air-copy-to-dma", - "canonicalize", "cse", - ])+')' + pipeline = ( + "builtin.module(" + + ",".join( + [ + "buffer-results-to-out-params", + "air-par-to-herd{depth=1}", + "air-par-to-launch{has-air-segment=true}", + "air-copy-to-dma", + "canonicalize", + "cse", + ] + ) + + ")" + ) pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) - # with open('air_sync.mlir', 'w') as f: - # f.write(str(air_module)) - + with open("air_sync.mlir", "w") as f: + f.write(str(air_module)) + ################################################ ## Extract event dependency and optimize schedule ################################################ - pipeline = "builtin.module("+",".join([ - "air-dependency", - "air-dependency-schedule-opt", - "air-specialize-dma-broadcast", - "air-dma-to-channel", - "canonicalize", "cse", - "air-dependency-canonicalize", - "canonicalize", "cse", - "air-label-scf-for-to-ping-pong", - ])+')' + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-dependency", + "air-dependency-schedule-opt", + "air-specialize-dma-broadcast", + "air-dma-to-channel", + "canonicalize", + "cse", + "air-dependency-canonicalize", + "canonicalize", + "cse", + "air-label-scf-for-to-ping-pong", + ] + ) + 
+ ")" + ) pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) # Not sure why parsing the ir solves the segmentation fault... air_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "air-ping-pong-transform{keep-memref-dealloc=true}", - "air-dealias-memref", - "canonicalize", "cse", - "air-isolate-async-dma-loop-nests", - "air-specialize-channel-wrap-and-stride", - "canonicalize", "cse", - ])+')' + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-ping-pong-transform{keep-memref-dealloc=true}", + "air-dealias-memref", + "canonicalize", + "cse", + "air-isolate-async-dma-loop-nests", + "air-specialize-channel-wrap-and-stride", + "canonicalize", + "cse", + ] + ) + + ")" + ) pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) - # with open('aircc_input.mlir', 'w') as f: - # f.write(str(air_module)) - + with open("aircc_input.mlir", "w") as f: + f.write(str(air_module)) + ################################################ ## Place herd to segment ################################################ air_async_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "func.func(air-collapse-herd)", - 'canonicalize', 'cse', - "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}", - 'canonicalize', 'cse', - 'func.func(air-renumber-dma)', - 'func.func(convert-linalg-to-loops)', - ])+')' + col_anchor = 1 if opts.trace_size > 0 else 0 + pipeline = ( + "builtin.module(" + + ",".join( + [ + "func.func(air-collapse-herd)", + "canonicalize", + "cse", + "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=" + + str(col_anchor) + + "}", + "canonicalize", + "cse", + "func.func(air-renumber-dma)", + "func.func(convert-linalg-to-loops)", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) - # with open('air_placed.mlir', 'w') as f: - # f.write(str(air_module)) - + with open("air_placed.mlir", "w") as f: + 
f.write(str(air_module)) + # ################################################ # ## MLIR-AIR to MLIR-AIE # ################################################ - - pipeline = "builtin.module("+",".join([ - 'air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}', - 'canonicalize', - ])+')' + + air_to_aie_pass = ( + "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true" + ) + if opts.trace_size > 0: + air_to_aie_pass = air_to_aie_pass + " insert-trace-packet-flow=true" + air_to_aie_pass = air_to_aie_pass + "}" + pipeline = ( + "builtin.module(" + + ",".join( + [ + air_to_aie_pass, + "canonicalize", + ] + ) + + ")" + ) pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) - # with open('aircc_decomp_aiecc.mlir', 'w') as f: - # f.write(str(air_module)) - + with open("aircc_decomp_aiecc.mlir", "w") as f: + f.write(str(air_module)) + ################################################ ## MLIR-AIR runtime lowering ################################################ - pipeline = "builtin.module("+",".join([ - 'air-to-std', - 'airrt-to-npu', - 'canonicalize', - ])+')' + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-to-std", + "airrt-to-npu{" + + f"trace-offset={opts.trace_offset} trace-size={opts.trace_size}" + + "}", + "canonicalize", + ] + ) + + ")" + ) pm = air.passmanager.PassManager.parse(pipeline) pm.run(air_module.operation) - with open('aie.mlir', 'w') as f: + with open("aie.mlir", "w") as f: f.write(str(air_module)) diff --git a/test/xrt/01_air_to_npu/run.lit b/test/xrt/01_air_to_npu/run.lit index a2d2bbebc..96ef44946 100644 --- a/test/xrt/01_air_to_npu/run.lit +++ b/test/xrt/01_air_to_npu/run.lit @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai -// RUN: %python %S/aie.py +// RUN: %python %S/aie.py --trace-size 65536 --trace-offset 65536 // RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host 
--xclbin-name=aie.xclbin --npu-insts-name=insts.txt aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt +// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt --trace_sz 65536 diff --git a/test/xrt/01_air_to_npu/test.cpp b/test/xrt/01_air_to_npu/test.cpp index 10f41393a..9c720f722 100644 --- a/test/xrt/01_air_to_npu/test.cpp +++ b/test/xrt/01_air_to_npu/test.cpp @@ -72,6 +72,15 @@ void mm_out(std::vector a, std::vector b, std::vector &r) { } } +void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { + std::ofstream fout(path); + uint32_t *traceOut = (uint32_t *)traceOutPtr; + for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { + fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; + fout << std::endl; + } +} + int main(int argc, const char *argv[]) { // Program arguments parsing @@ -84,7 +93,11 @@ int main(int argc, const char *argv[]) { "verbosity,v", po::value()->default_value(0), "the verbosity of the output")( "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); + "path of file containing userspace instructions to be sent to the LX6")( + "trace_sz,t", po::value()->default_value(0), + "size of trace buffer (in bytes)")( + "trace_file", po::value()->default_value("trace.txt"), + "where to store trace output"); po::variables_map vm; try { @@ -101,6 +114,8 @@ int main(int argc, const char *argv[]) { return 1; } + int trace_size = vm["trace_sz"].as(); + check_arg_file_exists(vm, "xclbin"); check_arg_file_exists(vm, "instr"); @@ -157,8 +172,8 @@ int main(int argc, const char *argv[]) { xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); auto bo_b = 
xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_c = - xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -173,10 +188,7 @@ int main(int argc, const char *argv[]) { BVec.push_back(rand() % UINT16_MAX); memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); C_DATATYPE *bufC = bo_c.map(); - std::vector CVec; - for (int i = 0; i < C_VOLUME; i++) - CVec.push_back(0); - memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); + memset(bufC, 0, C_SIZE + trace_size); void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); @@ -214,6 +226,11 @@ int main(int argc, const char *argv[]) { } } + if (trace_size > 0) { + write_out_trace(((char *)bufC) + C_SIZE, trace_size, + vm["trace_file"].as()); + } + if (!errors) { std::cout << "\nPASS!\n\n"; return 0;