From 455921766b43e0099a58c60d1dedf786392eb96c Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 26 Jun 2024 16:58:24 -0700 Subject: [PATCH] Fixup AIRRt lowerings to support convolution examples (#620) * Refactor lowering to airrt.dma to fixup bug when wrap and memref shape do not match * Only using the last dimension for static offsets; switch to composing offsets via strides instead of memref shapes * Remove unused variable * Convolution board test * Formatting --- mlir/lib/Conversion/AIRLoweringPass.cpp | 91 ++++-- mlir/lib/Conversion/AIRRtToNpuPass.cpp | 36 ++- .../AIRLowering/air_channel_get_put.mlir | 26 +- .../Conversion/AIRLowering/air_to_npu.mlir | 53 ++++ .../Conversion/AIRRtToNpu/airrt_to_npu.mlir | 20 +- test/xrt/13_conv2d_i32/aie.py | 202 +++++++++++++ test/xrt/13_conv2d_i32/run.lit | 9 + test/xrt/13_conv2d_i32/test.cpp | 284 ++++++++++++++++++ 8 files changed, 653 insertions(+), 68 deletions(-) create mode 100644 test/xrt/13_conv2d_i32/aie.py create mode 100644 test/xrt/13_conv2d_i32/run.lit create mode 100644 test/xrt/13_conv2d_i32/test.cpp diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp index ea65d61f6..f4d96b50d 100644 --- a/mlir/lib/Conversion/AIRLoweringPass.cpp +++ b/mlir/lib/Conversion/AIRLoweringPass.cpp @@ -453,16 +453,15 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder, auto i64Ty = builder.getI64Type(); auto zero = builder.create(loc, i64Ty, IntegerAttr::get(i64Ty, 0)); - auto one = - builder.create(loc, i64Ty, IntegerAttr::get(i64Ty, 1)); + auto zero_idx = builder.create(loc, 0); + auto one_idx = builder.create(loc, 1); auto idTy = IntegerType::get(ctx, 32); // Get op id of the internal put/get op if (auto id_attr = theOtherOp->getAttrOfType("id")) { opers.push_back(builder.create(loc, idTy, id_attr)); } else { - opers.push_back(builder.create( - loc, idTy, IntegerAttr::get(idTy, 0))); + opers.push_back(zero); } scf::ParallelOp launch = thisOp->getParentOfType(); @@ -489,45 
+488,71 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder, opers.push_back(builder.create( loc, IntegerType::get(ctx, 64), launch.getInductionVars()[1])); else if (launch.getNumLoops() == 1) - opers.push_back(builder.create( - loc, i64Ty, IntegerAttr::get(i64Ty, 0))); + opers.push_back(zero); else - opers.push_back(builder.create( - loc, i64Ty, IntegerAttr::get(i64Ty, 0))); + opers.push_back(zero); } opers.push_back(thisOp.getMemref()); - SmallVector offsets(4, zero); - SmallVector lengths(4, one); - SmallVector strides(3, zero); + SmallVector offsets = thisOp.getOffsets(); + SmallVector wraps = thisOp.getSizes(); + SmallVector strides = thisOp.getStrides(); - int idx = 4 - thisOp.getOffsets().size(); - for (auto o : thisOp.getOffsets()) { - offsets[idx++] = - builder.create(loc, IntegerType::get(ctx, 64), o); + auto memrefType = thisOp.getMemref().getType(); + + // If empty offsets/sizes/strides, then populate the lists with default + // values. + if (offsets.empty() && wraps.empty() && strides.empty()) { + offsets.push_back(zero_idx); + auto memref_volume = air::getTensorVolume(memrefType); + wraps.push_back(builder.create(loc, memref_volume)); + strides.push_back(one_idx); + } + // Stride field implicit last element one + auto lastStrideConst = getConstantIntValue(strides.back()); + assert(lastStrideConst && "the last stride is not static"); + // If the last dimension's stride value is not 1, then for AIE2 we use the + // second dimension of shim dma bd to implement the last dimension. 
+ if (*lastStrideConst != 1) { + offsets.push_back(zero_idx); + wraps.push_back(one_idx); + strides.push_back(one_idx); + } + + strides.pop_back(); + while (offsets.size() < 4) { + offsets.insert(offsets.begin(), zero_idx); + } + while (wraps.size() < 4) { + wraps.insert(wraps.begin(), one_idx); + } + while (strides.size() < 3) { + strides.insert(strides.begin(), zero_idx); + } + + for (unsigned i = 0; i < offsets.size(); i++) + offsets[i] = builder.create( + loc, IntegerType::get(ctx, 64), offsets[i]); + + // In aiex.npu ops, stride value 0 means 1; only the highest dimension stride + // value 0 really means repeat. + for (unsigned i = 0; i < strides.size(); i++) { + auto constStride = getConstantIntValue(strides[i]); + assert(constStride && "stride is not static"); + if (i > 0 && *constStride == 1) + strides[i] = zero; + else + strides[i] = builder.create( + loc, IntegerType::get(ctx, 64), strides[i]); } - idx = 4 - thisOp.getStrides().size(); - auto op_strides = thisOp.getStrides(); - if (op_strides.size()) - for (auto o : op_strides.drop_back()) - strides[idx++] = - builder.create(loc, IntegerType::get(ctx, 64), o); - idx = - 4 - std::max(thisOp.getSizes().size(), (size_t)thisMemrefType.getRank()); - // If sizes field is empty, then infer sizes from memref shape - if (thisOp.getSizes().empty()) - for (auto d : air::getTensorShape(thisMemrefType)) - lengths[idx++] = builder.create( - loc, i64Ty, IntegerAttr::get(i64Ty, d)); - else - for (auto o : thisOp.getSizes()) - lengths[idx++] = - builder.create(loc, IntegerType::get(ctx, 64), o); + for (unsigned i = 0; i < wraps.size(); i++) + wraps[i] = builder.create( + loc, IntegerType::get(ctx, 64), wraps[i]); opers.append(offsets); - opers.append(lengths); + opers.append(wraps); opers.append(strides); SmallVector tys; diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp index 31b7963c0..32c73f09a 100644 --- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp +++ 
b/mlir/lib/Conversion/AIRRtToNpuPass.cpp @@ -137,22 +137,32 @@ struct DmaToNpuPattern : public OpConversionPattern { .getResult(); }; SmallVector offsets; - SmallVector staticOffsets; - if (auto const_int = getConstantIntValue(adaptor.getOffset3())) - staticOffsets.push_back(*const_int); - else + SmallVector + staticOffsets; // Note: for static offsets we compose one single offset + // at the last dimension. + int64_t overallStaticOffset = 0; + if (auto const_int = getConstantIntValue(adaptor.getOffset3())) { + overallStaticOffset += + *getConstantIntValue(adaptor.getStride3()) * (*const_int); + staticOffsets.push_back(0); + } else offsets.push_back(adaptor.getOffset3()); - if (auto const_int = getConstantIntValue(adaptor.getOffset2())) - staticOffsets.push_back(*const_int); - else + if (auto const_int = getConstantIntValue(adaptor.getOffset2())) { + overallStaticOffset += + *getConstantIntValue(adaptor.getStride2()) * (*const_int); + staticOffsets.push_back(0); + } else offsets.push_back(adaptor.getOffset2()); - if (auto const_int = getConstantIntValue(adaptor.getOffset1())) - staticOffsets.push_back(*const_int); - else + if (auto const_int = getConstantIntValue(adaptor.getOffset1())) { + overallStaticOffset += + *getConstantIntValue(adaptor.getStride1()) * (*const_int); + staticOffsets.push_back(0); + } else offsets.push_back(adaptor.getOffset1()); - if (auto const_int = getConstantIntValue(adaptor.getOffset0())) - staticOffsets.push_back(*const_int / div); - else + if (auto const_int = getConstantIntValue(adaptor.getOffset0())) { + overallStaticOffset += *const_int; + staticOffsets.push_back(overallStaticOffset / div); + } else offsets.push_back(divOp(adaptor.getOffset0())); SmallVector sizes; SmallVector staticSizes; diff --git a/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir b/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir index 6bd2c134e..0d4616348 100644 --- a/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir +++ 
b/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir @@ -24,25 +24,26 @@ module { %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %0 = air.channel.put async @channel_0[%c0, %c0] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>) - %1 = air.channel.get async @channel_1[%c0, %c0] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>) + %0 = air.channel.put async @channel_0[%c0, %c0] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>) + %1 = air.channel.get async @channel_1[%c0, %c0] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>) air.segment @segment_0 { %c1_0 = arith.constant 1 : index air.herd @herd_0 tile (%arg10, %arg11) in (%arg12=%c1_0, %arg13=%c1_0) { %c0_4 = arith.constant 0 : index + %c1_4 = arith.constant 1 : index %c32_5 = arith.constant 32 : index %c16_6 = arith.constant 16 : index %c8_7 = arith.constant 8 : index %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2> %alloc_8 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2> - air.channel.get @channel_0[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>) + air.channel.get @channel_0[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 3 : i32} : (memref<16x8xi32, 2>) affine.for %arg18 = 0 to 8 { affine.for %arg19 = 0 to 16 { %2 = affine.load %alloc[%arg19, %arg18] : memref<16x8xi32, 2> affine.store %2, %alloc_8[%arg19, %arg18] : memref<16x8xi32, 2> } } - air.channel.put @channel_1[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>) + air.channel.put @channel_1[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 4 : i32} : (memref<16x8xi32, 2>) memref.dealloc %alloc_8 : memref<16x8xi32, 2> memref.dealloc %alloc : memref<16x8xi32, 2> } @@ -76,7 
+77,7 @@ module { %c0 = arith.constant 0 : index %0 = air.wait_all async %1 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token { - %3 = air.channel.put async @channel_2[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>) + %3 = air.channel.put async @channel_2[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>) scf.reduce(%3 : !air.async.token) { ^bb0(%a4: !air.async.token, %a5: !air.async.token): %4 = air.wait_all async [%a4, %a5] @@ -84,7 +85,7 @@ module { } } %2 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token { - %3 = air.channel.get async @channel_3[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>) + %3 = air.channel.get async @channel_3[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>) scf.reduce(%3 : !air.async.token) { ^bb0(%a4: !air.async.token, %a5: !air.async.token): %4 = air.wait_all async [%a4, %a5] @@ -96,19 +97,20 @@ module { %c2_3 = arith.constant 2 : index air.herd @herd_0 tile (%arg10, %arg11) in (%arg12=%c2_2, %arg13=%c2_3) args(%arg14=%arg6, %arg15=%arg7, %arg16=%arg8, %arg17=%arg9) : index, index, index, index { %c0_4 = arith.constant 0 : index + %c1_4 = arith.constant 1 : index %c32_5 = arith.constant 32 : index %c16_6 = arith.constant 16 : index %c8_7 = arith.constant 8 : index %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2> %alloc_8 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2> - air.channel.get @channel_2[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>) + air.channel.get @channel_2[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 3 : i32} : (memref<16x8xi32, 2>) affine.for %arg18 = 0 to 8 { affine.for %arg19 = 0 to 16 { %3 = affine.load %alloc[%arg19, %arg18] : 
memref<16x8xi32, 2> affine.store %3, %alloc_8[%arg19, %arg18] : memref<16x8xi32, 2> } } - air.channel.put @channel_3[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>) + air.channel.put @channel_3[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 4 : i32} : (memref<16x8xi32, 2>) memref.dealloc %alloc_8 : memref<16x8xi32, 2> memref.dealloc %alloc : memref<16x8xi32, 2> } @@ -144,7 +146,7 @@ module { %c0 = arith.constant 0 : index %0 = air.wait_all async %1 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token { - %3 = air.channel.put async @channel_4[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>) + %3 = air.channel.put async @channel_4[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>) scf.reduce(%3 : !air.async.token) { ^bb0(%a4: !air.async.token, %a5: !air.async.token): %4 = air.wait_all async [%a4, %a5] @@ -153,7 +155,7 @@ module { } %2 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token { %3 = scf.for %a4 = %c0 to %c2 step %c1 iter_args(%a5 = %0) -> (!air.async.token) { - %4 = air.channel.get async [%a5] @channel_5[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>) + %4 = air.channel.get async [%a5] @channel_5[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>) scf.yield %4 : !air.async.token } scf.reduce(%3 : !air.async.token) { @@ -174,7 +176,7 @@ module { %c8_9 = arith.constant 8 : index %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2> %alloc_10 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2> - air.channel.get @channel_4[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>) + air.channel.get @channel_4[%arg10, %arg11] (%alloc[%c0_4, 
%c0_4] [%c8_9, %c16_8] [%c32_7, %c1_6]) {id = 3 : i32} : (memref<16x8xi32, 2>) affine.for %arg18 = 0 to 8 { affine.for %arg19 = 0 to 16 { %3 = affine.load %alloc[%arg19, %arg18] : memref<16x8xi32, 2> @@ -182,7 +184,7 @@ module { } } scf.for %arg18 = %c0_4 to %c2_5 step %c1_6 { - air.channel.put @channel_5[%arg10, %arg11] (%alloc_10[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>) + air.channel.put @channel_5[%arg10, %arg11] (%alloc_10[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c1_6]) {id = 4 : i32} : (memref<16x8xi32, 2>) } memref.dealloc %alloc_10 : memref<16x8xi32, 2> memref.dealloc %alloc : memref<16x8xi32, 2> diff --git a/mlir/test/Conversion/AIRLowering/air_to_npu.mlir b/mlir/test/Conversion/AIRLowering/air_to_npu.mlir index a684a7b83..40e8505ec 100644 --- a/mlir/test/Conversion/AIRLowering/air_to_npu.mlir +++ b/mlir/test/Conversion/AIRLowering/air_to_npu.mlir @@ -230,3 +230,56 @@ module { return } } + +// ----- + +// Convolution. + +// CHECK-DAG: %[[CST_64:.*]] = arith.constant 64 : i64 +// CHECK-DAG: %[[CST_1:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[CST_1152:.*]] = arith.constant 1152 : i64 +// CHECK-DAG: %[[CST_18:.*]] = arith.constant 18 : i32 +// CHECK-DAG: %[[CST_5:.*]] = arith.constant 5 : i32 +// CHECK-DAG: %[[CST_4:.*]] = arith.constant 4 : i32 +// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0 : i64 +// CHECK: affine.for %[[VAL_0:.*]] = 0 to 2 { +// CHECK: %[[VAL_1:.*]] = arith.index_cast %[[VAL_0]] : index to i64 +// CHECK: airrt.dma_memcpy_nd(%[[CST_4]], %0, %[[CST_0]], %arg0[%[[CST_0]], %[[CST_0]], %0, %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_1152]]], [%[[CST_0]], %[[CST_0]], %[[CST_1152]]]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<2x6x6x32xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event +// CHECK: airrt.dma_memcpy_nd(%[[CST_5]], %0, %[[CST_0]], %arg1[%[[CST_0]], %[[CST_0]], %[[CST_0]], %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], 
%[[CST_1152]]], [%[[CST_0]], %[[CST_0]], %[[CST_0]]]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<3x3x32x4xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event +// CHECK: airrt.dma_memcpy_nd(%[[CST_18]], %0, %[[CST_0]], %arg2[%[[CST_0]], %[[CST_0]], %0, %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_64]]], [%[[CST_0]], %[[CST_0]], %[[CST_64]]]) {metadata = @airMemcpyId18} : (i32, i64, i64, memref<2x4x4x4xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event + +module { + air.channel @channel_5 [1, 1] + air.channel @channel_2 [1, 1] + air.channel @channel_1 [1, 1] + func.func @func3(%arg0: memref<2x6x6x32xi32>, %arg1: memref<3x3x32x4xi32>, %arg2: memref<2x4x4x4xi32>) { + %c2 = arith.constant 2 : index + %0 = air.launch async (%arg3) in (%arg4=%c2) args(%arg5=%arg0, %arg6=%arg2, %arg7=%arg1) : memref<2x6x6x32xi32>, memref<2x4x4x4xi32>, memref<3x3x32x4xi32> attributes {id = 1 : i32} { + %c64 = arith.constant 64 : index + %c1152 = arith.constant 1152 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %1 = air.channel.put async @channel_1[] (%arg5[%arg3, %c0] [%c1, %c1152] [%c1152, %c1]) {id = 1 : i32, metadata = @airMemcpyId4} : (memref<2x6x6x32xi32>) + %2 = air.channel.put async @channel_2[] (%arg7[] [] []) {id = 2 : i32, metadata = @airMemcpyId5} : (memref<3x3x32x4xi32>) + %3 = air.channel.get async @channel_5[] (%arg6[%arg3, %c0] [%c1, %c64] [%c64, %c1]) {id = 3 : i32, metadata = @airMemcpyId18} : (memref<2x4x4x4xi32>) + %4 = air.segment @conv async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 1 : i64, y_loc = 2 : i64, y_size = 4 : i64} { + %async_token, %results = air.execute -> (memref<1x6x6x32xi32, 1>) { + %alloc = memref.alloc() : memref<1x6x6x32xi32, 1> + air.execute_terminator %alloc : memref<1x6x6x32xi32, 1> + } + %5 = air.channel.get async [%async_token] @channel_1[] (%results[] [] []) {id = 4 : i32} : (memref<1x6x6x32xi32, 1>) + %async_token_0, 
%results_1 = air.execute -> (memref<3x3x32x4xi32, 1>) { + %alloc = memref.alloc() : memref<3x3x32x4xi32, 1> + air.execute_terminator %alloc : memref<3x3x32x4xi32, 1> + } + %6 = air.channel.get async [%async_token_0] @channel_2[] (%results_1[] [] []) {id = 5 : i32} : (memref<3x3x32x4xi32, 1>) + %async_token_2, %results_3 = air.execute -> (memref<1x4x4x4xi32, 1>) { + %alloc = memref.alloc() : memref<1x4x4x4xi32, 1> + air.execute_terminator %alloc : memref<1x4x4x4xi32, 1> + } + %7 = air.channel.put async [%async_token_2] @channel_5[] (%results_3[] [] []) {id = 18 : i32} : (memref<1x4x4x4xi32, 1>) + } + } + return + } +} diff --git a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir index a81839f62..86da5d244 100644 --- a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir +++ b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir @@ -259,7 +259,7 @@ module { // CHECK-LABEL: aie.device(npu1_1col) // CHECK: func.func @func5(%[[ARG0:.*]]: memref<8x8xi32>, %[[ARG1:.*]]: memref<8x8xi32>, %[[ARG2:.*]]: memref<8x8xi32>) // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 1, 4, 8][0, 0, 8]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<8x8xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 4, 0][2, 1, 4, 8][0, 0, 8]) {id = 1 : i64, metadata = @airMemcpyId4} : memref<8x8xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 4, 8][0, 0, 8]) {id = 1 : i64, metadata = @airMemcpyId4} : memref<8x8xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][2, 2, 8, 4][0, 4, 8]) {id = 2 : i64, metadata = @airMemcpyId5} : memref<8x8xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 4, 4][32, 4, 8]) {id = 3 : i64, metadata = @airMemcpyId16} : memref<8x8xi32> @@ -365,9 +365,9 @@ module { // CHECK-LABEL: aie.device(npu1_1col) // CHECK: func.func @func7(%[[ARG0:.*]]: memref<2048x512xi32>, %[[ARG1:.*]]: memref<512x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>) // 
CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 64][0, 64, 512]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 64][0, 64, 512]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][4, 8, 64, 64][0, 64, 512]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][4, 8, 64, 64][0, 64, 512]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32768][4, 8, 64, 64][0, 64, 512]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 65536][4, 8, 64, 64][0, 64, 512]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 98304][4, 8, 64, 64][0, 64, 512]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][0, 64, 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<512x2048xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][4, 4, 64, 64][131072, 64, 2048]) {id = 5 : i64, metadata = @airMemcpyId26} : memref<2048x2048xi32> @@ -452,9 +452,9 @@ module { // CHECK-LABEL: aie.device(npu1_1col) // CHECK: func.func @func9(%[[ARG0:.*]]: memref<2048x2048xi32>, %[[ARG1:.*]]: memref<2048x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>) // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 256][0, 256, 2048]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][4, 8, 
64, 256][0, 256, 2048]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][4, 8, 64, 256][0, 256, 2048]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 131072][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 262144][4, 8, 64, 256][0, 256, 2048]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 393216][4, 8, 64, 256][0, 256, 2048]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 5 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 6 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32> @@ -816,9 +816,9 @@ module { // CHECK-LABEL: aie.device(npu1_1col) // CHECK: func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>) // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][1, 4, 64, 64][0, 64, 32768]) {id = 1 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][1, 4, 64, 64][0, 64, 32768]) {id = 2 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][1, 4, 64, 64][0, 64, 32768]) {id = 3 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> +// 
CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 2097152][1, 4, 64, 64][0, 64, 32768]) {id = 1 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 4194304][1, 4, 64, 64][0, 64, 32768]) {id = 2 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 6291456][1, 4, 64, 64][0, 64, 32768]) {id = 3 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32> #map = affine_map<()[s0] -> (s0 * 64)> module { diff --git a/test/xrt/13_conv2d_i32/aie.py b/test/xrt/13_conv2d_i32/aie.py new file mode 100644 index 000000000..f774a5a01 --- /dev/null +++ b/test/xrt/13_conv2d_i32/aie.py @@ -0,0 +1,202 @@ +# aie.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +import air +import air.compiler.util +from air.dialects import linalg, tensor, arith, func, memref +from air.ir import * +import air.passmanager +from air.dialects import air as airdialect +from air.compiler.util import run_transform +import sys + +with air.ir.Context() as ctx, Location.unknown(): + + ################################################ + ## Tiling + ################################################ + + air_tiled_ir_string = """ + #map = affine_map<()[s0] -> (s0 * 4)> + module { + func.func @conv_static_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%0 : memref<2x14x14x32xi32>, %1 : memref<3x3x32x64xi32>, %2 : memref<2x12x12x64xi32>) { + %c4 = arith.constant 4 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + scf.parallel (%arg0, %arg1, %arg2, %arg3) = (%c0, %c0, %c0, %c0) to (%c2, %c3, %c3, %c16) step (%c1, %c1, %c1, %c1) { + %3 = affine.apply #map()[%arg1] + %4 = affine.apply 
#map()[%arg2] + %5 = affine.apply #map()[%arg3] + %subview = memref.subview %0[%arg0, %3, %4, 0] [1, 6, 6, 32] [1, 1, 1, 1] : memref<2x14x14x32xi32> to memref<1x6x6x32xi32, strided<[6272, 448, 32, 1], offset: ?>> + %subview_0 = memref.subview %1[0, 0, 0, %5] [3, 3, 32, 4] [1, 1, 1, 1] : memref<3x3x32x64xi32> to memref<3x3x32x4xi32, strided<[6144, 2048, 64, 1], offset: ?>> + %subview_1 = memref.subview %2[%arg0, %3, %4, %5] [1, 4, 4, 4] [1, 1, 1, 1] : memref<2x12x12x64xi32> to memref<1x4x4x4xi32, strided<[9216, 768, 64, 1], offset: ?>> + %alloc = memref.alloc() : memref<1x6x6x32xi32, 1> + memref.copy %subview, %alloc : memref<1x6x6x32xi32, strided<[6272, 448, 32, 1], offset: ?>> to memref<1x6x6x32xi32, 1> + %alloc_2 = memref.alloc() : memref<3x3x32x4xi32, 1> + memref.copy %subview_0, %alloc_2 : memref<3x3x32x4xi32, strided<[6144, 2048, 64, 1], offset: ?>> to memref<3x3x32x4xi32, 1> + %alloc_3 = memref.alloc() : memref<1x4x4x4xi32, 1> + scf.parallel (%arg4) = (%c0) to (%c4) step (%c1) { + %subview_4 = memref.subview %alloc[0, %arg4, 0, 0] [1, 3, 6, 32] [1, 1, 1, 1] : memref<1x6x6x32xi32, 1> to memref<1x3x6x32xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> + %subview_5 = memref.subview %alloc_3[0, %arg4, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 1> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 1> + %alloc_6 = memref.alloc() : memref<1x1x4x4xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x4x4xi32, 2>) + %subview_7 = memref.subview %alloc_6[0, 0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x4x4xi32, 2> to memref<1x4x4xi32, strided<[16, 4, 1]>, 2> + scf.for %arg5 = %c0 to %c3 step %c1 { + scf.for %arg6 = %c0 to %c3 step %c1 { + scf.for %arg7 = %c0 to %c32 step %c8 { + %subview_8 = memref.subview %subview_4[0, %arg5, %arg6, %arg7] [1, 1, 4, 8] [1, 1, 1, 1] : memref<1x3x6x32xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> to memref<1x1x4x8xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> + %subview_9 = memref.subview 
%alloc_2[%arg5, %arg6, %arg7, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<3x3x32x4xi32, 1> to memref<1x1x8x4xi32, strided<[384, 128, 4, 1], offset: ?>, 1> + %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : memref<1x1x4x8xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> to memref<1x4x8xi32, strided<[1152, 32, 1], offset: ?>, 1> + %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xi32, strided<[384, 128, 4, 1], offset: ?>, 1> to memref<1x8x4xi32, strided<[384, 4, 1], offset: ?>, 1> + %alloc_12 = memref.alloc() : memref<1x4x8xi32, 2> + memref.copy %subview_10, %alloc_12 : memref<1x4x8xi32, strided<[1152, 32, 1], offset: ?>, 1> to memref<1x4x8xi32, 2> + %alloc_13 = memref.alloc() : memref<1x8x4xi32, 2> + memref.copy %subview_11, %alloc_13 : memref<1x8x4xi32, strided<[384, 4, 1], offset: ?>, 1> to memref<1x8x4xi32, 2> + linalg.conv_1d_nwc_wcf {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%alloc_12, %alloc_13 : memref<1x4x8xi32, 2>, memref<1x8x4xi32, 2>) outs(%subview_7 : memref<1x4x4xi32, strided<[16, 4, 1]>, 2>) + memref.dealloc %alloc_12 : memref<1x4x8xi32, 2> + memref.dealloc %alloc_13 : memref<1x8x4xi32, 2> + } + } + } + memref.copy %alloc_6, %subview_5 : memref<1x1x4x4xi32, 2> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 1> + memref.dealloc %alloc_6 : memref<1x1x4x4xi32, 2> + scf.reduce + } + memref.copy %alloc_3, %subview_1 : memref<1x4x4x4xi32, 1> to memref<1x4x4x4xi32, strided<[9216, 768, 64, 1], offset: ?>> + memref.dealloc %alloc : memref<1x6x6x32xi32, 1> + memref.dealloc %alloc_2 : memref<3x3x32x4xi32, 1> + memref.dealloc %alloc_3 : memref<1x4x4x4xi32, 1> + scf.reduce + } + return + } + } + """ + air_module = Module.parse(air_tiled_ir_string) + + ################################################ + ## Binding scf.paralell to air hierarchies + ################################################ + + pipeline = ( + "builtin.module(" + + ",".join( + [ + 
"buffer-results-to-out-params", + "air-par-to-herd{depth=1}", + "air-par-to-launch{has-air-segment=true}", + "air-copy-to-dma", + "canonicalize", + "cse", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) + pm.run(air_module.operation) + + ############################################### + # Extract event dependency and optimize schedule + ############################################### + + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-dependency", + "air-dependency-schedule-opt", + "air-specialize-dma-broadcast", + "air-dma-to-channel", + "canonicalize", + "cse", + "air-dependency-canonicalize", + "canonicalize", + "cse", + "air-isolate-async-dma-loop-nests", + "func.func(air-loop-fusion)", + "air-label-scf-for-to-ping-pong", + "air-ping-pong-transform{keep-memref-dealloc=true}", + "canonicalize", + "cse", + "air-specialize-channel-wrap-and-stride", + "canonicalize", + "cse", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) + pm.run(air_module.operation) + + ################################################ + ## Place herd to segment + ################################################ + + air_async_module = Module.parse(str(air_module)) + pipeline = ( + "builtin.module(" + + ",".join( + [ + "func.func(air-collapse-herd)", + "canonicalize", + "cse", + "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}", + "canonicalize", + "cse", + "func.func(air-renumber-dma)", + "func.func(convert-linalg-to-loops)", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) + pm.run(air_module.operation) + + ################################################ + ## MLIR-AIR to MLIR-AIE + ################################################ + + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}", + "canonicalize", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) + pm.run(air_module.operation) + + 
################################################ + ## MLIR-AIR runtime lowering + ################################################ + + pipeline = ( + "builtin.module(" + + ",".join( + [ + "air-to-std", + "canonicalize", + "symbol-dce", + "func.func(air-unroll-outer-affine-loops{depth=4})", + "affine-expand-index-ops", + "airrt-to-npu", + "canonicalize", + ] + ) + + ")" + ) + pm = air.passmanager.PassManager.parse(pipeline) + pm.run(air_module.operation) + with open("aie.mlir", "w") as f: + f.write(str(air_module))
diff --git a/test/xrt/13_conv2d_i32/run.lit b/test/xrt/13_conv2d_i32/run.lit
new file mode 100644
index 000000000..de5b9a9e2
--- /dev/null
+++ b/test/xrt/13_conv2d_i32/run.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+
+// REQUIRES: ryzen_ai, valid_xchess_license
+
+// RUN: %python %S/aie.py
+// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt aie.mlir
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt
diff --git a/test/xrt/13_conv2d_i32/test.cpp b/test/xrt/13_conv2d_i32/test.cpp
new file mode 100644
index 000000000..32b8ec30f
--- /dev/null
+++ b/test/xrt/13_conv2d_i32/test.cpp
@@ -0,0 +1,333 @@
+// Host-side XRT test for the conv2d i32 board example: loads the xclbin and
+// the NPU instruction stream, runs the kernel on pseudo-random inputs and
+// compares the device output against a CPU reference convolution.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// Convolution problem dimensions.
+#define BATCH 2
+#define CHIN 32
+#define CHOUT 64
+#define K 3
+#define XIN 14
+#define YIN 14
+#define XOUT (XIN - K + 1)
+#define YOUT (YIN - K + 1)
+
+// Element counts of the input (A), weight (B) and output (C) tensors.
+#define A_VOLUME (BATCH * CHIN * XIN * YIN)
+#define B_VOLUME (CHIN * CHOUT * K * K)
+#define C_VOLUME (BATCH * CHOUT * XOUT * YOUT)
+
+#define A_DATATYPE int32_t
+#define B_DATATYPE int32_t
+#define C_DATATYPE int32_t
+
+// Buffer sizes in bytes.
+constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE));
+constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE));
+constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE));
+constexpr int TRACE_SIZE = (0 * sizeof(uint32_t));
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` is present in `vm_in` and
+// names a file that can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  }
+  const std::string &path = vm_in[name].as<std::string>();
+  std::ifstream test(path);
+  if (!test) {
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+  }
+}
+
+// Parses the instruction file at `instr_path`, which holds one hexadecimal
+// 32-bit word per line. Throws std::runtime_error if the file cannot be
+// opened or a line cannot be parsed.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file) {
+    throw std::runtime_error("Unable to open instruction file\n");
+  }
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// CPU reference convolution (stride 1, no padding) for an NCHW input `a` and
+// FCHW weights `b`; writes the NCHW result into `r`, which must already hold
+// C_VOLUME elements.
+template <typename T>
+void conv_out_nchw_fchw(const std::vector<T> &a, const std::vector<T> &b,
+                        std::vector<T> &r) {
+  for (size_t batch = 0; batch < BATCH; batch++) {
+    for (size_t cout = 0; cout < CHOUT; cout++) {
+      for (size_t y = 0; y < YOUT; y++) {
+        for (size_t x = 0; x < XOUT; x++) {
+          size_t idx =
+              batch * CHOUT * XOUT * YOUT + cout * XOUT * YOUT + y * XOUT + x;
+          r[idx] = (T)(0);
+          for (size_t cin = 0; cin < CHIN; cin++) {
+            for (size_t ky = 0; ky < K; ky++) {
+              for (size_t kx = 0; kx < K; kx++) {
+                T _a = a[batch * CHIN * XIN * YIN + cin * XIN * YIN +
+                         (y + ky) * XIN + x + kx];
+                T _b = b[cout * CHIN * K * K + cin * K * K + ky * K + kx];
+                r[idx] += _a * _b;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// CPU reference convolution (stride 1, no padding) for an NHWC input `a` and
+// HWCF weights `b`; writes the NHWC result into `r`, which must already hold
+// C_VOLUME elements.
+template <typename T>
+void conv_out_nhwc_hwcf(const std::vector<T> &a, const std::vector<T> &b,
+                        std::vector<T> &r) {
+  for (size_t batch = 0; batch < BATCH; batch++) {
+    for (size_t cout = 0; cout < CHOUT; cout++) {
+      for (size_t y = 0; y < YOUT; y++) {
+        for (size_t x = 0; x < XOUT; x++) {
+          size_t idx =
+              batch * CHOUT * XOUT * YOUT + y * XOUT * CHOUT + x * CHOUT + cout;
+          r[idx] = (T)(0);
+          for (size_t cin = 0; cin < CHIN; cin++) {
+            for (size_t ky = 0; ky < K; ky++) {
+              for (size_t kx = 0; kx < K; kx++) {
+                T _a = a[batch * CHIN * XIN * YIN + (y + ky) * XIN * CHIN +
+                         (x + kx) * CHIN + cin];
+                T _b = b[ky * CHIN * CHOUT * K + kx * CHOUT * CHIN +
+                         cin * CHOUT + cout];
+                r[idx] += _a * _b;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Writes the trace buffer at `traceOutPtr` (`trace_size` bytes) to `path` as
+// one zero-padded 8-digit hexadecimal 32-bit word per line.
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+  std::ofstream fout(path);
+  const uint32_t *traceOut = reinterpret_cast<const uint32_t *>(traceOutPtr);
+  const size_t word_count = trace_size / sizeof(traceOut[0]);
+  // size_t index: avoids the signed/unsigned comparison of the loop bound.
+  for (size_t i = 0; i < word_count; i++) {
+    fout << std::setfill('0') << std::setw(8) << std::hex << traceOut[i]
+         << '\n';
+  }
+}
+
+// Entry point: parses the command line, runs the kernel once and returns 0
+// when the device output matches the CPU reference, 1 otherwise.
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6")(
+      "trace_sz,t", po::value<int>()->default_value(0),
+      "size of trace buffer (in bytes)")(
+      "trace_file", po::value<std::string>()->default_value("trace.txt"),
+      "where to store trace output");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Answer --help before po::notify(): notify() throws when a required
+    // option is missing, which would make --help unusable on its own.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  const int trace_size = vm["trace_sz"].as<int>();
+  if (trace_size < 0) {
+    std::cerr << "Error: trace_sz must not be negative\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  const int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [&Node](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   std::cout << "Name: " << name << std::endl;
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  // find_if yields end() when nothing matches; dereferencing it is undefined.
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Error: no kernel whose name starts with " << Node << "\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a =
+      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b =
+      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  // The output buffer holds the result followed by the optional trace data.
+  auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY,
+                      kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+  // NOTE(review): inputs go up to 65534, so the int32 products accumulated by
+  // the reference convolution can exceed INT32_MAX (signed overflow).
+  A_DATATYPE *bufA = bo_a.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec;
+  AVec.reserve(A_VOLUME);
+  for (int i = 0; i < A_VOLUME; i++)
+    AVec.push_back(std::rand() % UINT16_MAX);
+  std::memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB = bo_b.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec;
+  BVec.reserve(B_VOLUME);
+  for (int i = 0; i < B_VOLUME; i++)
+    BVec.push_back(std::rand() % UINT16_MAX);
+  std::memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC = bo_c.map<C_DATATYPE *>();
+  std::memset(bufC, 0, C_SIZE + trace_size);
+
+  void *bufInstr = bo_instr.map<void *>();
+  std::memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+  run.wait();
+
+  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+
+  int errors = 0;
+  int max_errors = 100;
+
+  // CPU reference in the NHWC / HWCF layout.
+  std::vector<C_DATATYPE> output_ref0(C_VOLUME, 0);
+  conv_out_nhwc_hwcf(AVec, BVec, output_ref0);
+
+  for (uint32_t i = 0; i < C_VOLUME; i++) {
+    if (bufOut[i] != output_ref0[i]) {
+      errors++;
+      if (errors < max_errors) {
+        std::cout << "\nerror, id " << i << " expected "
+                  << std::to_string(output_ref0[i]) << ", got "
+                  << std::to_string(bufOut[i]) << "\n";
+      }
+    }
+  }
+
+  if (trace_size > 0) {
+    write_out_trace(reinterpret_cast<char *>(bufC) + C_SIZE, trace_size,
+                    vm["trace_file"].as<std::string>());
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nerror count: " << errors << "\n\n";
+    std::cout << "\nfailed.\n\n";
+    return 1;
+  }
+}