From 455921766b43e0099a58c60d1dedf786392eb96c Mon Sep 17 00:00:00 2001
From: erwei-xilinx <erweiw@xilinx.com>
Date: Wed, 26 Jun 2024 16:58:24 -0700
Subject: [PATCH] Fixup AIRRt lowerings to support convolution examples (#620)

* Refactor lowering to airrt.dma to fix a bug when the wrap and memref shape do not match

* Use only the last dimension for static offsets; compose offsets via strides instead of memref shapes

* Remove unused variable

* Convolution board test

* Formatting
---
 mlir/lib/Conversion/AIRLoweringPass.cpp       |  91 ++++--
 mlir/lib/Conversion/AIRRtToNpuPass.cpp        |  36 ++-
 .../AIRLowering/air_channel_get_put.mlir      |  26 +-
 .../Conversion/AIRLowering/air_to_npu.mlir    |  53 ++++
 .../Conversion/AIRRtToNpu/airrt_to_npu.mlir   |  20 +-
 test/xrt/13_conv2d_i32/aie.py                 | 202 +++++++++++++
 test/xrt/13_conv2d_i32/run.lit                |   9 +
 test/xrt/13_conv2d_i32/test.cpp               | 284 ++++++++++++++++++
 8 files changed, 653 insertions(+), 68 deletions(-)
 create mode 100644 test/xrt/13_conv2d_i32/aie.py
 create mode 100644 test/xrt/13_conv2d_i32/run.lit
 create mode 100644 test/xrt/13_conv2d_i32/test.cpp
diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp
index ea65d61f6..f4d96b50d 100644
--- a/mlir/lib/Conversion/AIRLoweringPass.cpp
+++ b/mlir/lib/Conversion/AIRLoweringPass.cpp
@@ -453,16 +453,15 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder,
   auto i64Ty = builder.getI64Type();
   auto zero =
       builder.create<arith::ConstantOp>(loc, i64Ty, IntegerAttr::get(i64Ty, 0));
-  auto one =
-      builder.create<arith::ConstantOp>(loc, i64Ty, IntegerAttr::get(i64Ty, 1));
+  auto zero_idx = builder.create<arith::ConstantIndexOp>(loc, 0);
+  auto one_idx = builder.create<arith::ConstantIndexOp>(loc, 1);
 
   auto idTy = IntegerType::get(ctx, 32);
   // Get op id of the internal put/get op
   if (auto id_attr = theOtherOp->getAttrOfType<IntegerAttr>("id")) {
     opers.push_back(builder.create<arith::ConstantOp>(loc, idTy, id_attr));
   } else {
-    opers.push_back(builder.create<arith::ConstantOp>(
-        loc, idTy, IntegerAttr::get(idTy, 0)));
+    opers.push_back(zero);
   }
 
   scf::ParallelOp launch = thisOp->getParentOfType<scf::ParallelOp>();
@@ -489,45 +488,71 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder,
       opers.push_back(builder.create<arith::IndexCastOp>(
           loc, IntegerType::get(ctx, 64), launch.getInductionVars()[1]));
     else if (launch.getNumLoops() == 1)
-      opers.push_back(builder.create<arith::ConstantOp>(
-          loc, i64Ty, IntegerAttr::get(i64Ty, 0)));
+      opers.push_back(zero);
     else
-      opers.push_back(builder.create<arith::ConstantOp>(
-          loc, i64Ty, IntegerAttr::get(i64Ty, 0)));
+      opers.push_back(zero);
   }
 
   opers.push_back(thisOp.getMemref());
 
-  SmallVector<Value, 4> offsets(4, zero);
-  SmallVector<Value, 4> lengths(4, one);
-  SmallVector<Value, 3> strides(3, zero);
+  SmallVector<Value> offsets = thisOp.getOffsets();
+  SmallVector<Value> wraps = thisOp.getSizes();
+  SmallVector<Value> strides = thisOp.getStrides();
 
-  int idx = 4 - thisOp.getOffsets().size();
-  for (auto o : thisOp.getOffsets()) {
-    offsets[idx++] =
-        builder.create<arith::IndexCastOp>(loc, IntegerType::get(ctx, 64), o);
+  auto memrefType = thisOp.getMemref().getType();
+
+  // If empty offsets/sizes/strides, then populate the lists with default
+  // values.
+  if (offsets.empty() && wraps.empty() && strides.empty()) {
+    offsets.push_back(zero_idx);
+    auto memref_volume = air::getTensorVolume(memrefType);
+    wraps.push_back(builder.create<arith::ConstantIndexOp>(loc, memref_volume));
+    strides.push_back(one_idx);
+  }
+  // airrt's stride field omits the last stride, which is implicitly one.
+  auto lastStrideConst = getConstantIntValue(strides.back());
+  assert(lastStrideConst && "the last stride is not static");
+  // If the last dimension's stride value is not 1, then for AIE2 we use the
+  // second dimension of shim dma bd to implement the last dimension.
+  if (*lastStrideConst != 1) {
+    offsets.push_back(zero_idx);
+    wraps.push_back(one_idx);
+    strides.push_back(one_idx);
+  }
+
+  strides.pop_back();
+  while (offsets.size() < 4) {
+    offsets.insert(offsets.begin(), zero_idx);
+  }
+  while (wraps.size() < 4) {
+    wraps.insert(wraps.begin(), one_idx);
+  }
+  while (strides.size() < 3) {
+    strides.insert(strides.begin(), zero_idx);
+  }
+
+  for (unsigned i = 0; i < offsets.size(); i++)
+    offsets[i] = builder.create<arith::IndexCastOp>(
+        loc, IntegerType::get(ctx, 64), offsets[i]);
+
+  // In aiex.npu ops, stride value 0 means 1; only the highest dimension stride
+  // value 0 really means repeat.
+  for (unsigned i = 0; i < strides.size(); i++) {
+    auto constStride = getConstantIntValue(strides[i]);
+    assert(constStride && "stride is not static");
+    if (i > 0 && *constStride == 1)
+      strides[i] = zero;
+    else
+      strides[i] = builder.create<arith::IndexCastOp>(
+          loc, IntegerType::get(ctx, 64), strides[i]);
   }
 
-  idx = 4 - thisOp.getStrides().size();
-  auto op_strides = thisOp.getStrides();
-  if (op_strides.size())
-    for (auto o : op_strides.drop_back())
-      strides[idx++] =
-          builder.create<arith::IndexCastOp>(loc, IntegerType::get(ctx, 64), o);
-  idx =
-      4 - std::max(thisOp.getSizes().size(), (size_t)thisMemrefType.getRank());
-  // If sizes field is empty, then infer sizes from memref shape
-  if (thisOp.getSizes().empty())
-    for (auto d : air::getTensorShape(thisMemrefType))
-      lengths[idx++] = builder.create<arith::ConstantOp>(
-          loc, i64Ty, IntegerAttr::get(i64Ty, d));
-  else
-    for (auto o : thisOp.getSizes())
-      lengths[idx++] =
-          builder.create<arith::IndexCastOp>(loc, IntegerType::get(ctx, 64), o);
+  for (unsigned i = 0; i < wraps.size(); i++)
+    wraps[i] = builder.create<arith::IndexCastOp>(
+        loc, IntegerType::get(ctx, 64), wraps[i]);
 
   opers.append(offsets);
-  opers.append(lengths);
+  opers.append(wraps);
   opers.append(strides);
 
   SmallVector<Type, 1> tys;
diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
index 31b7963c0..32c73f09a 100644
--- a/mlir/lib/Conversion/AIRRtToNpuPass.cpp
+++ b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -137,22 +137,32 @@ struct DmaToNpuPattern : public OpConversionPattern<DmaMemcpyNdOp> {
           .getResult();
     };
     SmallVector<Value> offsets;
-    SmallVector<int64_t> staticOffsets;
-    if (auto const_int = getConstantIntValue(adaptor.getOffset3()))
-      staticOffsets.push_back(*const_int);
-    else
+    SmallVector<int64_t>
+        staticOffsets; // Note: for static offsets we compose one single offset
+                       // at the last dimension.
+    int64_t overallStaticOffset = 0;
+    if (auto const_int = getConstantIntValue(adaptor.getOffset3())) {
+      overallStaticOffset +=
+          *getConstantIntValue(adaptor.getStride3()) * (*const_int);
+      staticOffsets.push_back(0);
+    } else
       offsets.push_back(adaptor.getOffset3());
-    if (auto const_int = getConstantIntValue(adaptor.getOffset2()))
-      staticOffsets.push_back(*const_int);
-    else
+    if (auto const_int = getConstantIntValue(adaptor.getOffset2())) {
+      overallStaticOffset +=
+          *getConstantIntValue(adaptor.getStride2()) * (*const_int);
+      staticOffsets.push_back(0);
+    } else
       offsets.push_back(adaptor.getOffset2());
-    if (auto const_int = getConstantIntValue(adaptor.getOffset1()))
-      staticOffsets.push_back(*const_int);
-    else
+    if (auto const_int = getConstantIntValue(adaptor.getOffset1())) {
+      overallStaticOffset +=
+          *getConstantIntValue(adaptor.getStride1()) * (*const_int);
+      staticOffsets.push_back(0);
+    } else
       offsets.push_back(adaptor.getOffset1());
-    if (auto const_int = getConstantIntValue(adaptor.getOffset0()))
-      staticOffsets.push_back(*const_int / div);
-    else
+    if (auto const_int = getConstantIntValue(adaptor.getOffset0())) {
+      overallStaticOffset += *const_int;
+      staticOffsets.push_back(overallStaticOffset / div);
+    } else
       offsets.push_back(divOp(adaptor.getOffset0()));
     SmallVector<Value> sizes;
     SmallVector<int64_t> staticSizes;
diff --git a/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir b/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir
index 6bd2c134e..0d4616348 100644
--- a/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir
+++ b/mlir/test/Conversion/AIRLowering/air_channel_get_put.mlir
@@ -24,25 +24,26 @@ module {
       %c32 = arith.constant 32 : index
       %c1 = arith.constant 1 : index
       %c0 = arith.constant 0 : index
-      %0 = air.channel.put async  @channel_0[%c0, %c0] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>)
-      %1 = air.channel.get async  @channel_1[%c0, %c0] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>)
+      %0 = air.channel.put async  @channel_0[%c0, %c0] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>)
+      %1 = air.channel.get async  @channel_1[%c0, %c0] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>)
       air.segment @segment_0 {
         %c1_0 = arith.constant 1 : index
         air.herd @herd_0  tile (%arg10, %arg11) in (%arg12=%c1_0, %arg13=%c1_0) {
           %c0_4 = arith.constant 0 : index
+          %c1_4 = arith.constant 1 : index
           %c32_5 = arith.constant 32 : index
           %c16_6 = arith.constant 16 : index
           %c8_7 = arith.constant 8 : index
           %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2>
           %alloc_8 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2>
-          air.channel.get  @channel_0[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>)
+          air.channel.get  @channel_0[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 3 : i32} : (memref<16x8xi32, 2>)
           affine.for %arg18 = 0 to 8 {
             affine.for %arg19 = 0 to 16 {
               %2 = affine.load %alloc[%arg19, %arg18] : memref<16x8xi32, 2>
               affine.store %2, %alloc_8[%arg19, %arg18] : memref<16x8xi32, 2>
             }
           }
-          air.channel.put  @channel_1[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>)
+          air.channel.put  @channel_1[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 4 : i32} : (memref<16x8xi32, 2>)
           memref.dealloc %alloc_8 : memref<16x8xi32, 2>
           memref.dealloc %alloc : memref<16x8xi32, 2>
         }
@@ -76,7 +77,7 @@ module {
       %c0 = arith.constant 0 : index
       %0 = air.wait_all async 
       %1 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token {
-        %3 = air.channel.put async  @channel_2[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>)
+        %3 = air.channel.put async  @channel_2[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>)
         scf.reduce(%3 : !air.async.token) {
         ^bb0(%a4: !air.async.token, %a5: !air.async.token):
           %4 = air.wait_all async [%a4, %a5] 
@@ -84,7 +85,7 @@ module {
         }
       }
       %2 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token {
-        %3 = air.channel.get async  @channel_3[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>)
+        %3 = air.channel.get async  @channel_3[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>)
         scf.reduce(%3 : !air.async.token) {
         ^bb0(%a4: !air.async.token, %a5: !air.async.token):
           %4 = air.wait_all async [%a4, %a5] 
@@ -96,19 +97,20 @@ module {
         %c2_3 = arith.constant 2 : index
         air.herd @herd_0  tile (%arg10, %arg11) in (%arg12=%c2_2, %arg13=%c2_3) args(%arg14=%arg6, %arg15=%arg7, %arg16=%arg8, %arg17=%arg9) : index, index, index, index {
           %c0_4 = arith.constant 0 : index
+          %c1_4 = arith.constant 1 : index
           %c32_5 = arith.constant 32 : index
           %c16_6 = arith.constant 16 : index
           %c8_7 = arith.constant 8 : index
           %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2>
           %alloc_8 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2>
-          air.channel.get  @channel_2[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>)
+          air.channel.get  @channel_2[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 3 : i32} : (memref<16x8xi32, 2>)
           affine.for %arg18 = 0 to 8 {
             affine.for %arg19 = 0 to 16 {
               %3 = affine.load %alloc[%arg19, %arg18] : memref<16x8xi32, 2>
               affine.store %3, %alloc_8[%arg19, %arg18] : memref<16x8xi32, 2>
             }
           }
-          air.channel.put  @channel_3[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>)
+          air.channel.put  @channel_3[%arg10, %arg11] (%alloc_8[%c0_4, %c0_4] [%c8_7, %c16_6] [%c32_5, %c1_4]) {id = 4 : i32} : (memref<16x8xi32, 2>)
           memref.dealloc %alloc_8 : memref<16x8xi32, 2>
           memref.dealloc %alloc : memref<16x8xi32, 2>
         }
@@ -144,7 +146,7 @@ module {
       %c0 = arith.constant 0 : index
       %0 = air.wait_all async 
       %1 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token {
-        %3 = air.channel.put async  @channel_4[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 1 : i32} : (memref<32x16xi32>)
+        %3 = air.channel.put async  @channel_4[%a2, %a3] (%arg0[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 1 : i32} : (memref<32x16xi32>)
         scf.reduce(%3 : !air.async.token) {
         ^bb0(%a4: !air.async.token, %a5: !air.async.token):
           %4 = air.wait_all async [%a4, %a5] 
@@ -153,7 +155,7 @@ module {
       }
       %2 = scf.parallel (%a2, %a3) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init (%0) -> !air.async.token {
         %3 = scf.for %a4 = %c0 to %c2 step %c1 iter_args(%a5 = %0) -> (!air.async.token) {
-          %4 = air.channel.get async [%a5]  @channel_5[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c0]) {id = 2 : i32} : (memref<32x16xi32>)
+          %4 = air.channel.get async [%a5]  @channel_5[%a2, %a3] (%arg1[%c8, %c0] [%c8, %c16] [%c32, %c1]) {id = 2 : i32} : (memref<32x16xi32>)
           scf.yield %4 : !air.async.token
         }
         scf.reduce(%3 : !air.async.token) {
@@ -174,7 +176,7 @@ module {
           %c8_9 = arith.constant 8 : index
           %alloc = memref.alloc() {sym_name = "scratch"} : memref<16x8xi32, 2>
           %alloc_10 = memref.alloc() {sym_name = "scratch_copy"} : memref<16x8xi32, 2>
-          air.channel.get  @channel_4[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c0_4]) {id = 3 : i32} : (memref<16x8xi32, 2>)
+          air.channel.get  @channel_4[%arg10, %arg11] (%alloc[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c1_6]) {id = 3 : i32} : (memref<16x8xi32, 2>)
           affine.for %arg18 = 0 to 8 {
             affine.for %arg19 = 0 to 16 {
               %3 = affine.load %alloc[%arg19, %arg18] : memref<16x8xi32, 2>
@@ -182,7 +184,7 @@ module {
             }
           }
           scf.for %arg18 = %c0_4 to %c2_5 step %c1_6 {
-            air.channel.put  @channel_5[%arg10, %arg11] (%alloc_10[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c0_4]) {id = 4 : i32} : (memref<16x8xi32, 2>)
+            air.channel.put  @channel_5[%arg10, %arg11] (%alloc_10[%c0_4, %c0_4] [%c8_9, %c16_8] [%c32_7, %c1_6]) {id = 4 : i32} : (memref<16x8xi32, 2>)
           }
           memref.dealloc %alloc_10 : memref<16x8xi32, 2>
           memref.dealloc %alloc : memref<16x8xi32, 2>
diff --git a/mlir/test/Conversion/AIRLowering/air_to_npu.mlir b/mlir/test/Conversion/AIRLowering/air_to_npu.mlir
index a684a7b83..40e8505ec 100644
--- a/mlir/test/Conversion/AIRLowering/air_to_npu.mlir
+++ b/mlir/test/Conversion/AIRLowering/air_to_npu.mlir
@@ -230,3 +230,56 @@ module {
     return
   }
 }
+
+// -----
+
+// Convolution.
+
+// CHECK-DAG: %[[CST_64:.*]] = arith.constant 64 : i64
+// CHECK-DAG: %[[CST_1:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[CST_1152:.*]] = arith.constant 1152 : i64
+// CHECK-DAG: %[[CST_18:.*]] = arith.constant 18 : i32
+// CHECK-DAG: %[[CST_5:.*]] = arith.constant 5 : i32
+// CHECK-DAG: %[[CST_4:.*]] = arith.constant 4 : i32
+// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0 : i64
+// CHECK: affine.for %[[VAL_0:.*]] = 0 to 2 {
+// CHECK:   %[[VAL_1:.*]] = arith.index_cast %[[VAL_0]] : index to i64
+// CHECK:   airrt.dma_memcpy_nd(%[[CST_4]], %0, %[[CST_0]], %arg0[%[[CST_0]], %[[CST_0]], %0, %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_1152]]], [%[[CST_0]], %[[CST_0]], %[[CST_1152]]]) {metadata = @airMemcpyId4} : (i32, i64, i64, memref<2x6x6x32xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+// CHECK:   airrt.dma_memcpy_nd(%[[CST_5]], %0, %[[CST_0]], %arg1[%[[CST_0]], %[[CST_0]], %[[CST_0]], %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_1152]]], [%[[CST_0]], %[[CST_0]], %[[CST_0]]]) {metadata = @airMemcpyId5} : (i32, i64, i64, memref<3x3x32x4xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+// CHECK:   airrt.dma_memcpy_nd(%[[CST_18]], %0, %[[CST_0]], %arg2[%[[CST_0]], %[[CST_0]], %0, %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_64]]], [%[[CST_0]], %[[CST_0]], %[[CST_64]]]) {metadata = @airMemcpyId18} : (i32, i64, i64, memref<2x4x4x4xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64]) : !airrt.event
+
+module {
+  air.channel @channel_5 [1, 1]
+  air.channel @channel_2 [1, 1]
+  air.channel @channel_1 [1, 1]
+  func.func @func3(%arg0: memref<2x6x6x32xi32>, %arg1: memref<3x3x32x4xi32>, %arg2: memref<2x4x4x4xi32>) {
+    %c2 = arith.constant 2 : index
+    %0 = air.launch async (%arg3) in (%arg4=%c2) args(%arg5=%arg0, %arg6=%arg2, %arg7=%arg1) : memref<2x6x6x32xi32>, memref<2x4x4x4xi32>, memref<3x3x32x4xi32> attributes {id = 1 : i32} {
+      %c64 = arith.constant 64 : index
+      %c1152 = arith.constant 1152 : index
+      %c1 = arith.constant 1 : index
+      %c0 = arith.constant 0 : index
+      %1 = air.channel.put async  @channel_1[] (%arg5[%arg3, %c0] [%c1, %c1152] [%c1152, %c1]) {id = 1 : i32, metadata = @airMemcpyId4} : (memref<2x6x6x32xi32>)
+      %2 = air.channel.put async  @channel_2[] (%arg7[] [] []) {id = 2 : i32, metadata = @airMemcpyId5} : (memref<3x3x32x4xi32>)
+      %3 = air.channel.get async  @channel_5[] (%arg6[%arg3, %c0] [%c1, %c64] [%c64, %c1]) {id = 3 : i32, metadata = @airMemcpyId18} : (memref<2x4x4x4xi32>)
+      %4 = air.segment @conv async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 1 : i64, y_loc = 2 : i64, y_size = 4 : i64} {
+        %async_token, %results = air.execute -> (memref<1x6x6x32xi32, 1>) {
+          %alloc = memref.alloc() : memref<1x6x6x32xi32, 1>
+          air.execute_terminator %alloc : memref<1x6x6x32xi32, 1>
+        }
+        %5 = air.channel.get async [%async_token]  @channel_1[] (%results[] [] []) {id = 4 : i32} : (memref<1x6x6x32xi32, 1>)
+        %async_token_0, %results_1 = air.execute -> (memref<3x3x32x4xi32, 1>) {
+          %alloc = memref.alloc() : memref<3x3x32x4xi32, 1>
+          air.execute_terminator %alloc : memref<3x3x32x4xi32, 1>
+        }
+        %6 = air.channel.get async [%async_token_0]  @channel_2[] (%results_1[] [] []) {id = 5 : i32} : (memref<3x3x32x4xi32, 1>)
+        %async_token_2, %results_3 = air.execute -> (memref<1x4x4x4xi32, 1>) {
+          %alloc = memref.alloc() : memref<1x4x4x4xi32, 1>
+          air.execute_terminator %alloc : memref<1x4x4x4xi32, 1>
+        }
+        %7 = air.channel.put async [%async_token_2]  @channel_5[] (%results_3[] [] []) {id = 18 : i32} : (memref<1x4x4x4xi32, 1>)
+      }
+    }
+    return
+  }
+}
diff --git a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
index a81839f62..86da5d244 100644
--- a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
+++ b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
@@ -259,7 +259,7 @@ module {
 // CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func5(%[[ARG0:.*]]: memref<8x8xi32>, %[[ARG1:.*]]: memref<8x8xi32>, %[[ARG2:.*]]: memref<8x8xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 1, 4, 8][0, 0, 8]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<8x8xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 4, 0][2, 1, 4, 8][0, 0, 8]) {id = 1 : i64, metadata = @airMemcpyId4} : memref<8x8xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 4, 8][0, 0, 8]) {id = 1 : i64, metadata = @airMemcpyId4} : memref<8x8xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][2, 2, 8, 4][0, 4, 8]) {id = 2 : i64, metadata = @airMemcpyId5} : memref<8x8xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 4, 4][32, 4, 8]) {id = 3 : i64, metadata = @airMemcpyId16} : memref<8x8xi32>
 
@@ -365,9 +365,9 @@ module {
 // CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func7(%[[ARG0:.*]]: memref<2048x512xi32>, %[[ARG1:.*]]: memref<512x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 64][0, 64, 512]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 64][0, 64, 512]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][4, 8, 64, 64][0, 64, 512]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][4, 8, 64, 64][0, 64, 512]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32768][4, 8, 64, 64][0, 64, 512]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 65536][4, 8, 64, 64][0, 64, 512]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 98304][4, 8, 64, 64][0, 64, 512]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][0, 64, 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<512x2048xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][4, 4, 64, 64][131072, 64, 2048]) {id = 5 : i64, metadata = @airMemcpyId26} : memref<2048x2048xi32>
 
@@ -452,9 +452,9 @@ module {
 // CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func9(%[[ARG0:.*]]: memref<2048x2048xi32>, %[[ARG1:.*]]: memref<2048x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 256][0, 256, 2048]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][4, 8, 64, 256][0, 256, 2048]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][4, 8, 64, 256][0, 256, 2048]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 131072][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 262144][4, 8, 64, 256][0, 256, 2048]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 393216][4, 8, 64, 256][0, 256, 2048]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 5 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512, 64][64, 1048576, 2048]) {id = 6 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
@@ -816,9 +816,9 @@ module {
 // CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][1, 4, 64, 64][0, 64, 32768]) {id = 1 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][1, 4, 64, 64][0, 64, 32768]) {id = 2 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
-// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][1, 4, 64, 64][0, 64, 32768]) {id = 3 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 2097152][1, 4, 64, 64][0, 64, 32768]) {id = 1 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 4194304][1, 4, 64, 64][0, 64, 32768]) {id = 2 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
+// CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 6291456][1, 4, 64, 64][0, 64, 32768]) {id = 3 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
diff --git a/test/xrt/13_conv2d_i32/aie.py b/test/xrt/13_conv2d_i32/aie.py
new file mode 100644
index 000000000..f774a5a01
--- /dev/null
+++ b/test/xrt/13_conv2d_i32/aie.py
@@ -0,0 +1,202 @@
+# aie.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+import air
+import air.compiler.util
+from air.dialects import linalg, tensor, arith, func, memref
+from air.ir import *
+import air.passmanager
+from air.dialects import air as airdialect
+from air.compiler.util import run_transform
+import sys
+
+with air.ir.Context() as ctx, Location.unknown():
+
+    ################################################
+    ## Tiling (the IR below is already tiled and is parsed as-is)
+    ################################################
+
+    air_tiled_ir_string = """
+    #map = affine_map<()[s0] -> (s0 * 4)>
+    module {
+      func.func @conv_static_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%0 : memref<2x14x14x32xi32>, %1 : memref<3x3x32x64xi32>, %2 : memref<2x12x12x64xi32>) {
+        %c4 = arith.constant 4 : index
+        %c16 = arith.constant 16 : index
+        %c2 = arith.constant 2 : index
+        %c8 = arith.constant 8 : index
+        %c32 = arith.constant 32 : index
+        %c3 = arith.constant 3 : index
+        %c1 = arith.constant 1 : index
+        %c0_i32 = arith.constant 0 : i32
+        %c0 = arith.constant 0 : index
+        scf.parallel (%arg0, %arg1, %arg2, %arg3) = (%c0, %c0, %c0, %c0) to (%c2, %c3, %c3, %c16) step (%c1, %c1, %c1, %c1) {
+          %3 = affine.apply #map()[%arg1]
+          %4 = affine.apply #map()[%arg2]
+          %5 = affine.apply #map()[%arg3]
+          %subview = memref.subview %0[%arg0, %3, %4, 0] [1, 6, 6, 32] [1, 1, 1, 1] : memref<2x14x14x32xi32> to memref<1x6x6x32xi32, strided<[6272, 448, 32, 1], offset: ?>>
+          %subview_0 = memref.subview %1[0, 0, 0, %5] [3, 3, 32, 4] [1, 1, 1, 1] : memref<3x3x32x64xi32> to memref<3x3x32x4xi32, strided<[6144, 2048, 64, 1], offset: ?>>
+          %subview_1 = memref.subview %2[%arg0, %3, %4, %5] [1, 4, 4, 4] [1, 1, 1, 1] : memref<2x12x12x64xi32> to memref<1x4x4x4xi32, strided<[9216, 768, 64, 1], offset: ?>>
+          %alloc = memref.alloc() : memref<1x6x6x32xi32, 1>
+          memref.copy %subview, %alloc : memref<1x6x6x32xi32, strided<[6272, 448, 32, 1], offset: ?>> to memref<1x6x6x32xi32, 1>
+          %alloc_2 = memref.alloc() : memref<3x3x32x4xi32, 1>
+          memref.copy %subview_0, %alloc_2 : memref<3x3x32x4xi32, strided<[6144, 2048, 64, 1], offset: ?>> to memref<3x3x32x4xi32, 1>
+          %alloc_3 = memref.alloc() : memref<1x4x4x4xi32, 1>
+          scf.parallel (%arg4) = (%c0) to (%c4) step (%c1) {
+            %subview_4 = memref.subview %alloc[0, %arg4, 0, 0] [1, 3, 6, 32] [1, 1, 1, 1] : memref<1x6x6x32xi32, 1> to memref<1x3x6x32xi32, strided<[1152, 192, 32, 1], offset: ?>, 1>
+            %subview_5 = memref.subview %alloc_3[0, %arg4, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 1> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 1>
+            %alloc_6 = memref.alloc() : memref<1x1x4x4xi32, 2>
+            linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x4x4xi32, 2>)
+            %subview_7 = memref.subview %alloc_6[0, 0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x4x4xi32, 2> to memref<1x4x4xi32, strided<[16, 4, 1]>, 2>
+            scf.for %arg5 = %c0 to %c3 step %c1 {
+              scf.for %arg6 = %c0 to %c3 step %c1 {
+                scf.for %arg7 = %c0 to %c32 step %c8 {
+                  %subview_8 = memref.subview %subview_4[0, %arg5, %arg6, %arg7] [1, 1, 4, 8] [1, 1, 1, 1] : memref<1x3x6x32xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> to memref<1x1x4x8xi32, strided<[1152, 192, 32, 1], offset: ?>, 1>
+                  %subview_9 = memref.subview %alloc_2[%arg5, %arg6, %arg7, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<3x3x32x4xi32, 1> to memref<1x1x8x4xi32, strided<[384, 128, 4, 1], offset: ?>, 1>
+                  %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : memref<1x1x4x8xi32, strided<[1152, 192, 32, 1], offset: ?>, 1> to memref<1x4x8xi32, strided<[1152, 32, 1], offset: ?>, 1>
+                  %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xi32, strided<[384, 128, 4, 1], offset: ?>, 1> to memref<1x8x4xi32, strided<[384, 4, 1], offset: ?>, 1>
+                  %alloc_12 = memref.alloc() : memref<1x4x8xi32, 2>
+                  memref.copy %subview_10, %alloc_12 : memref<1x4x8xi32, strided<[1152, 32, 1], offset: ?>, 1> to memref<1x4x8xi32, 2>
+                  %alloc_13 = memref.alloc() : memref<1x8x4xi32, 2>
+                  memref.copy %subview_11, %alloc_13 : memref<1x8x4xi32, strided<[384, 4, 1], offset: ?>, 1> to memref<1x8x4xi32, 2>
+                  linalg.conv_1d_nwc_wcf {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%alloc_12, %alloc_13 : memref<1x4x8xi32, 2>, memref<1x8x4xi32, 2>) outs(%subview_7 : memref<1x4x4xi32, strided<[16, 4, 1]>, 2>)
+                  memref.dealloc %alloc_12 : memref<1x4x8xi32, 2>
+                  memref.dealloc %alloc_13 : memref<1x8x4xi32, 2>
+                }
+              }
+            }
+            memref.copy %alloc_6, %subview_5 : memref<1x1x4x4xi32, 2> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 1>
+            memref.dealloc %alloc_6 : memref<1x1x4x4xi32, 2>
+            scf.reduce 
+          }
+          memref.copy %alloc_3, %subview_1 : memref<1x4x4x4xi32, 1> to memref<1x4x4x4xi32, strided<[9216, 768, 64, 1], offset: ?>>
+          memref.dealloc %alloc : memref<1x6x6x32xi32, 1>
+          memref.dealloc %alloc_2 : memref<3x3x32x4xi32, 1>
+          memref.dealloc %alloc_3 : memref<1x4x4x4xi32, 1>
+          scf.reduce 
+        }
+        return
+      }
+    }
+    """
+    air_module = Module.parse(air_tiled_ir_string)
+
+    ################################################
+    ## Binding scf.parallel to air hierarchies
+    ################################################
+
+    # Passes run in list order. They are comma-joined and wrapped in a single
+    # "builtin.module(...)" pipeline string, which is then parsed and applied
+    # to air_module in place.
+    binding_passes = [
+        "buffer-results-to-out-params",
+        "air-par-to-herd{depth=1}",
+        "air-par-to-launch{has-air-segment=true}",
+        "air-copy-to-dma",
+        "canonicalize",
+        "cse",
+    ]
+
+    pipeline = "builtin.module(" + ",".join(binding_passes) + ")"
+
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ###############################################
+    # Extract event dependency and optimize schedule
+    ###############################################
+
+    # Scheduling pipeline: the passes below run in list order on air_module.
+    # The list is comma-joined and wrapped in one "builtin.module(...)" string
+    # so that a single PassManager handles the whole stage.
+    schedule_passes = [
+        "air-dependency",
+        "air-dependency-schedule-opt",
+        "air-specialize-dma-broadcast",
+        "air-dma-to-channel",
+        "canonicalize",
+        "cse",
+        "air-dependency-canonicalize",
+        "canonicalize",
+        "cse",
+        "air-isolate-async-dma-loop-nests",
+        "func.func(air-loop-fusion)",
+        "air-label-scf-for-to-ping-pong",
+        "air-ping-pong-transform{keep-memref-dealloc=true}",
+        "canonicalize",
+        "cse",
+        "air-specialize-channel-wrap-and-stride",
+        "canonicalize",
+        "cse",
+    ]
+
+    pipeline = "builtin.module(" + ",".join(schedule_passes) + ")"
+
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## Place herd to segment
+    ################################################
+
+    # Placement pipeline: the passes below run in list order, directly on
+    # air_module. The earlier `air_async_module = Module.parse(str(air_module))`
+    # round-trip was dropped: its result was never read, so it only cost an
+    # extra print and parse of the module.
+    placement_passes = [
+        "func.func(air-collapse-herd)",
+        "canonicalize",
+        "cse",
+        "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}",
+        "canonicalize",
+        "cse",
+        "func.func(air-renumber-dma)",
+        "func.func(convert-linalg-to-loops)",
+    ]
+
+    pipeline = "builtin.module(" + ",".join(placement_passes) + ")"
+
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR to MLIR-AIE
+    ################################################
+
+    # AIR-to-AIE lowering stage: both passes run in order on air_module, via
+    # one comma-joined "builtin.module(...)" pipeline string.
+    # The air-to-aie options are passed through verbatim inside the pass name.
+    aie_passes = [
+        "air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}",
+        "canonicalize",
+    ]
+
+    pipeline = "builtin.module(" + ",".join(aie_passes) + ")"
+
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    ################################################
+    ## MLIR-AIR runtime lowering
+    ################################################
+
+    # Runtime lowering stage: the passes below run in list order on air_module
+    # through one comma-joined "builtin.module(...)" pipeline string; the
+    # resulting module is then written out as text to aie.mlir.
+    runtime_passes = [
+        "air-to-std",
+        "canonicalize",
+        "symbol-dce",
+        "func.func(air-unroll-outer-affine-loops{depth=4})",
+        "affine-expand-index-ops",
+        "airrt-to-npu",
+        "canonicalize",
+    ]
+
+    pipeline = "builtin.module(" + ",".join(runtime_passes) + ")"
+    pm = air.passmanager.PassManager.parse(pipeline)
+    pm.run(air_module.operation)
+
+    with open("aie.mlir", "w") as out_file:
+        out_file.write(str(air_module))
diff --git a/test/xrt/13_conv2d_i32/run.lit b/test/xrt/13_conv2d_i32/run.lit
new file mode 100644
index 000000000..de5b9a9e2
--- /dev/null
+++ b/test/xrt/13_conv2d_i32/run.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+
+// REQUIRES: ryzen_ai, valid_xchess_license
+
+// RUN: %python %S/aie.py
+// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt aie.mlir
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt
diff --git a/test/xrt/13_conv2d_i32/test.cpp b/test/xrt/13_conv2d_i32/test.cpp
new file mode 100644
index 000000000..32b8ec30f
--- /dev/null
+++ b/test/xrt/13_conv2d_i32/test.cpp
@@ -0,0 +1,284 @@
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#define BATCH 2
+#define CHIN 32
+#define CHOUT 64
+#define K 3 // kernel extent; weights hold K * K taps per channel pair
+#define XIN 14
+#define YIN 14
+#define XOUT (XIN - K + 1) // one output per fully-overlapping kernel position
+#define YOUT (YIN - K + 1)
+
+#define A_VOLUME (BATCH * CHIN * XIN * YIN) // input element count
+#define B_VOLUME (CHIN * CHOUT * K * K) // weight element count
+#define C_VOLUME (BATCH * CHOUT * XOUT * YOUT) // output element count
+
+#define A_DATATYPE int32_t
+#define B_DATATYPE int32_t
+#define C_DATATYPE int32_t
+
+constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE)); // buffer size in bytes
+constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE)); // buffer size in bytes
+constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE)); // buffer size in bytes
+constexpr int TRACE_SIZE = (0 * sizeof(uint32_t)); // evaluates to 0
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` is present in `vm_in` and the
+// path stored under it can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (vm_in.count(name) == 0)
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream probe(path);
+  if (!probe)
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+}
+
+// Reads `instr_path` line by line, taking one hexadecimal 32-bit word from the
+// start of each line. Throws std::runtime_error on the first unparsable line.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::vector<uint32_t> words;
+  std::ifstream input(instr_path);
+  for (std::string text; std::getline(input, text);) {
+    std::istringstream parser(text);
+    uint32_t word;
+    if (!(parser >> std::hex >> word))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    words.push_back(word);
+  }
+  return words;
+}
+
+// Reference 2-D convolution for NCHW input `a`, FCHW weights `b` and NCHW
+// output `r`: r[n][f][y][x] = sum over (c, ky, kx) of a[n][c][y+ky][x+kx] *
+// b[f][c][ky][kx]. Inputs are read through const references (no per-call
+// copies) and must not alias `r`; `r` must hold at least C_VOLUME elements.
+// NOTE(review): for signed T the products/sums can overflow -- confirm the
+// input value range keeps them representable.
+template <typename T>
+void conv_out_nchw_fchw(const std::vector<T> &a, const std::vector<T> &b,
+                        std::vector<T> &r) {
+  for (size_t batch = 0; batch < BATCH; batch++)
+    for (size_t cout = 0; cout < CHOUT; cout++)
+      for (size_t y = 0; y < YOUT; y++)
+        for (size_t x = 0; x < XOUT; x++) {
+          T acc = (T)(0);
+          for (size_t cin = 0; cin < CHIN; cin++)
+            for (size_t ky = 0; ky < K; ky++)
+              for (size_t kx = 0; kx < K; kx++)
+                acc += a[batch * CHIN * XIN * YIN + cin * XIN * YIN +
+                         (y + ky) * XIN + x + kx] *
+                       b[cout * CHIN * K * K + cin * K * K + ky * K + kx];
+          r[batch * CHOUT * XOUT * YOUT + cout * XOUT * YOUT + y * XOUT + x] =
+              acc;
+        }
+}
+
+// Reference 2-D convolution for NHWC input `a`, HWCF weights `b` and NHWC
+// output `r`: r[n][y][x][f] = sum over (c, ky, kx) of a[n][y+ky][x+kx][c] *
+// b[ky][kx][c][f]. Inputs are read through const references (no per-call
+// copies) and must not alias `r`; `r` must hold at least C_VOLUME elements.
+// NOTE(review): for signed T the products/sums can overflow -- confirm the
+// input value range keeps them representable.
+template <typename T>
+void conv_out_nhwc_hwcf(const std::vector<T> &a, const std::vector<T> &b,
+                        std::vector<T> &r) {
+  for (size_t batch = 0; batch < BATCH; batch++)
+    for (size_t cout = 0; cout < CHOUT; cout++)
+      for (size_t y = 0; y < YOUT; y++)
+        for (size_t x = 0; x < XOUT; x++) {
+          T acc = (T)(0);
+          for (size_t cin = 0; cin < CHIN; cin++)
+            for (size_t ky = 0; ky < K; ky++)
+              for (size_t kx = 0; kx < K; kx++)
+                acc += a[batch * CHIN * XIN * YIN + (y + ky) * XIN * CHIN +
+                         (x + kx) * CHIN + cin] *
+                       b[ky * CHIN * CHOUT * K + kx * CHOUT * CHIN +
+                         cin * CHOUT + cout];
+          r[batch * CHOUT * XOUT * YOUT + y * XOUT * CHOUT + x * CHOUT +
+            cout] = acc;
+        }
+}
+
+// Write each 32-bit trace word to `path` as 8 zero-padded hex digits per line.
+// The index is size_t so it matches the unsigned bound derived from trace_size.
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+  std::ofstream fout(path);
+  const uint32_t *words = (const uint32_t *)traceOutPtr;
+  for (size_t i = 0; i < trace_size / sizeof(words[0]); i++)
+    fout << std::setfill('0') << std::setw(8) << std::hex << words[i] << '\n';
+}
+
+// Host-side driver: parses the command line, loads the xclbin and instruction
+// file, runs the kernel once on random inputs and compares the device output
+// with conv_out_nhwc_hwcf. Returns 0 only when every output element matches.
+int main(int argc, const char *argv[]) {
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6")(
+      "trace_sz,t", po::value<int>()->default_value(0),
+      "size of trace buffer (in bytes)")(
+      "trace_file", po::value<std::string>()->default_value("trace.txt"),
+      "where to store trace output");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  int trace_size = vm["trace_sz"].as<int>();
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin (first one whose name starts with Node)
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   std::cout << "Name: " << name << std::endl;
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "No kernel whose name starts with '" << Node
+              << "' was found in the xclbin\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a =
+      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b =
+      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY,
+                      kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+  A_DATATYPE *bufA = bo_a.map<A_DATATYPE *>();
+  std::vector<A_DATATYPE> AVec;
+  for (int i = 0; i < A_VOLUME; i++)
+    AVec.push_back(rand() % UINT16_MAX);
+  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE)));
+  B_DATATYPE *bufB = bo_b.map<B_DATATYPE *>();
+  std::vector<B_DATATYPE> BVec;
+  for (int i = 0; i < B_VOLUME; i++)
+    BVec.push_back(rand() % UINT16_MAX);
+  memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE)));
+  C_DATATYPE *bufC = bo_c.map<C_DATATYPE *>();
+  memset(bufC, 0, C_SIZE + trace_size);
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+  run.wait();
+  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+  int errors = 0;
+  int max_errors = 100;
+  std::vector<C_DATATYPE> output_ref0(C_VOLUME, 0);
+  conv_out_nhwc_hwcf(AVec, BVec, output_ref0);
+
+  for (uint32_t i = 0; i < C_VOLUME; i++) {
+    if (bufOut[i] != output_ref0[i]) {
+      errors++;
+      if (errors < max_errors) {
+        std::cout << "\nerror, id " << i << " expected "
+                  << std::to_string(output_ref0[i]) << ", got "
+                  << std::to_string(bufOut[i]) << "\n";
+      }
+    }
+  }
+
+  if (trace_size > 0) {
+    write_out_trace(((char *)bufC) + C_SIZE, trace_size,
+                    vm["trace_file"].as<std::string>());
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  }
+  std::cout << "\nerror count: " << errors << "\n\n";
+  std::cout << "\nfailed.\n\n";
+  return 1;
+}