Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMD] Define an extract slice operation #4804

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ jobs:
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py \
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/integration-tests.yml.in
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ jobs:
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py \
Expand Down
111 changes: 111 additions & 0 deletions test/Conversion/amd/invalid_extractslice_to_llvm.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// RUN: triton-opt -split-input-file %s --convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics

// Invalid size
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{sizes [256, 2] must be a multiple of shapePerCTATile [256, 16]}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x2xi32, #blocked1>
tt.return
}

// -----

// Invalid zero source dimension
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x0xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{source tensor dimension size zero at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x0xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid zero result dimension
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result tensor dimension size zero at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x0xi32, #blocked1>
tt.return
}

// -----

// Invalid offset, not multiple of shapePerTile
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{offset [0, 5] must be a multiple of shapePerCTATile [256, 16]}}
%1 = amdgpu.extract_slice %arg0 [0,5] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid offset, out of bounds for dimension
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{invalid offset 128 at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,128] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid result layout
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_layout(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result layout must match source layout}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked2>
tt.return
}

// -----

// Invalid result element type
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_element_type(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result element type must match source element type}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi64, #blocked1>
tt.return
}

// -----

// Invalid result rank
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result rank must be equal to source rank}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
tt.return
}

// -----

// Invalid result shape
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
// Renamed from @invalid_result_rank (copy-paste from the previous case): this
// split checks the result-*shape* diagnostic, not the rank diagnostic.
tt.func @invalid_result_shape(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
  // expected-error @+1 {{result shape cannot be larger than input shape at dimension 1}}
  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x256xi32, #blocked1>
  tt.return
}
}

// -----

// Invalid rank
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_rank(%arg0: tensor<256x128x2xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{currently only 2D tensors are supported}}
%1 = amdgpu.extract_slice %arg0 [0,0,0] : tensor<256x128x2xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
tt.return
}

// -----

// Invalid non static offset
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}, %arg1: i32) {
// expected-error @+2 {{expected ']'}}
// expected-error @+1 {{expected integer value}}
%2 = amdgpu.extract_slice %arg0 [%arg1, 0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}
14 changes: 14 additions & 0 deletions test/TritonGPU/amd/amd-extractslice-op.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s

#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
// Renamed from @basic_insert_slice (copy-paste misnomer): this test lowers
// amdgpu.extract_slice to LLVM. The undef CHECK no longer pins the SSA value
// number (%64), which is brittle against unrelated codegen changes.
tt.func @basic_extract_slice(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
  // CHECK: llvm.func @basic_extract_slice
  // CHECK-COUNT-64: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)>
  // CHECK: %{{[0-9]*}} = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
  // CHECK-COUNT-8: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
  %72 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
  tt.return
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@ include "mlir/IR/EnumAttr.td"
include "triton/Dialect/Triton/IR/TritonTypes.td"
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td" // Pure
hmalgewatta marked this conversation as resolved.
Show resolved Hide resolved
include "triton/Dialect/Triton/IR/TritonInterfaces.td"
include "TritonAMDGPUDialect.td"
include "TritonAMDGPUAttrDefs.td"


class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])> {
}
Expand All @@ -44,6 +46,74 @@ class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
//
def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;

//===----------------------------------------------------------------------===//
// ExtractSliceOp
//===----------------------------------------------------------------------===//

def ExtractSliceOp
: TT_AMDGPU_Op<"extract_slice", [Pure]> {
let summary = "extract slice operation";
let description = [{
The "extract_slice" operation enables extracting a slice of a tensor in
registers.

The "extract_slice" operation supports the following arguments:

* source: the base tensor on which to create a view tensor
* offsets: offsets into the base tensor at which to create the view

Example 1:

```mlir
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8],
threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8],
threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [0, 1]}>
%1 = triton_gpu.convert_layout %0 : tensor<128x128xf16, #blocked>
-> tensor<128x128xf16, #blocked1>
// create a slice of base tensor %1 with static offsets
%2 = amdgpu.extract_slice %1 [0, 0] :
tensor<128x128xf16, #blocked1> to tensor<128x32xf16, #blocked1>
```

Example 1 shows how "extract_slice" operation may be used. In this example a
new slice of 128x32 is created. "extract_slice" works on tensors with layout
where the desired slice has the same layout as the source tensor.
"%0" cannot be sliced directly as the resulting slice cannot have the same
layout as "%0". Therefore it needs to be converted to a layout suitable
for slicing. "#blocked1" layout is appropriate for this as it keeps the
sizePerThread the same thus keeping coalescing properties the same.
In order to utilize all threads in a warp, "threadsPerWarp" is set to
[16,4] for this new layout. This layout conversion carried out before
using "extract_slice" ensures slicing still uses all threads efficiently. The
size of the slice is determined by the result type.
}];

let arguments = (ins AnyRankedTensor:$source,
DenseI64ArrayAttr:$static_offsets);
let results = (outs AnyRankedTensor:$result);

let builders = [
// Build a ExtractSliceOp with static offsets and the same result type
OpBuilder<(ins "RankedTensorType":$resultType,
"Value":$source,
"ArrayRef<int64_t>": $static_offsets)>,
];

let extraClassDeclaration = [{
std::array<unsigned, 3> getArrayAttrMaxRanks() {
unsigned rank = getSource().getType().getRank();
return {rank, rank, rank};
}
}];

let assemblyFormat = [{
$source $static_offsets attr-dict `:` type($source) `to` type($result)
}];

let hasVerifier = 1;
}

def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
let summary = "A placeholder op for instruction scheduling hints within a basic block";
let description = [{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef TRITONAMDGPU_TO_LLVM_PATTERNS_AMDGPU_OP_TO_LLVM_H
#define TRITONAMDGPU_TO_LLVM_PATTERNS_AMDGPU_OP_TO_LLVM_H

#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "triton/Analysis/AxisInfo.h"

// NOTE: no `using namespace mlir;` here — a using-directive in a header leaks
// into every including TU. All names below are fully qualified instead.

namespace mlir::triton::AMD {

/// Registers the amdgpu.extract_slice -> LLVM dialect lowering pattern into
/// `patterns`, using `typeConverter` for type materialization and `benefit`
/// for pattern ordering.
void populateExtractSliceOpToLLVMPatterns(
    mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
    mlir::PatternBenefit benefit);

} // namespace mlir::triton::AMD

#endif // TRITONAMDGPU_TO_LLVM_PATTERNS_AMDGPU_OP_TO_LLVM_H
77 changes: 77 additions & 0 deletions third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@

#include "llvm/ADT/TypeSwitch.h"

#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

// clang-format off
#include "Dialect/TritonAMDGPU/IR/Dialect.h"
#include "Dialect/TritonAMDGPU/IR/Dialect.cpp.inc"
Expand All @@ -53,3 +55,78 @@ void mlir::triton::amdgpu::TritonAMDGPUDialect::initialize() {

#define GET_OP_CLASSES
#include "Dialect/TritonAMDGPU/IR/Ops.cpp.inc"

namespace mlir::triton::amdgpu {

/// Verifies amdgpu.extract_slice:
///   - source and result are 2D tensors of the same rank, element type, and
///     layout (the slice must be expressible without a layout change);
///   - both offsets and result sizes are multiples of shapePerCTATile, so the
///     slice maps onto whole CTA tiles of the source tensor;
///   - the slice is in bounds and no dimension is zero-sized.
LogicalResult ExtractSliceOp::verify() {
  auto srcTy = getSource().getType();
  auto resultTy = getResult().getType();

  // The offset/size checks below index dimensions 0 and 1 directly, so reject
  // anything that is not 2D up front (also matches the op's documented scope).
  if (srcTy.getRank() != 2)
    return emitError("currently only 2D tensors are supported");

  if (resultTy.getRank() != srcTy.getRank())
    return emitError("result rank must be equal to source rank");

  // `source` is declared AnyRankedTensor, so getType() is already a
  // RankedTensorType — no dyn_cast needed.
  if (srcTy.getElementType() != resultTy.getElementType())
    return emitError("result element type must match source element type");

  auto srcLayout = srcTy.getEncoding();
  auto resultLayout = resultTy.getEncoding();
  if (srcLayout != resultLayout)
    return emitError("result layout must match source layout");

  auto srcShape = srcTy.getShape();
  auto shapePerCTATile =
      mlir::triton::gpu::getShapePerCTATile(srcLayout, srcShape);
  // A tile can be larger than the tensor itself; clamp so the multiple-of
  // checks below are phrased against the effective tile covering the tensor.
  shapePerCTATile[0] =
      std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
  shapePerCTATile[1] =
      std::min(static_cast<unsigned>(srcShape[1]), shapePerCTATile[1]);

  // ExtractSlice only supports slicing where offsets and sizes are multiples
  // of shapePerCTATile. This condition ensures that the slice has the same
  // layout as the original tensor.
  auto offsets = getStaticOffsets();
  if (offsets.size() != 2)
    return emitError("invalid offset shape ") << offsets;

  SmallVector<int64_t, 2> sizes;
  for (auto i = 0; i < 2; ++i) {
    auto resultDimSize = resultTy.getDimSize(i);
    auto srcDimSize = srcTy.getDimSize(i);
    if (resultDimSize == 0)
      return emitError("result tensor dimension size zero at dimension ") << i;
    if (srcDimSize == 0)
      return emitError("source tensor dimension size zero at dimension ") << i;
    if (resultDimSize > srcDimSize)
      return emitError(
                 "result shape cannot be larger than input shape at dimension ")
             << i;
    // The slice must end within the source tensor.
    if (offsets[i] + resultDimSize > srcDimSize)
      return emitError("invalid offset ") << offsets[i] << " at dimension "
                                          << i;
    sizes.push_back(resultDimSize);
  }

  if (sizes[0] % shapePerCTATile[0] != 0 ||
      sizes[1] % shapePerCTATile[1] != 0) {
    return emitError() << "sizes [" << sizes
                       << "] must be a multiple of shapePerCTATile ["
                       << shapePerCTATile << "]";
  }

  if (offsets[0] % shapePerCTATile[0] != 0 ||
      offsets[1] % shapePerCTATile[1] != 0) {
    return emitError() << "offset [" << offsets
                       << "] must be a multiple of shapePerCTATile ["
                       << shapePerCTATile << "]";
  }

  return success();
}
} // namespace mlir::triton::amdgpu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_triton_library(TritonAMDGPUDialectToLLVM
TritonAMDGPUToLLVMPatterns.cpp
ExtractSliceOpToLLVM.cpp

DEPENDS
TritonAMDGPUIR
Expand Down
Loading
Loading