Skip to content

Commit

Permalink
Adds zero dimension check and related tests
Browse files Browse the repository at this point in the history
  • Loading branch information
hmalgewatta committed Nov 19, 2024
1 parent d87893f commit 7b20b6b
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 13 deletions.
20 changes: 20 additions & 0 deletions test/Conversion/amd/invalid_extractslice_to_llvm.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,26 @@ tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibili

// -----

// Invalid zero source dimension
// The source tensor has extent 0 along dimension 1 (tensor<256x0xi32>), so the
// verifier must reject the op before any slice-size/offset checks run. The
// source-dimension check fires even though the requested result (256x16) is
// also larger than the source along that dimension.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x0xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{source tensor dimension size zero at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x0xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid zero result dimension
// The requested result tensor has extent 0 along dimension 1 (tensor<256x0xi32>)
// while the source (256x128) is well-formed; the verifier must reject a
// zero-sized result slice. The result-dimension check is emitted for
// dimension 1, matching the diagnostic asserted below.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result tensor dimension size zero at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x0xi32, #blocked1>
tt.return
}

// -----

// Invalid offset, not multiple of shapePerTile
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
Expand Down
6 changes: 6 additions & 0 deletions third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ LogicalResult ExtractSliceOp::verify() {
for (auto i = 0; i < 2; ++i) {
auto resultDimSize = resultTy.getDimSize(i);
auto srcDimSize = srcTy.getDimSize(i);
if (resultDimSize == 0) {
return emitError("result tensor dimension size zero at dimension ") << i;
}
if (srcDimSize == 0) {
return emitError("source tensor dimension size zero at dimension ") << i;
}
if (resultDimSize > srcDimSize) {
return emitError(
"result shape cannot be larger than input shape at dimension ")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

using namespace mlir;
using namespace mlir::triton;
using namespace mlir::triton::gpu;
namespace tta = mlir::triton::amdgpu;

// clang-format off
/***
Expand Down Expand Up @@ -66,11 +64,13 @@ struct ExtractSliceOpConversion
auto srcShape = srcTy.getShape();
auto resultTy = cast<RankedTensorType>(op.getType());
auto vals = unpackLLElements(loc, adaptor.getSource(), rewriter);
auto elemsPerThread = mlir::triton::gpu::getElemsPerThread(srcTy);
auto sizePerThread = getSizePerThread(srcLayout);
auto totalSizePerThread = sizePerThread[0] * sizePerThread[1];
auto order = getOrder(srcLayout);
auto shapePerCTA = getShapePerCTATile(srcLayout, srcShape);
auto elemsPerThread = triton::gpu::getElemsPerThread(srcTy);
auto sizePerThread = triton::gpu::getSizePerThread(srcLayout);
auto totalSizePerThread = product<unsigned>(sizePerThread);
auto order = triton::gpu::getOrder(srcLayout);

// Calculate valid total number of workers in each dimension
auto shapePerCTA = triton::gpu::getShapePerCTATile(srcLayout, srcShape);
shapePerCTA[0] =
std::min(static_cast<unsigned>(srcShape[0]), shapePerCTA[0]);
shapePerCTA[1] =
Expand All @@ -85,12 +85,12 @@ struct ExtractSliceOpConversion
auto offsets = op.getStaticOffsets();

// Calculate offsets and sizes in terms of CTA units.
std::vector<long int> CTAOffsets{offsets[0] / shapePerCTA[0],
offsets[1] / shapePerCTA[1]};
std::vector<long int> CTASizes{sizes[0] / shapePerCTA[0],
sizes[1] / shapePerCTA[1]};
std::vector<long int> CTAPerShape{srcShape[0] / shapePerCTA[0],
srcShape[1] / shapePerCTA[1]};
std::array<int64_t,2> CTAOffsets{offsets[0] / shapePerCTA[0],
offsets[1] / shapePerCTA[1]};
std::array<int64_t,2> CTASizes{sizes[0] / shapePerCTA[0],
sizes[1] / shapePerCTA[1]};
std::array<int64_t,2> CTAPerShape{srcShape[0] / shapePerCTA[0],
srcShape[1] / shapePerCTA[1]};

// The diagram above illustrates the graphical representation of the
// skipElems, tensorStride, and lastIdx variables.
Expand Down

0 comments on commit 7b20b6b

Please sign in to comment.