triton-lang · binarman · Jul 2, 2024 · Aug 16, 2024 · Aug 28, 2024
@@ -102,6 +102,8 @@ using namespace mlir::triton;
 // Builder shorthands: each macro below expands to rewriter.create<Op>(loc, ...),
 // so a `rewriter` and a `loc` must be in scope at every use site. The macro
 // arguments are forwarded unchanged to the op builder.
 #define undef(...) rewriter.create<LLVM::UndefOp>(loc, __VA_ARGS__)
 #define null(...) rewriter.create<LLVM::ZeroOp>(loc, __VA_ARGS__)
 #define call(...) rewriter.create<LLVM::CallOp>(loc, __VA_ARGS__)
 // Same pattern as `call`, but creates an LLVM::CallIntrinsicOp.
+#define call_intrinsic(...)                                                    \
+  rewriter.create<LLVM::CallIntrinsicOp>(loc, __VA_ARGS__)
 
 // Types
 #define int_ty(width) rewriter.getIntegerType(width)
@@ -1473,6 +1475,22 @@ inline bool isLayoutMmaV1(Attribute layout) {
   return isMmaV1;
 }
 
+/// Returns `smemObj` in three-dimensional form.
+///
+/// An object that already carries three strides is returned unchanged.
+/// Otherwise a leading dimension is prepended with stride
+/// `shape[0] * shape[1]` and offset 0; the base pointer and base element type
+/// are taken over from `smemObj`.
+///
+/// `shape` must provide at least two extents. When it does not, the object is
+/// returned unchanged instead of indexing `shape` out of bounds.
+///
+/// NOTE(review): the product is computed in int64_t and then handed to
+/// i32_val; confirm that callers never pass extents whose product exceeds the
+/// 32-bit range.
+inline SharedMemoryObject
+getExpandedSharedMemoryObject(ConversionPatternRewriter &rewriter, Location loc,
+                              SharedMemoryObject smemObj,
+                              ArrayRef<int64_t> shape) {
+  auto strides = smemObj.getStrides();
+  // Nothing to prepend for an object that already has three strides, and
+  // nothing safe to compute when `shape` lacks the two extents read below.
+  if (strides.size() == 3 || shape.size() < 2)
+    return smemObj;
+  auto offsets = smemObj.getOffsets();
+  // i32_val is defined outside this view; presumably it picks up `rewriter`
+  // and `loc` the same way the other builder macros do.
+  strides.insert(strides.begin(), i32_val(shape[0] * shape[1]));
+  offsets.insert(offsets.begin(), i32_val(0));
+  return SharedMemoryObject(smemObj.getBase(), smemObj.getBaseElemType(),
+                            strides, offsets);
+}
+
 } // namespace mlir
 
 #endif
@@ -131,6 +131,16 @@ void dumpHWLayout(RankedTensorType tensorType);
 // Return a string representation of the layout of the tensor.
 std::string getLayoutStr(RankedTensorType tensorType, bool useHWPointOfView);
 
+/// Left-pads shape `s` with 1s until it has three entries, e.g. {M, N} becomes
+/// {1, M, N}.
+///
+/// A shape that already has three or more entries is returned as an unchanged
+/// copy. The explicit check matters: `3 - s.size()` is an unsigned
+/// subtraction, so without it a longer shape would wrap around and request an
+/// enormous vector.
+template <typename T>
+llvm::SmallVector<T> expandMatrixShapeWithBatch(llvm::ArrayRef<T> s) {
+  const auto rank = s.size();
+  if (rank >= 3)
+    return llvm::SmallVector<T>(s.begin(), s.end());
+  // rank < 3 here, so the subtraction cannot underflow.
+  llvm::SmallVector<T> expanded(3 - rank, 1);
+  expanded.append(s.begin(), s.end());
+  return expanded;
+}
+
+llvm::SmallVector<unsigned>
+expandMatrixOrderWithBatch(llvm::ArrayRef<unsigned> o);
+
 } // namespace gpu
 } // namespace triton
 } // namespace mlir

@@ -180,6 +180,9 @@ bool isPureUnaryInlineAsm(Operation *op);
 // read the compute capability from the module attributes
 int getNVIDIAComputeCapability(Operation *module);
 
+// read the AMD target architecture from the module attributes
+StringRef getAMDArch(Operation *module);
+
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
@@ -482,12 +482,18 @@ bool supportMMA(triton::DotOp op, int version) {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16
   auto aElemTy = op.getA().getType().getElementType();
   auto bElemTy = op.getB().getType().getElementType();
+  auto retType = op.getType();
+  auto retShapePerCTA = getShapePerCTA(retType);
+  auto rank = retShapePerCTA.size();
+  auto aTensorTy = cast<RankedTensorType>(op.getA().getType());
+  auto aShape = aTensorTy.getShape();
+  auto encoding = cast<DotOperandEncodingAttr>(aTensorTy.getEncoding());
+  if (retShapePerCTA[rank - 2] < 16 || retShapePerCTA[rank - 1] < 16 ||
+      aShape[rank - 1] < 16)
+    return false;
   if (version == 3) {
     if (triton::tools::getBoolEnv("DISABLE_MMA_V3"))
       return false;
-    auto retType = op.getType();
-    auto retShapePerCTA = getShapePerCTA(retType);
-    auto rank = retShapePerCTA.size();
     auto mod = op->getParentOfType<ModuleOp>();
     int numWarps = TritonGPUDialect::getNumWarps(mod);
     // TODO(Keren): for now, fallback to MMAv2 if handling batch matmul.