[Transform][Tiling] Add deep tile support for matmul #90

zhczhong · 2024-05-20T07:51:52Z

Tracking #53

TODO:

zhczhong · 2024-06-05T03:21:39Z

Support use linalgx.batch_reduce_vnni(bf16xbf16->f32) and fuse the cast(f32->bf16) to the last loop about K axis

func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %cst_0 = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
    %2 = linalgx.mm4d_vnni ins(%arg0, %arg1 : tensor<128x128x32x32xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<128x128x32x32xbf16>)  -> tensor<128x128x32x32xbf16>
    return %2 : tensor<128x128x32x32xbf16>
}

will be transformed into

#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * 64 + s0 + s1)>
module {
  func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %c1 = arith.constant 1 : index
    %c128 = arith.constant 128 : index
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %0) -> (tensor<128x128x32x32xbf16>) {
      %2 = affine.apply #map(%arg2)
      %3 = affine.apply #map(%arg3)
      %extracted_slice = tensor.extract_slice %arg4[%2, %3, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<64x64x32x32xbf16>
      %4 = scf.for %arg5 = %c0 to %c64 step %c2 iter_args(%arg6 = %extracted_slice) -> (tensor<64x64x32x32xbf16>) {
        %extracted_slice_0 = tensor.extract_slice %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> to tensor<2x64x32x32xbf16>
        %7 = scf.for %arg7 = %c0 to %c64 step %c2 iter_args(%arg8 = %extracted_slice_0) -> (tensor<2x64x32x32xbf16>) {
          %extracted_slice_1 = tensor.extract_slice %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> to tensor<2x2x32x32xbf16>
          %8 = tensor.empty() : tensor<2x2x32x32xf32>
          %9 = scf.for %arg9 = %c0 to %c128 step %c2 iter_args(%arg10 = %8) -> (tensor<2x2x32x32xf32>) {
            %11 = scf.for %arg11 = %c0 to %c2 step %c1 iter_args(%arg12 = %arg10) -> (tensor<2x2x32x32xf32>) {
              %extracted_slice_3 = tensor.extract_slice %arg12[%arg11, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x2x32x32xf32>
              %12 = scf.for %arg13 = %c0 to %c2 step %c1 iter_args(%arg14 = %extracted_slice_3) -> (tensor<1x2x32x32xf32>) {
                %13 = affine.apply #map1(%arg2)[%arg11, %arg5]
                %extracted_slice_5 = tensor.extract_slice %arg0[%13, %arg9, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<2x32x32xbf16>
                %14 = affine.apply #map1(%arg3)[%arg13, %arg7]
                %extracted_slice_6 = tensor.extract_slice %arg1[%14, %arg9, 0, 0, 0] [1, 2, 16, 32, 2] [1, 1, 1, 1, 1] : tensor<128x128x16x32x2xbf16> to tensor<2x16x32x2xbf16>
                %extracted_slice_7 = tensor.extract_slice %arg14[0, %arg13, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> to tensor<32x32xf32>
                %15 = arith.cmpi eq, %arg9, %c0 : index
                %16 = scf.if %15 -> (tensor<32x32xf32>) {
                  %17 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_7 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  %18 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_5, %extracted_slice_6 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%17 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %18 : tensor<32x32xf32>
                } else {
                  %17 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_5, %extracted_slice_6 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%extracted_slice_7 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %17 : tensor<32x32xf32>
                }
                %inserted_slice_8 = tensor.insert_slice %16 into %arg14[0, %arg13, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xf32> into tensor<1x2x32x32xf32>
                scf.yield %inserted_slice_8 : tensor<1x2x32x32xf32>
              }
              %inserted_slice_4 = tensor.insert_slice %12 into %arg12[%arg11, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> into tensor<2x2x32x32xf32>
              scf.yield %inserted_slice_4 : tensor<2x2x32x32xf32>
            }
            scf.yield %11 : tensor<2x2x32x32xf32>
          }
          %10 = linalg.copy ins(%9 : tensor<2x2x32x32xf32>) outs(%extracted_slice_1 : tensor<2x2x32x32xbf16>) -> tensor<2x2x32x32xbf16>
          %inserted_slice_2 = tensor.insert_slice %10 into %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> into tensor<2x64x32x32xbf16>
          scf.yield %inserted_slice_2 : tensor<2x64x32x32xbf16>
        }
        %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> into tensor<64x64x32x32xbf16>
        scf.yield %inserted_slice : tensor<64x64x32x32xbf16>
      }
      %5 = affine.apply #map(%arg2)
      %6 = affine.apply #map(%arg3)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %4 into %arg4[%5, %6, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> into tensor<128x128x32x32xbf16>
      }
    }
    return %1 : tensor<128x128x32x32xbf16>
  }
}

lib/gc/Transforms/DeepTileContractionNamedOp.cpp

zhczhong · 2024-06-13T08:35:27Z

Update: Fuse the cast(f32->bf16) to the innermost loop

func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %cst_0 = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
    %2 = linalgx.mm4d_vnni ins(%arg0, %arg1 : tensor<128x128x32x32xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<128x128x32x32xbf16>)  -> tensor<128x128x32x32xbf16>
    return %2 : tensor<128x128x32x32xbf16>
}

will be transformed to

#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * 64 + s0 + s1)>
module {
  func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %c1 = arith.constant 1 : index
    %c128 = arith.constant 128 : index
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %0) -> (tensor<128x128x32x32xbf16>) {
      %2 = affine.apply #map(%arg2)
      %3 = affine.apply #map(%arg3)
      %extracted_slice = tensor.extract_slice %arg4[%2, %3, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<64x64x32x32xbf16>
      %4 = scf.for %arg5 = %c0 to %c64 step %c2 iter_args(%arg6 = %extracted_slice) -> (tensor<64x64x32x32xbf16>) {
        %extracted_slice_0 = tensor.extract_slice %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> to tensor<2x64x32x32xbf16>
        %7 = scf.for %arg7 = %c0 to %c64 step %c2 iter_args(%arg8 = %extracted_slice_0) -> (tensor<2x64x32x32xbf16>) {
          %extracted_slice_1 = tensor.extract_slice %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> to tensor<2x2x32x32xbf16>
          %8 = tensor.empty() : tensor<2x2x32x32xf32>
          %9:2 = scf.for %arg9 = %c0 to %c128 step %c2 iter_args(%arg10 = %8, %arg11 = %extracted_slice_1) -> (tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>) {
            %10:2 = scf.for %arg12 = %c0 to %c2 step %c1 iter_args(%arg13 = %arg10, %arg14 = %arg11) -> (tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>) {
              %extracted_slice_3 = tensor.extract_slice %arg13[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x2x32x32xf32>
              %extracted_slice_4 = tensor.extract_slice %arg14[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> to tensor<1x2x32x32xbf16>
              %11:2 = scf.for %arg15 = %c0 to %c2 step %c1 iter_args(%arg16 = %extracted_slice_3, %arg17 = %extracted_slice_4) -> (tensor<1x2x32x32xf32>, tensor<1x2x32x32xbf16>) {
                %12 = affine.apply #map1(%arg2)[%arg12, %arg5]
                %extracted_slice_7 = tensor.extract_slice %arg0[%12, %arg9, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<2x32x32xbf16>
                %13 = affine.apply #map1(%arg3)[%arg15, %arg7]
                %extracted_slice_8 = tensor.extract_slice %arg1[%13, %arg9, 0, 0, 0] [1, 2, 16, 32, 2] [1, 1, 1, 1, 1] : tensor<128x128x16x32x2xbf16> to tensor<2x16x32x2xbf16>
                %extracted_slice_9 = tensor.extract_slice %arg16[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> to tensor<32x32xf32>
                %extracted_slice_10 = tensor.extract_slice %arg17[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<32x32xbf16>
                %14 = arith.cmpi eq, %arg9, %c0 : index
                %15 = scf.if %14 -> (tensor<32x32xf32>) {
                  %18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_9 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  %19 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%18 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %19 : tensor<32x32xf32>
                } else {
                  %18 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%extracted_slice_9 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %18 : tensor<32x32xf32>
                }
                %16 = arith.cmpi eq, %arg9, %c0 : index
                %17 = scf.if %16 -> (tensor<32x32xbf16>) {
                  %18 = linalg.copy ins(%15 : tensor<32x32xf32>) outs(%extracted_slice_10 : tensor<32x32xbf16>) -> tensor<32x32xbf16>
                  scf.yield %18 : tensor<32x32xbf16>
                } else {
                  scf.yield %extracted_slice_10 : tensor<32x32xbf16>
                }
                %inserted_slice_11 = tensor.insert_slice %15 into %arg16[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xf32> into tensor<1x2x32x32xf32>
                %inserted_slice_12 = tensor.insert_slice %17 into %arg17[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xbf16> into tensor<1x2x32x32xbf16>
                scf.yield %inserted_slice_11, %inserted_slice_12 : tensor<1x2x32x32xf32>, tensor<1x2x32x32xbf16>
              }
              %inserted_slice_5 = tensor.insert_slice %11#0 into %arg13[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> into tensor<2x2x32x32xf32>
              %inserted_slice_6 = tensor.insert_slice %11#1 into %arg14[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> into tensor<2x2x32x32xbf16>
              scf.yield %inserted_slice_5, %inserted_slice_6 : tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>
            }
            scf.yield %10#0, %10#1 : tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>
          }
          %inserted_slice_2 = tensor.insert_slice %9#1 into %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> into tensor<2x64x32x32xbf16>
          scf.yield %inserted_slice_2 : tensor<2x64x32x32xbf16>
        }
        %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> into tensor<64x64x32x32xbf16>
        scf.yield %inserted_slice : tensor<64x64x32x32xbf16>
      }
      %5 = affine.apply #map(%arg2)
      %6 = affine.apply #map(%arg3)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %4 into %arg4[%5, %6, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> into tensor<128x128x32x32xbf16>
      }
    }
    return %1 : tensor<128x128x32x32xbf16>
  }
}

include/gc/IR/EasyBuildSCF.h

include/gc/Dialect/Arith/Utils/EasyBuild.h

zhczhong added the WIP work in progress label May 20, 2024

zhczhong force-pushed the zhicong/deep_tile_matmul branch 3 times, most recently from 7c8cfbb to 927322a Compare May 23, 2024 06:11

zhczhong force-pushed the zhicong/deep_tile_matmul branch 6 times, most recently from ea02416 to f261c3c Compare June 3, 2024 03:47

zhczhong force-pushed the zhicong/deep_tile_matmul branch 5 times, most recently from 5ed4fc1 to 22d86d4 Compare June 5, 2024 03:21

yifeizh2 reviewed Jun 13, 2024

View reviewed changes

lib/gc/Transforms/DeepTileContractionNamedOp.cpp Outdated Show resolved Hide resolved

yifeizh2 reviewed Jun 13, 2024

View reviewed changes

lib/gc/Transforms/DeepTileContractionNamedOp.cpp Outdated Show resolved Hide resolved

zhczhong force-pushed the zhicong/deep_tile_matmul branch from 22d86d4 to 8577250 Compare June 13, 2024 08:25

zhczhong force-pushed the zhicong/deep_tile_matmul branch from 8577250 to 206fead Compare June 13, 2024 08:40

yifeizh2 reviewed Jun 14, 2024

View reviewed changes

include/gc/IR/EasyBuildSCF.h Outdated Show resolved Hide resolved

yifeizh2 reviewed Jun 14, 2024

View reviewed changes

include/gc/Dialect/Arith/Utils/EasyBuild.h Outdated Show resolved Hide resolved

zhczhong force-pushed the zhicong/deep_tile_matmul branch from 206fead to 65dfab8 Compare June 14, 2024 03:10

zhczhong force-pushed the zhicong/deep_tile_matmul branch 4 times, most recently from d69856f to 823be69 Compare July 2, 2024 02:53

zhczhong force-pushed the zhicong/deep_tile_matmul branch from 823be69 to 02f519b Compare July 3, 2024 08:38

zhczhong linked an issue Jul 10, 2024 that may be closed by this pull request

nested matmul implementation #53

Closed

zhczhong force-pushed the zhicong/deep_tile_matmul branch 3 times, most recently from 304dcde to 9dce4b3 Compare August 7, 2024 03:17

zhczhong added 21 commits August 7, 2024 18:42

add deep tile pass for matmul and tests

373d10c

Enhance upstream utility and merge all parallel into one forall

8ec246b

add easy builder support

90c2b4b

Init C buffer with easy builder

c0c5749

support partial reduction

c4b777c

support bf16 cast fuse

b98100b

replace generic op with named op

66a986b

support 2Dx4D/5D case

56624bb

support fusing cast to the innermost loop

b950edb

enhance config

dc9a1a4

rebase to the latest llvm

9c9ff10

fix deepTileMatmul

162466e

tune config

b138713

add merge forall pass

8c7d155

polish code

df1c683

support dlti

af8aad6

fix comments

24198fb

format code

9f294ea

replace sysDesc with target info

3c5567f

deprecated tileToForallUsingTileSize

a205731

use expand/collapse_shape to do rank alter

ccd02f2

zhczhong force-pushed the zhicong/deep_tile_matmul branch from 9dce4b3 to ccd02f2 Compare August 8, 2024 03:05

ZhennanQin approved these changes Aug 8, 2024

View reviewed changes

Yun-Fly approved these changes Aug 8, 2024

View reviewed changes

ciyongch approved these changes Aug 9, 2024

View reviewed changes

ZhennanQin merged commit 8948c6b into main Aug 9, 2024
4 checks passed

zhczhong deleted the zhicong/deep_tile_matmul branch August 29, 2024 06:54

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Transform][Tiling] Add deep tile support for matmul #90

[Transform][Tiling] Add deep tile support for matmul #90

zhczhong commented May 20, 2024 •

edited

Loading

zhczhong commented Jun 5, 2024

zhczhong commented Jun 13, 2024

[Transform][Tiling] Add deep tile support for matmul #90

[Transform][Tiling] Add deep tile support for matmul #90

Conversation

zhczhong commented May 20, 2024 • edited Loading

zhczhong commented Jun 5, 2024

zhczhong commented Jun 13, 2024

zhczhong commented May 20, 2024 •

edited

Loading