#4836: Add support for blocking conv activation in 2d systolic conv v…

…ariant
tenstorrent · Jan 25, 2024 · 38659f6 · 38659f6
1 parent cfe78bb
commit 38659f6
Show file tree

Hide file tree

Showing 4 changed files with 287 additions and 158 deletions.
diff --git a/tests/ttnn/unit_tests/operations/test_conv.py b/tests/ttnn/unit_tests/operations/test_conv.py
@@ -11,66 +11,7 @@
 import ttnn
 
 
-@skip_for_wormhole_b0()
-@pytest.mark.parametrize(
-    "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
-    (
-        # unique convs in rn50 (complete list)
-        # first conv post folding and input_channels padding to tile width
-        (8, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
-        # rn50 layer1
-        (8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
-        # rn50 layer2
-        (8, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
-        (20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
-        (8, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
-        # rn50 layer3
-        (8, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
-        (8, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
-        # rn50 layer4
-        (8, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
-        (8, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
-        # sd convs with HxW=32x32
-        # (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False),
-        # (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False),
-        # (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False),
-        # (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False),
-        # (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False), # bfloat16 activations doesnt fit
-        # (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
-        # (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False), #fails to parallelize with sharding
-        # (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False), #fails to parallelize with sharding
-        # (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False), # slightly low pcc with 0.99698. bfloat16 weights doesnt fit
-        # (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
-        # sd conv with HxW=512x512
-        # (1, 320, 320, 512, 512, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
-        # sd conv with HxW=256x256
-        # (1, 320, 320, 256, 256, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
-        # sd convs with HxW=64x64
-        # (1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False), # bfloat16 weights or activations doesnt fit
-        (1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False),
-        # (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all datatypes
-        # (1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False), # bfloat16 weights or activations doesnt fit
-        # (1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False), # bfloat16 activations doesnt fit
-        # (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
-        # (1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False), #slightly low pcc 0.99697. bfloat16 doesnt fit.
-        # (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
-        # (1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False), # not tested yet
-        # (1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False), # not tested yet
-    ),
-)
-@pytest.mark.parametrize(
-    "weights_dtype",
-    [ttnn.bfloat16, ttnn.bfloat8_b],
-    ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
-)
-@pytest.mark.parametrize(
-    "activations_dtype",
-    [ttnn.bfloat16, ttnn.bfloat8_b],
-    ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
-)
-@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
-def test_conv(
-    use_program_cache,
+def run_conv(
     device,
     math_fidelity,
     activations_dtype,
@@ -87,28 +28,8 @@ def test_conv(
     pad_h,
     pad_w,
     use_1d_systolic_array,
+    config_override,
 ):
-    if input_channels == 16:
-        pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")
-
-    if math_fidelity != ttnn.MathFidelity.LoFi:
-        pytest.skip(
-            "By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
-        )
-
-    if (
-        activations_dtype == ttnn.bfloat16
-        and batch_size == 20
-        and (
-            output_channels == 64
-            or (
-                stride_h == 2
-                and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
-            )
-        )
-    ):
-        pytest.skip("Skipping test because it won't fit in L1!")
-
     torch.manual_seed(0)
     conv_input_shape = [batch_size, input_channels, input_height, input_width]
     conv_weight_shape = [output_channels, input_channels, filter_height, filter_width]
@@ -150,6 +71,7 @@ def test_conv(
         bias=tt_bias_tensor,
         math_fidelity=math_fidelity,
         weights_dtype=weights_dtype,
+        conv_blocking_and_parallelization_config_override=config_override,
     )
 
     assert "conv" in reader_patterns_cache and "halo" in reader_patterns_cache
@@ -169,7 +91,207 @@ def test_conv(
     torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))
 
     if math_fidelity == ttnn.MathFidelity.LoFi and activations_dtype == ttnn.bfloat8_b:
-        pcc = 0.998
+        pcc = 0.9969
     else:
         pcc = 0.998
     assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
+
+
+@skip_for_wormhole_b0()
+@pytest.mark.parametrize(
+    "output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
+    (
+        # unique convs in rn50 (complete list)
+        # first conv post folding and input_channels padding to tile width
+        (64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
+        # rn50 layer1
+        (64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
+        # rn50 layer2
+        (128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
+        (128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
+        (128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
+        # rn50 layer3
+        (256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
+        (256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
+        # rn50 layer4
+        (512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
+        (512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
+    ),
+)
+@pytest.mark.parametrize(
+    "batch_size",
+    [8, 16, 20],
+    ids=["batch_size_8", "batch_size_16", "batch_size_20"],
+)
+@pytest.mark.parametrize(
+    "weights_dtype",
+    [ttnn.bfloat16, ttnn.bfloat8_b],
+    ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
+)
+@pytest.mark.parametrize(
+    "activations_dtype",
+    [ttnn.bfloat16, ttnn.bfloat8_b],
+    ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
+)
+@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
+def test_resnet50_conv(
+    use_program_cache,
+    device,
+    math_fidelity,
+    activations_dtype,
+    weights_dtype,
+    batch_size,
+    output_channels,
+    input_channels,
+    input_height,
+    input_width,
+    filter_height,
+    filter_width,
+    stride_h,
+    stride_w,
+    pad_h,
+    pad_w,
+    use_1d_systolic_array,
+):
+    if input_channels == 16:
+        pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")
+
+    if math_fidelity != ttnn.MathFidelity.LoFi:
+        pytest.skip(
+            "By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
+        )
+
+    if (
+        activations_dtype == ttnn.bfloat16
+        and batch_size == 20
+        and (
+            output_channels == 64
+            or (
+                stride_h == 2
+                and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
+            )
+        )
+    ):
+        pytest.skip("Skipping test because it won't fit in L1!")
+
+    if (
+        input_channels >= 320
+        and (not input_channels == 512)
+        and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
+    ):
+        pytest.skip("Skipping tests with bfloat16 for sd convs")
+
+    run_conv(
+        device,
+        math_fidelity,
+        activations_dtype,
+        weights_dtype,
+        batch_size,
+        output_channels,
+        input_channels,
+        input_height,
+        input_width,
+        filter_height,
+        filter_width,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        use_1d_systolic_array,
+        config_override=None,
+    )
+
+
+@skip_for_wormhole_b0()
+@pytest.mark.parametrize(
+    "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
+    (
+        # sd convs with HxW=32x32
+        # (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
+        # (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
+        # (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        # (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
+        # (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations doesnt fit
+        # (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
+        # (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), #fails to parallelize with sharding
+        # (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), #fails to parallelize with sharding
+        # (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights doesnt fit
+        # (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesnt fit at all.. for all data types
+        # sd convs with HxW=64x64 with batch size = 1
+        # (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
+        (1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),  # bfloat16 doesnt fit
+        (1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
+        (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),  #
+        (1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None),  # bfloat16 doesnt fit
+        (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None),  # bfloat16 weights doesnt fit
+        (1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None),  # bfloat16 doesnt fit.
+        (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None),  # bfloat16 weights doesnt fit
+        (1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
+        # sd convs with HxW=64x64 with batch size=2
+        (2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
+        (2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),  # fits with bfloat8_b
+        (2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
+        (2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None),  # bfloat16 doesnt fit
+        (2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None),  # bfloat16 doesnt fit
+        (2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}),  # bfloat16 doesnt fit
+        (2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
+        (2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),  # bfloat16 doesnt fit
+        (2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
+    ),
+)
+@pytest.mark.parametrize(
+    "weights_dtype",
+    [ttnn.bfloat8_b],
+    ids=["weights_BFLOAT8_B"],
+)
+@pytest.mark.parametrize(
+    "activations_dtype",
+    [ttnn.bfloat8_b],
+    ids=["activations_BFLOAT8_B"],
+)
+@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
+def test_sd_conv(
+    use_program_cache,
+    device,
+    math_fidelity,
+    activations_dtype,
+    weights_dtype,
+    batch_size,
+    output_channels,
+    input_channels,
+    input_height,
+    input_width,
+    filter_height,
+    filter_width,
+    stride_h,
+    stride_w,
+    pad_h,
+    pad_w,
+    use_1d_systolic_array,
+    config_override,
+):
+    if math_fidelity != ttnn.MathFidelity.LoFi:
+        pytest.skip(
+            "By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
+        )
+
+    run_conv(
+        device,
+        math_fidelity,
+        activations_dtype,
+        weights_dtype,
+        batch_size,
+        output_channels,
+        input_channels,
+        input_height,
+        input_width,
+        filter_height,
+        filter_width,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        use_1d_systolic_array,
+        config_override,
+    )
diff --git a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp
@@ -173,22 +173,23 @@ void MAIN {
     #ifdef SFPU_OP_INIT_ACTIVATION
     SFPU_OP_INIT_ACTIVATION
     #endif
+    // in1 num blocks w is the outer loop. Output blocks are computed in col major order.
+    for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) {
 
-    #ifdef PRE_TILIZE
-    unpack_reconfig_data_format_srca(in1_cb_id, in0_pretilize_cb_id);
+        for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) {
 
-    col_major_to_row_major_init();
-    tilize_in(in0_pretilize_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id);
-    row_major_to_col_major_init();
+            #ifdef PRE_TILIZE
+            unpack_reconfig_data_format_srca(in1_cb_id, in0_pretilize_cb_id);
 
-    // TODO: unpack_reconfig_data_format_srca(in0_pretilize_cb_id, in1_cb_id) doesn't work if in0 is BFLOATB_B and in1 is BFLOAT16
-    mm_block_init_short();
-    unpack_reconfig_data_format_srca(in1_cb_id);
-    #endif
+            col_major_to_row_major_init();
+            tilize_in(in0_pretilize_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id);
+            row_major_to_col_major_init();
+
+            // TODO: unpack_reconfig_data_format_srca(in0_pretilize_cb_id, in1_cb_id) doesn't work if in0 is BFLOATB_B and in1 is BFLOAT16
+            mm_block_init_short();
+            unpack_reconfig_data_format_srca(in1_cb_id);
+            #endif
 
-    // in1 num blocks w is the outer loop. Output blocks are computed in col major order.
-    for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) {
-        for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) {
             bool enable_reload = false;
 
             #ifdef PACK_RELU
@@ -296,6 +297,7 @@ void MAIN {
                         PACK( cb_interface[matmul_partials_cb].fifo_wr_ptr = partials_cb_write_ptr );
                     }
                 }
+
                 cb_pop_front(mm_in0_cb_id, in0_block_num_tiles);
                 cb_pop_front(in1_cb_id, in1_block_num_tiles);
             } // for in0_num_blocks_w
@@ -380,6 +382,7 @@ void MAIN {
                 }
             }
             #endif
+
         } // for in0_num_blocks_h
         #ifdef FUSE_BIAS
             bias_block_offset += in1_block_w;