Commit ff68aed

#0: squash later
tt-nshanker committed Jan 25, 2024
1 parent 33878e5 commit ff68aed
Showing 3 changed files with 205 additions and 126 deletions.
321 changes: 201 additions & 120 deletions tests/ttnn/unit_tests/test_conv.py
@@ -11,98 +11,7 @@
import ttnn


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(8, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(8, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(8, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(8, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(8, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(8, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(8, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# rn50 with batch size = 16
(16, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(16, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(16, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(16, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(16, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(16, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(16, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# rn50 with batch size = 20
(20, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(20, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(20, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(20, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(20, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(20, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99689. bfloat16 weights don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights don't fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesn't fit at all for any data type
# sd convs with HxW=64x64 with batch size = 1
# (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
(1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit.
(1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
# sd convs with HxW=64x64 with batch size=2
(2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None), # fits with bfloat8_b
(2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}), # bfloat16 doesn't fit
(2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_conv(
use_program_cache,
def run_conv(
device,
math_fidelity,
activations_dtype,
@@ -121,34 +30,6 @@ def test_conv(
use_1d_systolic_array,
config_override,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

if (
input_channels >= 320
and (not input_channels == 512)
and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
):
pytest.skip("Skipping tests with bfloat16 for sd convs")

torch.manual_seed(0)
conv_input_shape = [batch_size, input_channels, input_height, input_width]
conv_weight_shape = [output_channels, input_channels, filter_height, filter_width]
@@ -214,3 +95,203 @@ def test_conv(
else:
pcc = 0.998
assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
# rn50 layer1
(64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
# rn50 layer2
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
# rn50 layer3
(256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
(256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
# rn50 layer4
(512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
(512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
),
)
@pytest.mark.parametrize(
"batch_size",
[8, 16, 20],
ids=["batch_size_8", "batch_size_16", "batch_size_20"],
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_resnet50_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

if (
input_channels >= 320
and (not input_channels == 512)
and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
):
pytest.skip("Skipping tests with bfloat16 for sd convs")

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override=None,
)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99689. bfloat16 weights don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights don't fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesn't fit at all for any data type
# sd convs with HxW=64x64 with batch size = 1
# (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
(1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit.
(1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
# sd convs with HxW=64x64 with batch size=2
(2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None), # fits with bfloat8_b
(2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}), # bfloat16 doesn't fit
(2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat8_b],
ids=["weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat8_b],
ids=["activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_sd_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
):
if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
)
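The refactor above turns the old monolithic test_conv into a shared run_conv helper plus the thin parametrized wrappers test_resnet50_conv and test_sd_conv. A convenient side effect is that a single shape can be exercised outside the full sweeps, for example while bisecting a regression. The sketch below is illustrative only and not part of this commit: it assumes it is added to the same test_conv.py (so run_conv, ttnn, and the device fixture are already in scope), the test name is made up, and the shape is the rn50 layer1 case from the batch-size-8 sweep; the argument order mirrors the run_conv calls in the diff.

def test_single_rn50_layer1_conv(device):
    # Hypothetical one-off test (not in this commit): drive the shared run_conv
    # helper directly with one shape from the rn50 sweep above.
    run_conv(
        device,
        ttnn.MathFidelity.LoFi,  # math_fidelity
        ttnn.bfloat8_b,  # activations_dtype
        ttnn.bfloat8_b,  # weights_dtype
        8,  # batch_size
        64,  # output_channels
        64,  # input_channels
        56,  # input_height
        56,  # input_width
        3,  # filter_height
        3,  # filter_width
        1,  # stride_h
        1,  # stride_w
        1,  # pad_h
        1,  # pad_w
        True,  # use_1d_systolic_array
        config_override=None,
    )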
@@ -551,13 +551,14 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_(const Tens
 
     if (fully_buffer_weights) {
         num_weight_cb_tiles *= window_outer;
-    } else if (per_core_weight_matrix_width_ntiles < 8) {
+    } else if (per_core_weight_matrix_width_ntiles < 5 && per_core_out_matrix_height_ntiles < 22) {
         num_weight_cb_tiles = num_weight_cb_tiles * 2;
     }
-    if (conv_act_size_c / conv_act_c_blocks < 256) {
+    if (conv_act_size_c / conv_act_c_blocks < 160 && per_core_out_matrix_height_ntiles < 22) {
         num_act_cb_tiles = num_act_cb_tiles * 2; // double buffered
     }
+    cout << "here" << endl;
 
     uint32_t writer_output_block_num_tiles = out_block_h_ntiles * weight_block_w_ntiles;
 
     // TODO: Moving this function call to after kernel logic causes pcc fails
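The hunk above tightens when the sharded conv op double-buffers its weight and activation circular buffers: weights are now doubled only for per-core weight widths under 5 tiles (previously 8) and per-core output heights under 22 tiles, and activations only when the per-block channel depth (conv_act_size_c / conv_act_c_blocks) is under 160 (previously 256) with the same output-height bound. The restatement below is purely illustrative Python, not code from the repository; the helper name cb_tile_counts is made up, the variable names mirror the C++ diff, and the thresholds are copied verbatim from the changed lines.

def cb_tile_counts(
    num_weight_cb_tiles,
    num_act_cb_tiles,
    fully_buffer_weights,
    window_outer,
    per_core_weight_matrix_width_ntiles,
    per_core_out_matrix_height_ntiles,
    conv_act_size_c,
    conv_act_c_blocks,
):
    # Illustrative restatement of the C++ hunk above (names mirror the diff).
    if fully_buffer_weights:
        # Hold a copy of the weights for every outer window iteration.
        num_weight_cb_tiles *= window_outer
    elif per_core_weight_matrix_width_ntiles < 5 and per_core_out_matrix_height_ntiles < 22:
        # Double-buffer weights only for narrow per-core weight slices on short output blocks.
        num_weight_cb_tiles *= 2
    if conv_act_size_c // conv_act_c_blocks < 160 and per_core_out_matrix_height_ntiles < 22:
        # Double-buffer activations only when the per-block channel depth is small.
        num_act_cb_tiles *= 2
    return num_weight_cb_tiles, num_act_cb_tiles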
@@ -181,9 +181,6 @@ def determine_parallel_config(
         per_core_out_matrix_height_ntiles=per_core_out_matrix_height_ntiles,
         per_core_weight_matrix_width_ntiles=per_core_out_matrix_width_ntiles,
     )
-    print("grid_size=", grid_size)
-    print("per_core_out_matrix_height=", per_core_out_matrix_height_ntiles)
-    print("per_core_out_matrix_width=", per_core_out_matrix_width_ntiles)
     return conv_parallelization_config, num_cores_nhw

