Commit 3e92356

gs resnet50 fix

Pavle Josipovic committed Nov 13, 2024
1 parent b18ae2f commit 3e92356
Showing 1 changed file with 56 additions and 8 deletions.
@@ -8,12 +8,37 @@
     is_grayskull,
     is_wormhole_b0,
-    _nearest_y,
+    nearest_y,
     pad_and_fold_conv_activation_for_unity_stride,
 )
 from typing import List
 from loguru import logger
 from tests.ttnn.utils_for_testing import assert_with_pcc


+def get_core_grid_from_num_cores(num_cores: int, grid_rows: int, grid_cols: int):
+    columns = num_cores // grid_rows
+    assert columns <= grid_cols, "Not enough cores for specified core grid"
+    ranges = []
+    if columns != 0:
+        ranges.append(
+            ttnn.CoreRange(
+                ttnn.CoreCoord(0, 0),
+                ttnn.CoreCoord(grid_rows - 1, columns - 1),
+            )
+        )
+    remainder = num_cores % grid_rows
+    if remainder != 0:
+        assert columns + 1 <= grid_cols, "Not enough cores for specified core grid"
+        ranges.append(
+            ttnn.CoreRange(
+                ttnn.CoreCoord(0, columns),
+                ttnn.CoreCoord(remainder - 1, columns),
+            )
+        )
+    return ttnn.CoreRangeSet({*ranges})
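+# Illustrative example (hypothetical values): assuming Grayskull's 12x9 worker
+# grid, get_core_grid_from_num_cores(98, 12, 9) computes columns = 98 // 12 = 8,
+# yielding a full (0,0)-(11,7) block of 96 cores, plus remainder = 98 % 12 = 2,
+# a (0,8)-(1,8) range of 2 cores, for 98 cores in total.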


 hardcoded_matmul_config_linear = {
     8: ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig(
         compute_with_storage_grid_size=(8, 4),
@@ -632,15 +657,38 @@ def __init__(

         conv_dummy_tensor = torch.rand((self.fold_output_shape), dtype=torch.bfloat16)
         conv_dummy_tensor = ttnn.from_torch(conv_dummy_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
-        _, self.override_fold_mem_config, _, _ = ttnn.get_conv_padded_input_shape_and_mem_config(
-            device=device,
-            input_tensor=conv_dummy_tensor,
-            conv_config=self.conv1_config,
+        parallel_config = ttnn._ttnn.operations.conv.determine_parallel_config(
+            shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
             batch_size=self.batch_size,
-            height=self.conv1_output_height,
-            width=self.conv1_output_width,
-            in_channels=self.conv1_input_channels,
-            out_channels=self.conv1_output_channels,
+            input_channels=self.conv1_input_channels,
+            output_height=self.conv1_output_height,
+            output_width=self.conv1_output_width,
+            output_channels=self.conv1_output_channels,
+            compute_grid_size=device.compute_with_storage_grid_size(),
+            block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
+            is_conv2d_op=True,
+            is_out_tiled=True,
         )
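+        # parallel_config captures the core grid and sharding scheme chosen for
+        # conv1 (height-sharded across the device's compute grid here).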
+        # Override the compute grid size for Grayskull: the first conv would
+        # go to 108 cores by default, but that would add padding to the output
+        # tensor, and the reshard that follows the first conv currently fails
+        # on padded tensors.
+        if is_grayskull():
+            compute_grid = device.compute_with_storage_grid_size()
+            parallel_config.grid = get_core_grid_from_num_cores(98, compute_grid.x, compute_grid.y)
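+            # With the standard 224x224 input, conv1's flattened output has
+            # batch * 112 * 112 rows, i.e. 392 * batch tiles of height 32;
+            # since 392 = 4 * 98, sharding over 98 cores splits the rows
+            # evenly for any batch size, whereas 108 cores generally does not.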

+        self.override_fold_mem_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
+            tensor_shape=ttnn.Shape(
+                [
+                    1,
+                    1,
+                    self.conv1_input_width * self.conv1_input_height * self.batch_size,
+                    nearest_y(self.conv1_input_channels, self.conv1_config.input_channels_alignment),
+                ]
+            ),
+            parallel_config=parallel_config,
+            tile_size=32,
+        )
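+        # The shape above is the conv-flattened activation [1, 1, N*H*W, C]:
+        # batch and spatial dims are collapsed into a single row dimension,
+        # and the channel count is rounded up to the conv config's
+        # input_channels_alignment.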

     def __del__(self):
