#9059: adjust matmul parameters for rounding up in some scenarios (#9105)

* #9059: adjust matmul parameters for rounding up in some scenarios
* #9059: Adjust some matmul parameters to use div_up
tenstorrent · Jun 5, 2024 · 4276e5c · 4276e5c
1 parent 6bd0fbf
commit 4276e5c
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 7 deletions.
diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cmath>
 #include <optional>
+#include <numeric>
 
 #include "third_party/magic_enum/magic_enum.hpp"
 #include "tt_dnn/op_library/run_operation.hpp"
@@ -368,7 +369,7 @@ tt::operations::primary::MatmulProgramConfig get_matmul_program_config(
                 mcast_in0 = true;
                 per_core_M = M;
                 per_core_N = div_up(N, input_tensor_a.shard_spec().value().grid.num_cores());
-                in0_block_w = shard_shape[1] / TILE_WIDTH;
+                in0_block_w = std::gcd(shard_shape[1] / TILE_WIDTH, K);
             } else if (input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
                 mcast_in0 = false;
                 per_core_M = shard_shape[0] / TILE_HEIGHT;
@@ -413,14 +414,16 @@ tt::operations::primary::MatmulProgramConfig get_matmul_program_config(
             auto shard_shape = input_tensor_a.shard_spec().value().shape;
             uint32_t virtual_x = transpose_mcast ? grid_size.y : grid_size.x;
             uint32_t virtual_y = transpose_mcast ? grid_size.x : grid_size.y;
+            bool cores_along_x_match_grid_size = virtual_x == (K / (shard_shape[1] / TILE_WIDTH));
+            bool cores_along_y_match_grid_size = virtual_y == (M / (shard_shape[0] / TILE_HEIGHT));
             TT_FATAL(
-                virtual_y == (M / (shard_shape[0] / TILE_HEIGHT)), "Num cores along y must match provided grid size!");
+                cores_along_y_match_grid_size || virtual_y == div_up(M, (shard_shape[0] / TILE_HEIGHT)), "Num cores along y must match provided grid size!");
             TT_FATAL(
-                virtual_x == (K / (shard_shape[1] / TILE_WIDTH)), "Num cores along x must match provided grid size!");
+                cores_along_x_match_grid_size || virtual_x == div_up(K, (shard_shape[1] / TILE_WIDTH)), "Num cores along x must match provided grid size!");
 
-            uint32_t per_core_M = M / virtual_y;
-            uint32_t per_core_N = N / virtual_x;
-            uint32_t in0_block_w = shard_shape[1] / TILE_WIDTH;
+            uint32_t per_core_M = (M < virtual_y) ? 1 : M / virtual_y;
+            uint32_t per_core_N = (N < virtual_x) ? 1 : N / virtual_x;
+            uint32_t in0_block_w = cores_along_x_match_grid_size ? shard_shape[1] / TILE_WIDTH : 1;
 
             auto subblock_hw = get_matmul_subblock_params(
                 per_core_M, per_core_N, false, per_core_N_equals_subblock_w_constraint, fp32_dest_acc_en);

diff --git a/...bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp b/...bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp
@@ -67,7 +67,7 @@ void kernel_main() {
     in0_mcast_sender_semaphore_valid_addr_ptr[0] =
         VALID;  // Load const 1 to be used as semaphore valid value sent from sender to receivers
 
-    constexpr uint32_t num_remote_senders = num_blocks / num_blocks_per_shard;
+    constexpr uint32_t num_remote_senders = (num_blocks + num_blocks_per_shard - 1) / num_blocks_per_shard;
     uint64_t remote_sender_noc_addrs[num_remote_senders];
     if constexpr (transpose_mcast) {
         uint32_t x = 0, y = 0;