From 6bd0fbf922400a4a7d5d45c4ba5d2b2a22ec0d87 Mon Sep 17 00:00:00 2001 From: Milos Trajkovic Date: Wed, 5 Jun 2024 17:43:45 -0400 Subject: [PATCH 1/2] #8424: Add new llk-wormhole-b0 commit: remove assert for fp32 zeroacc --- tt_metal/third_party/tt_llk_wormhole_b0 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 index 2b92e02e8e7..5cc75fbf0df 160000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ b/tt_metal/third_party/tt_llk_wormhole_b0 @@ -1 +1 @@ -Subproject commit 2b92e02e8e723f82e9c1a87049c96b140b103c7a +Subproject commit 5cc75fbf0dfe5e5dd796220fee08fae9d6ca6b43 From 4276e5c19e5b0ded51b92e2463102afcd52715b6 Mon Sep 17 00:00:00 2001 From: Borys Bradel <164946524+bbradelTT@users.noreply.github.com> Date: Wed, 5 Jun 2024 19:47:44 -0400 Subject: [PATCH 2/2] #9059: adjust matmul parameters for rounding up in some scenarios (#9105) * #9059: adjust matmul parameters for rounding up in some scenarios * #9059: Adjust some matmul parameters to use div_up --- tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp | 15 +++++++++------ ..._in0_sender_receiver_padding_width_sharded.cpp | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp index 29cbae91947..be80425fa59 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp @@ -7,6 +7,7 @@ #include #include #include +#include <numeric> #include "third_party/magic_enum/magic_enum.hpp" #include "tt_dnn/op_library/run_operation.hpp" @@ -368,7 +369,7 @@ tt::operations::primary::MatmulProgramConfig get_matmul_program_config( mcast_in0 = true; per_core_M = M; per_core_N = div_up(N, input_tensor_a.shard_spec().value().grid.num_cores()); - in0_block_w = shard_shape[1] / TILE_WIDTH; + in0_block_w = std::gcd(shard_shape[1] / TILE_WIDTH, K); } else if (input_tensor_a.memory_config().memory_layout == 
TensorMemoryLayout::HEIGHT_SHARDED) { mcast_in0 = false; per_core_M = shard_shape[0] / TILE_HEIGHT; @@ -413,14 +414,16 @@ tt::operations::primary::MatmulProgramConfig get_matmul_program_config( auto shard_shape = input_tensor_a.shard_spec().value().shape; uint32_t virtual_x = transpose_mcast ? grid_size.y : grid_size.x; uint32_t virtual_y = transpose_mcast ? grid_size.x : grid_size.y; + bool cores_along_x_match_grid_size = virtual_x == (K / (shard_shape[1] / TILE_WIDTH)); + bool cores_along_y_match_grid_size = virtual_y == (M / (shard_shape[0] / TILE_HEIGHT)); TT_FATAL( - virtual_y == (M / (shard_shape[0] / TILE_HEIGHT)), "Num cores along y must match provided grid size!"); + cores_along_y_match_grid_size || virtual_y == div_up(M, (shard_shape[0] / TILE_HEIGHT)), "Num cores along y must match provided grid size!"); TT_FATAL( - virtual_x == (K / (shard_shape[1] / TILE_WIDTH)), "Num cores along x must match provided grid size!"); + cores_along_x_match_grid_size || virtual_x == div_up(K, (shard_shape[1] / TILE_WIDTH)), "Num cores along x must match provided grid size!"); - uint32_t per_core_M = M / virtual_y; - uint32_t per_core_N = N / virtual_x; - uint32_t in0_block_w = shard_shape[1] / TILE_WIDTH; + uint32_t per_core_M = (M < virtual_y) ? 1 : M / virtual_y; + uint32_t per_core_N = (N < virtual_x) ? 1 : N / virtual_x; + uint32_t in0_block_w = cores_along_x_match_grid_size ? 
shard_shape[1] / TILE_WIDTH : 1; auto subblock_hw = get_matmul_subblock_params( per_core_M, per_core_N, false, per_core_N_equals_subblock_w_constraint, fp32_dest_acc_en); diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp index ffec17d6344..881b79b3c80 100644 --- a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_width_sharded.cpp @@ -67,7 +67,7 @@ void kernel_main() { in0_mcast_sender_semaphore_valid_addr_ptr[0] = VALID; // Load const 1 to be used as semaphore valid value sent from sender to receivers - constexpr uint32_t num_remote_senders = num_blocks / num_blocks_per_shard; + constexpr uint32_t num_remote_senders = (num_blocks + num_blocks_per_shard - 1) / num_blocks_per_shard; uint64_t remote_sender_noc_addrs[num_remote_senders]; if constexpr (transpose_mcast) { uint32_t x = 0, y = 0;