From 00f0bfd28087e8ae513bebf0ce130fb33c7d627a Mon Sep 17 00:00:00 2001
From: yugaoT
Date: Wed, 29 May 2024 17:45:03 +0000
Subject: [PATCH] #0: cleanup mm code, fix trid start from 1, rm in1 padding

---
 .../misc/test_matmul_dram_sharded.py          | 35 ++--------
 .../operations/test_experimental.py           | 34 +++-------
 tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp     | 66 ++-----------------
 tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp     | 14 +---
 ...m_large_block_zm_fused_bias_activation.cpp |  2 -
 ..._tile_layout_in0_receiver_dram_sharded.cpp |  1 -
 ...mm_tile_layout_in0_sender_dram_sharded.cpp |  1 -
 ...mm_tile_layout_in1_sender_dram_sharded.cpp | 15 ++---
 ...ulti_core_reuse_dram_sharded_optimized.cpp | 21 +++---
 .../tt_lib/csrc/operations/primary/module.hpp | 44 +++++++------
 tt_metal/hw/inc/dataflow_api.h                |  1 -
 tt_metal/hw/inc/debug/dprint_tile.h           | 32 ---------
 tt_metal/include/compute_kernel_api/matmul.h  | 13 ----
 13 files changed, 62 insertions(+), 217 deletions(-)

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
index 2f29845e008..ac1d76f67fa 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_dram_sharded.py
@@ -112,10 +112,8 @@ def test_matmul_in1_dram_sharded(

     in0_shape = [1, 1, M, K]
     in1_shape = [1, 1, K, N]
-    in1_shape_padded = [1, 1, K, N_padded]
     in1_shard_shape = [K, N_padded // num_banks]
-    bias_shape = [1, 1, 1, N]
-    bias_shape_padded = [1, 1, 32, N_padded]
+    bias_shape = [1, 1, 32, N]
     bias_shard_shape = [32, N_padded // num_banks]

     num_cores = grid_size[0] * grid_size[1]
@@ -152,19 +150,11 @@ def test_matmul_in1_dram_sharded(
     logger.debug("in1_shard_grid " + str(in1_shard_grid))

     in0 = torch.randn(in0_shape).bfloat16().float()
-    # step = K // num_cores
-    # in0 = torch.ones(in0_shape).bfloat16().float()
-    # for i in range(num_cores):  # since 32768 / 16 = 2048
-    #     in0[:, :, :, i * step : (i + 1) * step] = i + 1
     in1 = torch.randn(in1_shape).bfloat16().float()
-    # in1 = torch.ones(in1_shape).bfloat16().float()
     bias = torch.randn(bias_shape).bfloat16().float()
-    # bias = torch.ones(bias_shape).bfloat16().float() * 10

     in0_t = torch2tt_tensor(in0, device, tt_memory_config=interleaved_mem_config, tt_dtype=in0_dtype)
-    in1_t = ttl.tensor.Tensor(in1.flatten().tolist(), in1_shape, in1_dtype, ttl.tensor.Layout.ROW_MAJOR)
-    in1_t = in1_t.pad(in1_shape_padded, (0, 0, 0, 0), 0).to(ttl.tensor.Layout.TILE)
-    in1_t = in1_t.to(device, in1_mem_config)
+    in1_t = torch2tt_tensor(in1, device, tt_memory_config=in1_mem_config, tt_dtype=in1_dtype)

     if has_bias:
         bias_shard_grid = ttl.tensor.CoreCoord(device.dram_grid_size().x - 1, device.dram_grid_size().y - 1)
@@ -175,12 +165,7 @@ def test_matmul_in1_dram_sharded(
         bias_mem_config = ttl.tensor.MemoryConfig(
             ttl.tensor.TensorMemoryLayout.WIDTH_SHARDED, ttl.tensor.BufferType.DRAM, bias_shard_spec
         )
-
-        bias_t = ttl.tensor.Tensor(
-            bias.flatten().tolist(), bias_shape, ttl.tensor.DataType.BFLOAT16, ttl.tensor.Layout.ROW_MAJOR
-        )
-        bias_t = bias_t.pad(bias_shape_padded, (0, 0, 0, 0), 0).to(ttl.tensor.Layout.TILE)
-        bias_t = bias_t.to(device, bias_mem_config)
+        bias_t = torch2tt_tensor(bias, device, tt_memory_config=bias_mem_config, tt_dtype=ttl.tensor.DataType.BFLOAT16)

     in0_t = ttl.tensor.interleaved_to_sharded(
         in0_t,
@@ -198,9 +183,6 @@ def test_matmul_in1_dram_sharded(
         per_core_N=out_block_w,
         fuse_batch=True,
         fused_activation=None,
-        skip_compute=False,
-        skip_in0_mcast=False,
-        skip_write_back=False,
     )

     if is_grayskull():
@@ -217,7 +199,7 @@ def test_matmul_in1_dram_sharded(
         )

     if has_bias:
-        output_t = ttl.operations.primary.matmul_dram_sharded(
+        output_t = ttl.operations.primary.matmul(
             in0_t,
             in1_t,
             bias=bias_t,
@@ -227,7 +209,7 @@ def test_matmul_in1_dram_sharded(
             compute_kernel_config=compute_kernel_config,
         )
     else:
-        output_t = ttl.operations.primary.matmul_dram_sharded(
+        output_t = ttl.operations.primary.matmul(
             in0_t,
             in1_t,
             program_config=program_config,
@@ -243,11 +225,6 @@ def test_matmul_in1_dram_sharded(

     tt_out = tt2torch_tensor(output_t)

-    print(pt_out)
-    print(tt_out)
-
-    pt_out_unpad = pt_out[:, :, :, 0:N]
-    tt_out_unpad = tt_out[:, :, :, 0:N]
-    passing, output = comp_pcc(pt_out_unpad, tt_out_unpad)
+    passing, output = comp_pcc(pt_out, tt_out)
     logger.info(output)
     assert passing
diff --git a/tests/ttnn/unit_tests/operations/test_experimental.py b/tests/ttnn/unit_tests/operations/test_experimental.py
index b5453421e78..75637c94a8a 100644
--- a/tests/ttnn/unit_tests/operations/test_experimental.py
+++ b/tests/ttnn/unit_tests/operations/test_experimental.py
@@ -172,8 +172,7 @@ def test_ttnn_experimental_operations_primary_matmul_1d(
 @pytest.mark.parametrize("m_size", [32])
 @pytest.mark.parametrize("k_size", [8192])
 @pytest.mark.parametrize("n_size", [1024])
-@pytest.mark.parametrize("n_padded_size", [1152])
-def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_size, k_size, n_size, n_padded_size):
+def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_size, k_size, n_size):
     torch.manual_seed(0)

     grid_size = ttnn.CoreGrid(y=1, x=8)
@@ -200,21 +199,6 @@ def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_siz
     )
     input_tensor_in0 = ttnn.to_memory_config(input_tensor_in0, sharded_mem_config)

-    # in1 ttnn tensor, for now need to bring tensor to device to pad first, then bring back, and send sharded tensor to dram!
-    input_tensor_in1 = ttnn.from_torch(torch_input_tensor_in1, layout=ttnn.TILE_LAYOUT)
-    input_tensor_in1 = ttnn.to_device(input_tensor_in1, device)
-    input_tensor_in1 = ttnn.pad(input_tensor_in1, ((0, 0), (0, 0), (0, 0), (0, n_padded_size - n_size)), 0)
-    input_tensor_in1 = ttnn.from_device(input_tensor_in1)
-    input_tensor_in1 = ttnn.to_torch(input_tensor_in1)
-
-    # in1 host padding cause seg faults!
-    # input_tensor_in1 = ttnn.Tensor(torch_input_tensor_in1.flatten().tolist(), in1_shape, ttnn.bfloat8_b, ttnn.ROW_MAJOR_LAYOUT)
-    # input_tensor_in1 = input_tensor_in1.pad([1, 1, k_size, n_padded_size], (0, 0, 0, 0), 0)
-    # in1_shape = (1, 1, m_size, k_size)
-    # in1 = torch.randn(in1_shape).bfloat16().float()
-    # in1_t = ttnn.experimental.tensor.Tensor(in1.flatten().tolist(), in1_shape, ttnn.experimental.tensor.DataType.BFLOAT16, ttnn.experimental.tensor.Layout.ROW_MAJOR)
-    # in1_t = in1_t.pad([1, 1, k_size, n_padded_size], (0, 0, 0, 0), 0).to(ttnn.experimental.tensor.Layout.TILE)
-
     # in1 shard config
     in1_shard_grid = ttnn.CoreCoord(device.dram_grid_size().x - 1, device.dram_grid_size().y - 1)
     in1_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), in1_shard_grid)})
@@ -224,7 +208,11 @@ def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_siz
         ttnn.types.TensorMemoryLayout.WIDTH_SHARDED, ttnn.types.BufferType.DRAM, in1_shard_spec
     )
     input_tensor_in1 = ttnn.from_torch(
-        input_tensor_in1, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat8_b, memory_config=in1_mem_config
+        torch_input_tensor_in1,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        dtype=ttnn.bfloat8_b,
+        memory_config=in1_mem_config,
     )

     program_config = ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig(
@@ -235,9 +223,6 @@ def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_siz
         per_core_N=4,
         fuse_batch=True,
         fused_activation=None,
-        skip_compute=False,
-        skip_in0_mcast=False,
-        skip_write_back=False,
     )

     compute_kernel_config = ttnn.WormholeComputeKernelConfig(
@@ -247,19 +232,18 @@ def test_ttnn_experimental_operations_primary_matmul_dram_sharded(device, m_siz
         packer_l1_acc=True,
     )

-    output_tensor = ttnn.experimental.operations.primary.matmul_dram_sharded(
+    output_tensor = ttnn.matmul(
         input_tensor_in0,
         input_tensor_in1,
         program_config=program_config,
-        output_mem_config=sharded_mem_config,
-        output_dtype=ttnn.bfloat16,
+        memory_config=sharded_mem_config,
+        dtype=ttnn.bfloat16,
         compute_kernel_config=compute_kernel_config,
     )

     output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
     output_tensor = ttnn.from_device(output_tensor)
     output_tensor = ttnn.to_torch(output_tensor)
-    output_tensor = output_tensor[:, :, :, 0:n_size]

     assert_with_pcc(torch_output_tensor, output_tensor, pcc=0.9999)

diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
index 29371de7302..010b0f8dd53 100644
--- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -1039,7 +1039,6 @@ void Matmul::validate(
                 TT_FATAL(per_core_M == (shard_shape[0] / TILE_HEIGHT));
                 TT_FATAL(K % program_config.in0_block_w == 0);
                 TT_FATAL((shard_shape[1] / TILE_WIDTH) % program_config.in0_block_w == 0);
-                // TT_FATAL(div_up(N, per_core_N) == input_tensor_a.shard_spec().value().grid.num_cores());

                 // subbblock constraint
                 TT_FATAL(program_config.out_subblock_w == per_core_N || program_config.out_subblock_h == 1);
@@ -1195,22 +1194,6 @@ std::vector Matmul::compute_output_shapes(const std::vector& inpu
     return {Shape(output_shape, padding)};
 }

-std::vector Matmul::compute_output_shapes_dram_sharded(
-    const std::vector& input_tensors, uint32_t N_unpadded) const {
-    const auto input_shape_a = input_tensors.at(0).get_legacy_shape();
-    const auto input_shape_b = input_tensors.at(1).get_legacy_shape();
-
-    auto output_shape = input_shape_a;
-    output_shape[-1] = N_unpadded;
-    auto dimensions_pads = std::vector();
-    for (auto index = 0; index < input_shape_a.rank() - 1; index++) {
-        dimensions_pads.push_back(input_shape_a.padding()[index]);
-    }
-    dimensions_pads.push_back(input_shape_b.padding()[input_shape_b.rank() - 1]);
-    const auto padding = Padding(dimensions_pads, Padding::PadValue::Any);
-    return {Shape(output_shape, padding)};
-};
-
 std::vector Matmul::create_output_tensors(const std::vector& input_tensors) const {
     const auto& input_tensor_a = input_tensors.at(0);
     const auto& input_tensor_b = input_tensors.at(1);
@@ -1250,12 +1233,9 @@ std::vector Matmul::create_output_tensors(const std::vector& inp
                     MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig>) {
                 uint32_t M = (program_config.fuse_batch
                                   ? input_tensor_a.volume() / input_tensor_a.get_legacy_shape()[-1]
-                                  : input_tensor_a.get_legacy_shape()[-2]) /
-                             TILE_HEIGHT;
+                                  : input_tensor_a.get_legacy_shape()[-2]) / TILE_HEIGHT;
                 uint32_t N = input_tensor_b.get_legacy_shape()[-1] / TILE_WIDTH;
                 auto input_tensor_b_shape = input_tensor_b.get_legacy_shape();
-                uint32_t N_unpaddded = input_tensor_b.get_legacy_shape()[-1] -
-                                       input_tensor_b_shape.padding()[input_tensor_b_shape.rank() - 1].back;

                 uint32_t per_core_M = program_config.per_core_M;
                 uint32_t per_core_N = program_config.per_core_N;
@@ -1266,7 +1246,7 @@ std::vector Matmul::create_output_tensors(const std::vector& inp
                 auto mem_config = this->output_mem_config;
                 mem_config.shard_spec = shard_spec;
                 return {create_device_tensor(
-                    this->compute_output_shapes_dram_sharded(input_tensors, N_unpaddded).at(0),
+                    this->compute_output_shapes(input_tensors).at(0),
                     this->output_dtype,
                     output_layout,
                     input_tensor_a.device(),
@@ -1454,9 +1434,9 @@ operation::ProgramWithCallbacks Matmul::create_program(
                 program_config.fuse_batch,
                 program_config.fused_activation,
                 this->untilize_out,
-                program_config.skip_compute,
-                program_config.skip_in0_mcast,
-                program_config.skip_write_back);
+                false,
+                false,
+                false);
         } else if constexpr (std::is_same_v) {
             TT_FATAL(!bias.has_value(), "Bias is not supported for matmul multi core non-optimized reuse");
             return matmul_multi_core_reuse(input_tensor_a, input_tensor_b, output_tensor, broadcast_batch);
@@ -1517,42 +1497,6 @@ Tensor matmul_1d(
     return output_tensors.at(0);
 }

-Tensor matmul_dram_sharded(
-    const Tensor& input_tensor_a,
-    const Tensor& input_tensor_b,
-    std::optional bias,
-    std::optional program_config,
-    const MemoryConfig& mem_config,
-    std::optional output_dtype,
-    std::optional compute_kernel_config,
-    bool untilize_out) {
-    std::vector output_tensors = {
-        Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}, {bias}))};
-    operation::launch_op(
-        [program_config, mem_config, output_dtype, compute_kernel_config, untilize_out](
-            const std::vector& input_tensors,
-            const std::vector>& optional_input_tensors,
-            const std::vector>& optional_output_tensors) mutable -> std::vector {
-            const auto& input_tensor_a = input_tensors.at(0);
-            const auto& input_tensor_b = input_tensors.at(1);
-
-            auto kernel_config_val =
-                init_device_compute_kernel_config(input_tensor_a.device()->arch(), compute_kernel_config);
-            return {operations::primary::matmul(
-                input_tensor_a,
-                input_tensor_b,
-                optional_input_tensors.at(0),
-                program_config.value(),
-                mem_config,
-                output_dtype,
-                kernel_config_val,
-                untilize_out)};
-        },
-        {input_tensor_a, input_tensor_b},
-        output_tensors,
-        {bias});
-    return output_tensors.at(0);
-}

 operation::OpPerformanceModel Matmul::create_op_performance_model(
     const std::vector& input_tensors,
diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
index f1a09bd4326..a7f59fb5693 100644
--- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
+++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
@@ -224,9 +224,6 @@ struct MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig {
     std::size_t per_core_N;
     bool fuse_batch;
     std::optional fused_activation;
-    bool skip_compute;
-    bool skip_in0_mcast;
-    bool skip_write_back;

     static constexpr auto attribute_names = std::make_tuple(
         "in0_block_w",
@@ -235,10 +232,7 @@ struct MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig {
         "per_core_M",
         "per_core_N",
         "fuse_batch",
-        "fused_activation",
-        "skip_compute",
-        "skip_in0_mcast",
-        "skip_write_back");
+        "fused_activation");
     const auto attribute_values() const {
         return std::make_tuple(
             std::cref(this->in0_block_w),
@@ -247,10 +241,7 @@ struct MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig {
             std::cref(this->per_core_M),
             std::cref(this->per_core_N),
             std::cref(this->fuse_batch),
-            std::cref(this->fused_activation),
-            std::cref(this->skip_compute),
-            std::cref(this->skip_in0_mcast),
-            std::cref(this->skip_write_back));
+            std::cref(this->fused_activation));
     }
 };

@@ -401,7 +392,6 @@ inline Tensor matmul(
 }

 Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, std::optional program_config = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt, std::optional compute_kernel_config = std::nullopt, bool untilize_out = false);
-Tensor matmul_dram_sharded(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, std::optional program_config = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt, std::optional compute_kernel_config = std::nullopt, bool untilize_out = false);

 MatmulProgramConfig generate_matmul_program_config(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const MemoryConfig &mem_config, const std::optional compute_kernel_config, const std::optional user_core_coord, const std::optional user_fused_activation, const std::optional user_run_batched);

diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
index a3c8e56fa75..ca37e340a70 100644
--- a/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
@@ -15,8 +15,6 @@

 #include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h"

-// #include "debug/dprint.h"
-
 namespace NAMESPACE {

diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_receiver_dram_sharded.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_receiver_dram_sharded.cpp
index 8d507e9c243..49ae846e85b 100644
--- a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_receiver_dram_sharded.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_receiver_dram_sharded.cpp
@@ -6,7 +6,6 @@

 #include "dataflow_api.h"
 #include "hostdevcommon/common_values.hpp"
-// #include "debug/dprint.h"

 void kernel_main() {
     // COMPILE TIME ARGS
diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_dram_sharded.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_dram_sharded.cpp
index 9821fdcfdfd..109def5e9bc 100644
--- a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_dram_sharded.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_dram_sharded.cpp
@@ -6,7 +6,6 @@

 #include "dataflow_api.h"
 #include "hostdevcommon/common_values.hpp"
-// #include "debug/dprint.h"

 void kernel_main() {
     // COMPILE TIME ARGS
diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
index 11133c0a5ea..5bde1c06534 100644
--- a/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
@@ -6,8 +6,6 @@

 #include "dataflow_api.h"
 #include "hostdevcommon/common_values.hpp"
-// #include "debug/dprint.h"
-

 void kernel_main() {
     // RUNTIME ARGS
@@ -76,9 +74,10 @@ void kernel_main() {
     }
 #else
     constexpr uint32_t total_num_blocks_in_buffer = 3;
+    constexpr uint32_t total_num_trid = 4;
     uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
-    uint32_t curr_block_trid = 0;
-    uint32_t block_trid_to_wait = 0;
+    uint32_t curr_block_trid = 1;
+    uint32_t block_trid_to_wait = 1;

     cb_reserve_back(cb_id_in1, in1_block_num_tiles);
     uint32_t l1_write_addr_in1_offset = 0;
@@ -88,7 +87,7 @@ void kernel_main() {
         noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);

         for(uint32_t h = 0; h < in1_num_pages; ++h) {
-            noc_async_read_tile_dram_sharded_with_state_with_trid(in1_base_addr, l1_read_addr_in1, l1_write_addr_in1);
+            noc_async_read_tile_dram_sharded_with_state_with_trid(in1_base_addr, l1_read_addr_in1, l1_write_addr_in1, curr_block_trid);
             l1_read_addr_in1 += in1_page_size;
             l1_write_addr_in1 += in1_page_size;
         }
@@ -97,16 +96,16 @@ void kernel_main() {
             noc_async_read_barrier_with_trid(block_trid_to_wait);
             cb_push_back(cb_id_in1, in1_block_num_tiles);
             // wait for next block trid
-            block_trid_to_wait = (block_trid_to_wait + 1) % total_num_blocks_in_buffer;
+            block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
             // reserve for next block
             cb_reserve_back(cb_id_in1, in1_block_num_tiles * 2);
         } else {
             num_free_blocks_in_buffer -= 1;
         }

-        if (curr_block_trid == total_num_blocks_in_buffer - 1) {
+        if (curr_block_trid == total_num_blocks_in_buffer) {
             l1_write_addr_in1_offset = 0;
-            curr_block_trid = 0;
+            curr_block_trid = 1;
         } else {
             l1_write_addr_in1_offset += in1_block_size_bytes;
             curr_block_trid += 1;
diff --git a/tt_eager/tt_dnn/op_library/bmm/multi_core_reuse_mcast_dram_sharded_optimized/bmm_op_multi_core_reuse_dram_sharded_optimized.cpp b/tt_eager/tt_dnn/op_library/bmm/multi_core_reuse_mcast_dram_sharded_optimized/bmm_op_multi_core_reuse_dram_sharded_optimized.cpp
index 2ff9646fbcd..9c9f6766337 100644
--- a/tt_eager/tt_dnn/op_library/bmm/multi_core_reuse_mcast_dram_sharded_optimized/bmm_op_multi_core_reuse_dram_sharded_optimized.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/multi_core_reuse_mcast_dram_sharded_optimized/bmm_op_multi_core_reuse_dram_sharded_optimized.cpp
@@ -130,13 +130,13 @@ void get_dram_reader_core_coords_wormhole_b0(
     // get dram banks and coords
     uint32_t num_banks = device->num_dram_channels();
     uint32_t max_bank_id = num_banks - 1;
-    std::vector dram_coord_phy;
+    std::vector dram_coord_phy; dram_coord_phy.reserve(num_banks);
     for (int i = 0; i < num_banks; ++i) {
         dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
     }

     // get worker logical coords
-    std::vector all_worker_cores_logical;
+    std::vector all_worker_cores_logical; all_worker_cores_logical.reserve(num_cores_x * num_cores_y);
     for (int i = 0; i < num_cores_x; ++i) {
         for (int j = 0; j < num_cores_y; ++j) {
             all_worker_cores_logical.push_back(CoreCoord(i, j));
@@ -144,7 +144,7 @@ void get_dram_reader_core_coords_wormhole_b0(
     }

     // get y coords of the workers
-    std::vector all_worker_cores_y_physical;
+    std::vector all_worker_cores_y_physical; all_worker_cores_y_physical.reserve(num_cores_y);
     uint32_t max_worker_y_physical = 0;
     uint32_t min_worker_y_physical = 10000;
     for (int i = 0; i < num_cores_y; ++i) {
@@ -170,7 +170,7 @@ void get_dram_reader_core_coords_wormhole_b0(
     }

     // get the ajacent cores of DRAM banks
-    std::vector adj_core_physical;
+    std::vector adj_core_physical; adj_core_physical.reserve(num_banks);
     for (int i = 0; i < num_banks; ++i) {
         auto dram_core = dram_coord_phy[i];
         uint32_t adj_core_x = dram_core.x + 1;
@@ -179,10 +179,10 @@ void get_dram_reader_core_coords_wormhole_b0(
     }

     // split the adjacent coords into two groups, because DRAM banks has two cols
-    std::vector adj_core_physical_g1;
-    std::vector adj_core_physical_y_g1;
-    std::vector adj_core_physical_g2;
-    std::vector adj_core_physical_y_g2;
+    std::vector adj_core_physical_g1; adj_core_physical_g1.reserve(num_banks);
+    std::vector adj_core_physical_y_g1; adj_core_physical_y_g1.reserve(num_banks);
+    std::vector adj_core_physical_g2; adj_core_physical_g2.reserve(num_banks);
+    std::vector adj_core_physical_y_g2; adj_core_physical_y_g2.reserve(num_banks);
     for (auto core : adj_core_physical) {
         if (core.x == adj_core_physical.front().x) {
             adj_core_physical_g1.push_back(core);
@@ -260,7 +260,7 @@ void get_dram_reader_core_coords_wormhole_b0(
     process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step);

     // merge two group into one
-    std::vector adj_core_physical_realloc;
+    std::vector adj_core_physical_realloc; adj_core_physical_realloc.reserve(num_banks);
     for (int i = 0; i < indices_g1_realloc.size(); ++i) {
         adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]);
     }
@@ -269,7 +269,7 @@ void get_dram_reader_core_coords_wormhole_b0(
     }

     // find the logical coord from physical coord
-    std::vector adj_core_logical_realloc;
+    std::vector adj_core_logical_realloc; adj_core_logical_realloc.reserve(num_banks);
     for (int i = 0; i < adj_core_physical_realloc.size(); ++i) {
         for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
             auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
@@ -552,7 +552,6 @@ operation::ProgramWithCallbacks create_program_dram_sharded(
         (std::uint32_t)num_blocks_per_shard};

     std::vector in1_sender_writer_compile_time_args = {
-        // (std::uint32_t) in1_buffer->address(),
         (std::uint32_t)in1_buffer_page_size,
         (std::uint32_t)in1_buffer_num_pages,
         // in1 block args
diff --git a/tt_eager/tt_lib/csrc/operations/primary/module.hpp b/tt_eager/tt_lib/csrc/operations/primary/module.hpp
index 80123e02ec1..00bfced2d4f 100644
--- a/tt_eager/tt_lib/csrc/operations/primary/module.hpp
+++ b/tt_eager/tt_lib/csrc/operations/primary/module.hpp
@@ -129,10 +129,7 @@ void py_module(py::module& m_primary) {
             std::size_t,
             std::size_t,
             bool,
-            std::optional,
-            bool,
-            bool,
-            bool>(),
+            std::optional>(),
             py::kw_only(),
             py::arg("in0_block_w").noconvert(),
             py::arg("out_subblock_h").noconvert(),
@@ -140,10 +137,7 @@ void py_module(py::module& m_primary) {
             py::arg("per_core_M").noconvert(),
             py::arg("per_core_N").noconvert(),
             py::arg("fuse_batch").noconvert(),
-            py::arg("fused_activation"),
-            py::arg("skip_compute").noconvert(),
-            py::arg("skip_in0_mcast").noconvert(),
-            py::arg("skip_write_back").noconvert())
+            py::arg("fused_activation"))
         .def_readwrite("fused_activation", &MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig::fused_activation)
         .def("__repr__", [](const MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig& config) {
             return fmt::format("{}", config);
@@ -364,31 +358,34 @@ void py_module(py::module& m_primary) {
     )doc");

     m_primary.def(
-        "matmul_1d",
+        "matmul",
        [](const Tensor& input_tensor_a,
           const Tensor& input_tensor_b,
           std::optional bias,
-           const std::optional& program_config,
+           const MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig& program_config,
           const MemoryConfig& out_mem_config,
           std::optional output_dtype,
-           std::optional compute_kernel_config) {
-            return matmul_1d(
+           std::optional compute_kernel_config,
+           const bool untilize_out) {
+            return matmul(
                input_tensor_a,
                input_tensor_b,
                bias,
                program_config,
                out_mem_config,
                output_dtype,
-                compute_kernel_config);
+                compute_kernel_config,
+                untilize_out);
        },
        py::arg("input_tensor_a").noconvert(),
        py::arg("input_tensor_b").noconvert(),
        py::kw_only(),
        py::arg("bias").noconvert() = std::nullopt,
-        py::arg("program_config").noconvert() = std::nullopt,
+        py::arg("program_config").noconvert(),
        py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
        py::arg("output_dtype").noconvert() = std::nullopt,
        py::arg("compute_kernel_config").noconvert() = std::nullopt,
+        py::arg("untilize_out").noconvert() = false,
        R"doc(
        Perform a matrix multiplication ``input_tensor_a x input_tensor_b``.
@@ -398,23 +395,28 @@ void py_module(py::module& m_primary) {
            "input_tensor_a", "First tensor to multiply", "Tensor", "Tensor of shape [B_a, C_a, M, K]", "Yes"
            "input_tensor_b", "Second tensor to multiply", "Tensor", "Tensor of shape [B_b, C_b, K, N]", "Yes"
            "bias", "Bias to add", "Tensor", "Tensor of shape [1, 1, 1, N]", "Yes"
-            "program_config", "", "MatmulMultiCoreReuseMultiCast1DProgramConfig", "Config will be automatically determined if not passed", "Yes"
+            "program_config", "", "MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig", "", "Yes"
            "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No"
            "output_dtype", "Output Data Type", "DataType", "By default it will be set to the data type of `input_tensor_a`", "No"
    )doc");

    m_primary.def(
-        "matmul_dram_sharded",
+        "matmul_1d",
        [](const Tensor& input_tensor_a,
           const Tensor& input_tensor_b,
           std::optional bias,
-           const std::optional& program_config,
+           const std::optional& program_config,
           const MemoryConfig& out_mem_config,
           std::optional output_dtype,
-           std::optional compute_kernel_config
-        ) {
-            return matmul_dram_sharded(
-                input_tensor_a, input_tensor_b, bias, program_config, out_mem_config, output_dtype, compute_kernel_config);
+           std::optional compute_kernel_config) {
+            return matmul_1d(
+                input_tensor_a,
+                input_tensor_b,
+                bias,
+                program_config,
+                out_mem_config,
+                output_dtype,
+                compute_kernel_config);
        },
        py::arg("input_tensor_a").noconvert(),
        py::arg("input_tensor_b").noconvert(),
diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h
index dfc08bac6f7..e985e0f521c 100644
--- a/tt_metal/hw/inc/dataflow_api.h
+++ b/tt_metal/hw/inc/dataflow_api.h
@@ -1826,7 +1826,6 @@ uint32_t noc_async_read_tile_dram_sharded_set_state(uint32_t bank_base_address,
     src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];

     DEBUG_STATUS("NRTW");
-    DEBUG_SANITIZE_NOC_READ_TRANSACTION(get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
     while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
     DEBUG_STATUS("NRTD");

diff --git a/tt_metal/hw/inc/debug/dprint_tile.h b/tt_metal/hw/inc/debug/dprint_tile.h
index fde6517cd44..835fe3b6fea 100644
--- a/tt_metal/hw/inc/debug/dprint_tile.h
+++ b/tt_metal/hw/inc/debug/dprint_tile.h
@@ -19,38 +19,6 @@ struct SliceRange {
     static inline SliceRange hw0_32_4() { return SliceRange{ .h0 = 0, .h1 = 32, .hs = 4, .w0 = 0, .w1 = 32, .ws = 4 }; }
     // [0, 0:32]
     static inline SliceRange h0_w0_32() { return SliceRange{ .h0 = 0, .h1 = 1, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h1_w0_32() { return SliceRange{ .h0 = 1, .h1 = 2, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h2_w0_32() { return SliceRange{ .h0 = 2, .h1 = 3, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h3_w0_32() { return SliceRange{ .h0 = 3, .h1 = 4, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h4_w0_32() { return SliceRange{ .h0 = 4, .h1 = 5, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h5_w0_32() { return SliceRange{ .h0 = 5, .h1 = 6, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h6_w0_32() { return SliceRange{ .h0 = 6, .h1 = 7, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h7_w0_32() { return SliceRange{ .h0 = 7, .h1 = 8, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h8_w0_32() { return SliceRange{ .h0 = 8, .h1 = 9, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h9_w0_32() { return SliceRange{ .h0 = 9, .h1 = 10, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h10_w0_32() { return SliceRange{ .h0 = 10, .h1 = 11, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h11_w0_32() { return SliceRange{ .h0 = 11, .h1 = 12, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h12_w0_32() { return SliceRange{ .h0 = 12, .h1 = 13, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h13_w0_32() { return SliceRange{ .h0 = 13, .h1 = 14, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h14_w0_32() { return SliceRange{ .h0 = 14, .h1 = 15, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h15_w0_32() { return SliceRange{ .h0 = 15, .h1 = 16, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h16_w0_32() { return SliceRange{ .h0 = 16, .h1 = 17, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h17_w0_32() { return SliceRange{ .h0 = 17, .h1 = 18, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h18_w0_32() { return SliceRange{ .h0 = 18, .h1 = 19, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h19_w0_32() { return SliceRange{ .h0 = 19, .h1 = 20, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h20_w0_32() { return SliceRange{ .h0 = 20, .h1 = 21, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h21_w0_32() { return SliceRange{ .h0 = 21, .h1 = 22, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h22_w0_32() { return SliceRange{ .h0 = 22, .h1 = 23, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h23_w0_32() { return SliceRange{ .h0 = 23, .h1 = 24, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h24_w0_32() { return SliceRange{ .h0 = 24, .h1 = 25, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h25_w0_32() { return SliceRange{ .h0 = 25, .h1 = 26, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h26_w0_32() { return SliceRange{ .h0 = 26, .h1 = 27, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h27_w0_32() { return SliceRange{ .h0 = 27, .h1 = 28, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h28_w0_32() { return SliceRange{ .h0 = 28, .h1 = 29, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h29_w0_32() { return SliceRange{ .h0 = 29, .h1 = 30, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h30_w0_32() { return SliceRange{ .h0 = 30, .h1 = 31, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-    static inline SliceRange h31_w0_32() { return SliceRange{ .h0 = 31, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1 }; }
-
     // [0:32, 0]
     static inline SliceRange h0_32_w0() { return SliceRange{ .h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1 }; }
     // [0:32:1, 1]
diff --git a/tt_metal/include/compute_kernel_api/matmul.h b/tt_metal/include/compute_kernel_api/matmul.h
index ba111c23bc9..154f957f60b 100644
--- a/tt_metal/include/compute_kernel_api/matmul.h
+++ b/tt_metal/include/compute_kernel_api/matmul.h
@@ -141,19 +141,6 @@ ALWI void mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t
     PACK(( llk_setup_outputs() ));
     PACK(( llk_pack_dest_init() ));
 }
-ALWI void mm_block_init_new(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose = 0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) {
-    // UNPACK(( llk_setup_operands() ));
-    // UNPACK(( llk_unpack_reconfig_data_format_srca(24, in1_cb_id) ));
-    UNPACK(( llk_unpack_AB_matmul_hw_configure_disaggregated(in0_cb_id, in1_cb_id) ));
-    UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) ));
-    MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) ));
-    // MATH(( llk_math_pack_sync_init() ));
-
-    PACK(( llk_pack_hw_configure_disaggregated(out_cb_id) ));
-    PACK(( llk_pack_init(out_cb_id) ));
-    // PACK(( llk_setup_outputs() ));
-    // PACK(( llk_pack_dest_init() ));
-}

 /**
  * Performs block-sized matrix multiplication *C=A\*B* between the blocks in two