diff --git a/models/demos/ttnn_falcon7b/tests/test_perf_device_falcon.py b/models/demos/ttnn_falcon7b/tests/test_perf_device_falcon.py
index 6dade4d29b8..610bc838119 100644
--- a/models/demos/ttnn_falcon7b/tests/test_perf_device_falcon.py
+++ b/models/demos/ttnn_falcon7b/tests/test_perf_device_falcon.py
@@ -10,7 +10,7 @@
 @pytest.mark.parametrize(
     "batch_size, test, expected_perf",
     [
-        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.49],
+        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.44],
         [32, "BFLOAT16-L1-falcon_7b-layers_32-decode_batch32", 139],
     ],
 )
diff --git a/tests/ttnn/unit_tests/operations/test_linear.py b/tests/ttnn/unit_tests/operations/test_linear.py
index cadb3a39d7b..d54caef1ed3 100644
--- a/tests/ttnn/unit_tests/operations/test_linear.py
+++ b/tests/ttnn/unit_tests/operations/test_linear.py
@@ -243,4 +243,4 @@ def test_bloom_ff2_linear(device):
         dtype=ttnn.bfloat16,
     )
 
-    assert ttnn.pearson_correlation_coefficient(torch_output, output) >= 0.9989
+    assert ttnn.pearson_correlation_coefficient(torch_output, output) >= 0.9988
diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
index 136f015e8a9..ff4d8b3c3f9 100644
--- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
+++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -1229,6 +1229,7 @@ MatmulProgramConfig create_matmul_program_config(const Tensor& input_tensor_a, c
     uint32_t batch_size_a = get_batch_size(a_padded_shape);
     uint32_t batch_size_b = get_batch_size(b_padded_shape);
     bool input_b_is_batched = batch_size_b > 1;
+    bool any_size_within_tile = k_size <= ttnn::TILE_SIZE || m_size <= ttnn::TILE_SIZE || n_size <= ttnn::TILE_SIZE;
     auto input_tensor_a_memory_config = input_tensor_a.memory_config();
     auto input_tensor_b_memory_config = input_tensor_b.memory_config();
     bool fp32_dest_acc_en = bmm_op_utils::get_fp32_dest_acc_en(compute_kernel_config);
@@ -1285,7 +1286,7 @@ MatmulProgramConfig create_matmul_program_config(const Tensor& input_tensor_a, c
     auto height = batch_size_a * m_size;
     auto width = n_size;
     auto height_width_ratio = (height > width) ? height / width : width / height;
-    if (height_width_ratio > 8) {
+    if (height_width_ratio > 8 || any_size_within_tile) {
         return create_matmul_1d_systolic_array_program_config(a_shape, b_shape, core_coord, fused_activation, fp32_dest_acc_en);
     }
     if (!a_is_sharded) {
diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
index a32a3c5d76a..fe1d2f52f87 100644
--- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
+++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
@@ -401,8 +401,8 @@ inline Tensor matmul(
     const auto& input_tensor_a = input_tensors.at(0);
     const auto& input_tensor_b = input_tensors.at(1);
     auto arch = input_tensor_a.device()->arch();
-    const auto program_config_default = is_program_config_default(program_config);
-    auto math_fidelity = program_config_default ? MathFidelity::HiFi2 : MathFidelity::LoFi;
+    const auto increase_fidelity = is_program_config_default(program_config) || user_core_coord.has_value();
+    auto math_fidelity = increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi;
     auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, math_fidelity);
     bool broadcast_batch = get_broadcast_batch(input_tensor_a, input_tensor_b, program_config);
     auto matmul_program_config = program_config;