#7511: Use 1d matmul if any dim is 1 tile, adjust fidelity and tests slightly
bbradelTT committed May 11, 2024
1 parent 1a35e85 commit 780b0f3
Showing 4 changed files with 6 additions and 5 deletions.
@@ -10,7 +10,7 @@
 @pytest.mark.parametrize(
     "batch_size, test, expected_perf",
     [
-        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.49],
+        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.44],
         [32, "BFLOAT16-L1-falcon_7b-layers_32-decode_batch32", 139],
     ],
 )
2 changes: 1 addition & 1 deletion tests/ttnn/unit_tests/operations/test_linear.py
@@ -243,4 +243,4 @@ def test_bloom_ff2_linear(device):
         dtype=ttnn.bfloat16,
     )
 
-    assert ttnn.pearson_correlation_coefficient(torch_output, output) >= 0.9989
+    assert ttnn.pearson_correlation_coefficient(torch_output, output) >= 0.9988
3 changes: 2 additions & 1 deletion tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -1229,6 +1229,7 @@ MatmulProgramConfig create_matmul_program_config(const Tensor& input_tensor_a, c
     uint32_t batch_size_a = get_batch_size(a_padded_shape);
     uint32_t batch_size_b = get_batch_size(b_padded_shape);
     bool input_b_is_batched = batch_size_b > 1;
+    bool any_size_within_tile = k_size <= ttnn::TILE_SIZE || m_size <= ttnn::TILE_SIZE || n_size <= ttnn::TILE_SIZE;
     auto input_tensor_a_memory_config = input_tensor_a.memory_config();
     auto input_tensor_b_memory_config = input_tensor_b.memory_config();
     bool fp32_dest_acc_en = bmm_op_utils::get_fp32_dest_acc_en(compute_kernel_config);
@@ -1285,7 +1286,7 @@ MatmulProgramConfig create_matmul_program_config(const Tensor& input_tensor_a, c
     auto height = batch_size_a * m_size;
     auto width = n_size;
     auto height_width_ratio = (height > width) ? height / width : width / height;
-    if (height_width_ratio > 8) {
+    if (height_width_ratio > 8 || any_size_within_tile) {
         return create_matmul_1d_systolic_array_program_config(a_shape, b_shape, core_coord, fused_activation, fp32_dest_acc_en);
     }
     if (!a_is_sharded) {
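
For reference, a minimal standalone C++ sketch of the dispatch rule introduced by this hunk, under the assumption that ttnn tiles are 32x32; the name should_use_1d_matmul and the free-function form are illustrative placeholders, not the actual create_matmul_program_config implementation. It shows that the 1D systolic-array config is now chosen when the output is strongly rectangular or when any of M/K/N fits within a single tile.

#include <cstdint>

// Assumption: ttnn tiles are 32x32, so a dimension "within a tile" is <= 32.
constexpr uint32_t TILE_SIZE = 32;

// Sketch of the condition after this commit: fall back to the 1D
// systolic-array matmul program config when the output is strongly
// rectangular (height/width ratio above 8) or any dimension is at most
// one tile.
bool should_use_1d_matmul(uint32_t m_size, uint32_t k_size, uint32_t n_size, uint32_t batch_size_a) {
    bool any_size_within_tile =
        k_size <= TILE_SIZE || m_size <= TILE_SIZE || n_size <= TILE_SIZE;
    uint32_t height = batch_size_a * m_size;
    uint32_t width = n_size;
    uint32_t height_width_ratio = (height > width) ? height / width : width / height;
    return height_width_ratio > 8 || any_size_within_tile;
}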
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
@@ -401,8 +401,8 @@ inline Tensor matmul(
     const auto& input_tensor_a = input_tensors.at(0);
     const auto& input_tensor_b = input_tensors.at(1);
     auto arch = input_tensor_a.device()->arch();
-    const auto program_config_default = is_program_config_default(program_config);
-    auto math_fidelity = program_config_default ? MathFidelity::HiFi2 : MathFidelity::LoFi;
+    const auto increase_fidelity = is_program_config_default(program_config) || user_core_coord.has_value();
+    auto math_fidelity = increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi;
     auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, math_fidelity);
     bool broadcast_batch = get_broadcast_batch(input_tensor_a, input_tensor_b, program_config);
     auto matmul_program_config = program_config;
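
A hedged sketch of the fidelity selection after this change; the helper name, its boolean parameters, and the trimmed-down enum are placeholders rather than the exact ttnn types. Fidelity is raised to HiFi2 when the program config was left at its default or the caller supplied an explicit core grid, and otherwise stays at LoFi.

enum class MathFidelity { LoFi, HiFi2 };

// Placeholder helper illustrating the new rule: HiFi2 when the program
// config is the default OR a user core coordinate was provided; LoFi
// otherwise.
MathFidelity choose_math_fidelity(bool program_config_is_default, bool user_core_coord_has_value) {
    const bool increase_fidelity = program_config_is_default || user_core_coord_has_value;
    return increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi;
}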
