#8437: Use fallback matmul for large tensors and adjust tests
bbradelTT committed May 24, 2024
1 parent 8c8e174 commit 277b990
Showing 3 changed files with 13 additions and 7 deletions.
12 changes: 6 additions & 6 deletions models/demos/falcon7b/tests/test_perf_falcon.py
@@ -376,10 +376,10 @@ class TestParametrized:
     @pytest.mark.parametrize(
         "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
         (
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.85, 0.97, 0.86, 0.31),
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.85, 0.97, 0.86, 0.29),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.87, 0.43),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.87, 0.34),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.837, 0.97, 0.86, 0.31),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.837, 0.97, 0.86, 0.29),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.865, 0.43),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.865, 0.34),
             ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.63, 0.80, 0.84, 0.28),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.63, 0.80, 0.84, 0.28),
             ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.56, 0.86, 0.88, 0.37),
@@ -509,8 +509,8 @@ def run_perf_wh_bare_metal(
         (
             ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.96, 0.1),
             ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.97, 0.99, 0.96, 0.1),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.96, 0.18),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.98, 0.99, 0.96, 0.18),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.979, 0.99, 0.96, 0.18),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.979, 0.99, 0.96, 0.18),
             ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.91, 0.92, 0.93, 0.15),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.91, 0.92, 0.93, 0.15),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.92, 0.95, 0.95, 0.1),
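For reference, the expected_*_pcc columns above are Pearson correlation coefficient (PCC) thresholds comparing device output against a host reference; this commit loosens a few prefill thresholds slightly (e.g. 0.85 → 0.837) to match the accuracy of the fallback matmul path. A minimal sketch of what such a check computes, assuming a NumPy-style helper rather than the repo's actual comparison utility:

import numpy as np

# Hypothetical PCC helper; tt-metal's own comparison utility may differ
# (e.g. in NaN/Inf handling), so treat this as an illustration only.
def pcc(expected: np.ndarray, actual: np.ndarray) -> float:
    # Flatten both tensors and take the off-diagonal entry of the
    # 2x2 Pearson correlation matrix.
    e = expected.reshape(-1).astype(np.float64)
    a = actual.reshape(-1).astype(np.float64)
    return float(np.corrcoef(e, a)[0, 1])

# Example: the new prefill seq-128 output threshold.
# assert pcc(ref_logits, device_logits) >= 0.837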
2 changes: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 @pytest.mark.parametrize(
     "batch_size, test, expected_perf",
     [
-        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.44],
+        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.58],
         [32, "BFLOAT16-L1-falcon_7b-layers_32-decode_batch32", 139],
     ],
 )
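The expected_perf values feed a device performance regression check; the prefill_seq256 target rises from 3.44 to 3.58 alongside the matmul change. A hedged sketch of the kind of lower-bound assertion such a test typically makes (helper name and margin are assumptions, not the repo's actual harness):

# Hypothetical lower-bound perf assertion; the real harness and its
# units may differ from this sketch.
def check_perf(measured: float, expected: float, rel_margin: float = 0.05) -> None:
    lower_bound = expected * (1.0 - rel_margin)
    assert measured >= lower_bound, (
        f"Perf regression: measured {measured:.2f} below "
        f"allowed minimum {lower_bound:.2f} (expected {expected:.2f})"
    )

# Example: check_perf(measured=3.60, expected=3.58)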
6 changes: 6 additions & 0 deletions tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -1059,6 +1059,12 @@ operation::ProgramWithCallbacks Matmul::create_program(
                 per_core_M = get_per_core_for_multiple_blocks(per_core_M, Mt);
                 per_core_N = get_per_core_for_multiple_blocks(per_core_N, Nt);
             }
+            if (per_core_M > max_per_tile || per_core_N > max_per_tile) {
+                tt::log_debug(tt::LogOp, "Per core value too large. Need to use Matmul multi core. No support for bias and fused activation. per_core_M={}, per_core_N={}", per_core_M, per_core_N);
+                TT_FATAL(!program_config.fused_activation.has_value(), "Fused activation is not supported for matmul multi core");
+                TT_FATAL(!bias.has_value(), "Bias is not supported for matmul multi core");
+                return matmul_multi_core(input_tensor_a, input_tensor_b, output_tensor, broadcast_batch);
+            }
             tt::log_debug(tt::LogOp, "Matmul adjustments: fuse_batch={} in0_block_w={} per_core_M={}, per_core_N={}", fuse_batch, in0_block_w, per_core_M, per_core_N);
         }
         return matmul_multi_core_reuse_mcast_2d_optimized(
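The new guard routes oversized workloads away from the mcast-2D-optimized kernel: when either per-core dimension exceeds the limit, it falls back to the plain multi-core matmul, which supports neither bias nor fused activation, hence the two TT_FATAL checks. A Python sketch of the dispatch logic (names are illustrative; the real implementation is the C++ above):

# Illustrative dispatch sketch mirroring the C++ guard in bmm_op.cpp;
# function names and the max_per_tile limit are placeholders here.
def dispatch_matmul(per_core_M: int, per_core_N: int, max_per_tile: int,
                    has_bias: bool, has_fused_activation: bool) -> str:
    if per_core_M > max_per_tile or per_core_N > max_per_tile:
        # Fallback path: plain multi-core matmul only.
        if has_fused_activation:
            raise ValueError("Fused activation is not supported for matmul multi core")
        if has_bias:
            raise ValueError("Bias is not supported for matmul multi core")
        return "matmul_multi_core"
    # Common path: optimized 2D multicast matmul.
    return "matmul_multi_core_reuse_mcast_2d_optimized"

# Example: a per-core dimension beyond the limit selects the fallback.
# dispatch_matmul(per_core_M=20, per_core_N=4, max_per_tile=16,
#                 has_bias=False, has_fused_activation=False)
#   -> "matmul_multi_core"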
