#8437: Use fallback matmul for large tensors and adjust tests
bbradelTT committed May 24, 2024
1 parent 8c8e174 commit 277b990
Showing 3 changed files with 13 additions and 7 deletions.
12 changes: 6 additions & 6 deletions models/demos/falcon7b/tests/test_perf_falcon.py
@@ -376,10 +376,10 @@ class TestParametrized:
     @pytest.mark.parametrize(
         "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
         (
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.85, 0.97, 0.86, 0.31),
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.85, 0.97, 0.86, 0.29),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.87, 0.43),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.87, 0.34),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.837, 0.97, 0.86, 0.31),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.837, 0.97, 0.86, 0.29),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.865, 0.43),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.865, 0.34),
             ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.63, 0.80, 0.84, 0.28),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.63, 0.80, 0.84, 0.28),
             ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.56, 0.86, 0.88, 0.37),
@@ -509,8 +509,8 @@ def run_perf_wh_bare_metal(
         (
             ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.96, 0.1),
             ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.97, 0.99, 0.96, 0.1),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.96, 0.18),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.98, 0.99, 0.96, 0.18),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.979, 0.99, 0.96, 0.18),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.979, 0.99, 0.96, 0.18),
             ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.91, 0.92, 0.93, 0.15),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.91, 0.92, 0.93, 0.15),
             ("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.92, 0.95, 0.95, 0.1),
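For reference, the expected_*_pcc columns above are Pearson correlation coefficient (PCC) thresholds comparing device output against a host reference; this commit loosens a few prefill thresholds slightly (e.g. 0.85 → 0.837) to match the accuracy of the fallback matmul path. A minimal sketch of what such a check computes, assuming a NumPy-style helper rather than the repo's actual comparison utility:

import numpy as np

# Hypothetical PCC helper; tt-metal's own comparison utility may differ
# (e.g. in NaN/Inf handling), so treat this as an illustration only.
def pcc(expected: np.ndarray, actual: np.ndarray) -> float:
    # Flatten both tensors and take the off-diagonal entry of the
    # 2x2 Pearson correlation matrix.
    e = expected.reshape(-1).astype(np.float64)
    a = actual.reshape(-1).astype(np.float64)
    return float(np.corrcoef(e, a)[0, 1])

# Example: the new prefill seq-128 output threshold.
# assert pcc(ref_logits, device_logits) >= 0.837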
2 changes: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 @pytest.mark.parametrize(
     "batch_size, test, expected_perf",
     [
-        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.44],
+        [1, "BFLOAT16-L1-falcon_7b-layers_32-prefill_seq256", 3.58],
         [32, "BFLOAT16-L1-falcon_7b-layers_32-decode_batch32", 139],
     ],
 )
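The expected_perf values feed a device performance regression check; the prefill_seq256 target rises from 3.44 to 3.58 alongside the matmul change. A hedged sketch of the kind of lower-bound assertion such a test typically makes (helper name and margin are assumptions, not the repo's actual harness):

# Hypothetical lower-bound perf assertion; the real harness and its
# units may differ from this sketch.
def check_perf(measured: float, expected: float, rel_margin: float = 0.05) -> None:
    lower_bound = expected * (1.0 - rel_margin)
    assert measured >= lower_bound, (
        f"Perf regression: measured {measured:.2f} below "
        f"allowed minimum {lower_bound:.2f} (expected {expected:.2f})"
    )

# Example: check_perf(measured=3.60, expected=3.58)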
6 changes: 6 additions & 0 deletions tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -1059,6 +1059,12 @@ operation::ProgramWithCallbacks Matmul::create_program(
                 per_core_M = get_per_core_for_multiple_blocks(per_core_M, Mt);
                 per_core_N = get_per_core_for_multiple_blocks(per_core_N, Nt);
             }
+            if (per_core_M > max_per_tile || per_core_N > max_per_tile) {
+                tt::log_debug(tt::LogOp, "Per core value too large. Need to use Matmul multi core. No support for bias and fused activation. per_core_M={}, per_core_N={}", per_core_M, per_core_N);
+                TT_FATAL(!program_config.fused_activation.has_value(), "Fused activation is not supported for matmul multi core");
+                TT_FATAL(!bias.has_value(), "Bias is not supported for matmul multi core");
+                return matmul_multi_core(input_tensor_a, input_tensor_b, output_tensor, broadcast_batch);
+            }
             tt::log_debug(tt::LogOp, "Matmul adjustments: fuse_batch={} in0_block_w={} per_core_M={}, per_core_N={}", fuse_batch, in0_block_w, per_core_M, per_core_N);
         }
         return matmul_multi_core_reuse_mcast_2d_optimized(
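The new guard routes oversized workloads away from the mcast-2D-optimized kernel: when either per-core dimension exceeds the limit, it falls back to the plain multi-core matmul, which supports neither bias nor fused activation, hence the two TT_FATAL checks. A Python sketch of the dispatch logic (names are illustrative; the real implementation is the C++ above):

# Illustrative dispatch sketch mirroring the C++ guard in bmm_op.cpp;
# function names and the max_per_tile limit are placeholders here.
def dispatch_matmul(per_core_M: int, per_core_N: int, max_per_tile: int,
                    has_bias: bool, has_fused_activation: bool) -> str:
    if per_core_M > max_per_tile or per_core_N > max_per_tile:
        # Fallback path: plain multi-core matmul only.
        if has_fused_activation:
            raise ValueError("Fused activation is not supported for matmul multi core")
        if has_bias:
            raise ValueError("Bias is not supported for matmul multi core")
        return "matmul_multi_core"
    # Common path: optimized 2D multicast matmul.
    return "matmul_multi_core_reuse_mcast_2d_optimized"

# Example: a per-core dimension beyond the limit selects the fallback.
# dispatch_matmul(per_core_M=20, per_core_N=4, max_per_tile=16,
#                 has_bias=False, has_fused_activation=False)
#   -> "matmul_multi_core"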
