From 2ecffa19879c3b837d65ecd1f1ee1f6713a09426 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Wed, 8 May 2024 15:28:10 +0000 Subject: [PATCH] #8174: [WIP] Replace ttlib custom Falcon matmuls with ttnn matmuls --- models/demos/falcon7b/demo/demo.py | 16 ++-- .../falcon7b/tests/test_falcon_attention.py | 4 +- .../falcon7b/tests/test_falcon_causallm.py | 8 +- .../falcon7b/tests/test_falcon_decoder.py | 4 +- .../falcon7b/tests/test_falcon_end_to_end.py | 4 +- .../demos/falcon7b/tests/test_falcon_mlp.py | 4 +- .../demos/falcon7b/tests/test_falcon_model.py | 8 +- .../tests/test_falcon_prefill_decode.py | 8 +- .../demos/falcon7b/tests/test_perf_falcon.py | 29 +++--- ...n_matmuls_and_bmms_with_mixed_precision.py | 61 +++++++++--- models/demos/falcon7b/tt/falcon_attention.py | 49 ++++++---- models/demos/falcon7b/tt/falcon_causallm.py | 43 ++++++++- models/demos/falcon7b/tt/falcon_mlp.py | 91 ++++++++++++++++-- models/demos/falcon7b/tt/model_config.py | 3 +- models/demos/falcon7b/tt/model_utils.py | 9 ++ tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp | 94 ------------------- tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp | 9 -- .../tt_lib_bindings_tensor_custom_bmm_ops.cpp | 21 ----- ttnn/ttnn/__init__.py | 1 + ttnn/ttnn/device.py | 1 + ttnn/ttnn/operations/matmul.py | 1 + 21 files changed, 269 insertions(+), 199 deletions(-) diff --git a/models/demos/falcon7b/demo/demo.py b/models/demos/falcon7b/demo/demo.py index 490b5dd041b..a7acfc82182 100644 --- a/models/demos/falcon7b/demo/demo.py +++ b/models/demos/falcon7b/demo/demo.py @@ -16,6 +16,7 @@ from models.demos.falcon7b.reference.hf_modeling_falcon import FalconConfig, FalconForCausalLM from models.demos.falcon7b.tt.falcon_causallm import TtFalconCausalLM from models.demos.falcon7b.tt.model_config import get_model_config, model_config_entries +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.utility_functions import ( disable_compilation_reports, disable_persistent_kernel_cache, @@ -104,8 +105,8 @@ def print_output_prompts(generated_ids, tokenizer, batch_size, num_users_to_disp logger.info(f"Output for user {user_id}:\n{output_prompt}") -def update_model_config(model, model_config_str, prefill_seq_len=0): - model.model_config.update(get_model_config(model_config_str, prefill_seq_len)) +def update_model_config(model, model_config_str, core_grid, prefill_seq_len=0): + model.model_config.update(get_model_config(model_config_str, core_grid, prefill_seq_len)) def top_pk_logits(logits, p=0.9, k=10, temperature=1.0, return_probs=False): @@ -184,7 +185,10 @@ def run_falcon_demo_kv( ) profiler.end(f"tokenizing_inputs") - model_config = get_model_config(model_config_strs_prefill_decode[0], nearest_32(num_input_tokens)) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config( + model_config_strs_prefill_decode[0], default_core_grid, nearest_32(num_input_tokens) + ) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) @@ -280,7 +284,7 @@ def run_falcon_demo_kv( logger.info("Running 1st run decode stage with compile...") # Update model config - update_model_config(tt_FalconCausalLM_singlelayer, model_config_strs_prefill_decode[1]) + update_model_config(tt_FalconCausalLM_singlelayer, model_config_strs_prefill_decode[1], default_core_grid) decode_ids = torch.randint(low=0, high=configuration.vocab_size - 1, size=(global_batch, 1), dtype=torch.int64) @@ -333,7 +337,7 @@ def run_falcon_demo_kv( num_layers, configuration, 
max_seq_len, - get_model_config(model_config_strs_prefill_decode[0], nearest_32(num_input_tokens)), + get_model_config(model_config_strs_prefill_decode[0], default_core_grid, nearest_32(num_input_tokens)), tt_cache_path, nearest_32(num_input_tokens), ) @@ -408,7 +412,7 @@ def run_falcon_demo_kv( logger.info("Running inference decode stage...") # Update model config - update_model_config(tt_FalconCausalLM, model_config_strs_prefill_decode[1]) + update_model_config(tt_FalconCausalLM, model_config_strs_prefill_decode[1], default_core_grid) decode_ids = torch.zeros(global_batch, 1, dtype=torch.int64) for user_id, output_id in enumerate(output_ids): diff --git a/models/demos/falcon7b/tests/test_falcon_attention.py b/models/demos/falcon7b/tests/test_falcon_attention.py index 91e6d40c027..98fced7e555 100644 --- a/models/demos/falcon7b/tests/test_falcon_attention.py +++ b/models/demos/falcon7b/tests/test_falcon_attention.py @@ -11,6 +11,7 @@ ) from models.demos.falcon7b.tt.falcon_attention import TtFalconAttentionDecode, TtFalconAttentionPrefill from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.demos.falcon7b.tests.test_utils import get_rand_falcon_inputs, concat_device_outputs from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( comp_allclose, @@ -181,7 +182,8 @@ def test_FalconAttention_inference( ): devices = get_devices_for_t3000(all_devices, num_devices) - model_config = get_model_config(model_config_str, seq_len) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config(model_config_str, default_core_grid, seq_len) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_causallm.py b/models/demos/falcon7b/tests/test_falcon_causallm.py index ef01cd0e5a2..bd08ed74451 100644 --- a/models/demos/falcon7b/tests/test_falcon_causallm.py +++ b/models/demos/falcon7b/tests/test_falcon_causallm.py @@ -12,9 +12,8 @@ ) from models.demos.falcon7b.tt.falcon_causallm import TtFalconCausalLM -from models.demos.falcon7b.tt.model_config import ( - get_model_config, -) +from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.demos.falcon7b.tests.test_utils import get_rand_falcon_inputs, concat_device_out_layer_present from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( comp_allclose, @@ -230,7 +229,8 @@ def test_FalconCausalLM_inference( ): devices = get_devices_for_t3000(all_devices, num_devices) - model_config = get_model_config(model_config_str, seq_len) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config(model_config_str, default_core_grid, seq_len) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_decoder.py b/models/demos/falcon7b/tests/test_falcon_decoder.py index d723ebff1ab..5fabf2b3bf4 100644 --- a/models/demos/falcon7b/tests/test_falcon_decoder.py +++ b/models/demos/falcon7b/tests/test_falcon_decoder.py @@ -11,6 +11,7 @@ ) from models.demos.falcon7b.tt.falcon_decoder import TtFalconDecoderLayer from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import 
get_falcon_default_core_grid from models.demos.falcon7b.tests.test_utils import get_rand_falcon_inputs, concat_device_outputs from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( comp_pcc, @@ -172,7 +173,8 @@ def test_FalconDecoder_inference( ): devices = get_devices_for_t3000(all_devices, num_devices) - model_config = get_model_config(model_config_str, seq_len) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config(model_config_str, default_core_grid, seq_len) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_end_to_end.py b/models/demos/falcon7b/tests/test_falcon_end_to_end.py index baed005074e..e92df37029f 100644 --- a/models/demos/falcon7b/tests/test_falcon_end_to_end.py +++ b/models/demos/falcon7b/tests/test_falcon_end_to_end.py @@ -14,6 +14,7 @@ # TODO: Remove this? from models.demos.falcon7b.tt.falcon_common import PytorchFalconCausalLM from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.utility_functions import ( disable_compilation_reports, disable_persistent_kernel_cache, @@ -369,7 +370,8 @@ def test_FalconCausalLM_end_to_end_with_program_cache( ): pytest.skip("#7933: Out of DRAM space error for tensor") - model_config = get_model_config(model_config_str, seq_len) + default_core_grid = get_falcon_default_core_grid(device) + model_config = get_model_config(model_config_str, default_core_grid, seq_len) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_mlp.py b/models/demos/falcon7b/tests/test_falcon_mlp.py index f4b2baf8af6..32d8f797484 100644 --- a/models/demos/falcon7b/tests/test_falcon_mlp.py +++ b/models/demos/falcon7b/tests/test_falcon_mlp.py @@ -8,6 +8,7 @@ from models.demos.falcon7b.reference.hf_modeling_falcon import FalconForCausalLM from models.demos.falcon7b.tt.falcon_mlp import TtFalconMLPDecode, TtFalconMLPPrefill from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.utility_functions import get_devices_for_t3000, torch2tt_tensor, tt2torch_tensor from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_allclose, comp_pcc @@ -147,7 +148,8 @@ def test_FalconMLP_inference( ): devices = get_devices_for_t3000(all_devices, num_devices) - model_config = get_model_config(model_config_str, seq_len) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config(model_config_str, default_core_grid, seq_len) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_model.py b/models/demos/falcon7b/tests/test_falcon_model.py index 3ea2af42415..b2aa457033c 100644 --- a/models/demos/falcon7b/tests/test_falcon_model.py +++ b/models/demos/falcon7b/tests/test_falcon_model.py @@ -10,9 +10,8 @@ FalconForCausalLM, ) from models.demos.falcon7b.tt.falcon_model import TtFalconModel -from models.demos.falcon7b.tt.model_config import ( - get_model_config, -) +from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from 
models.demos.falcon7b.tests.test_utils import get_rand_falcon_inputs, concat_device_out_layer_present from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( comp_allclose, @@ -220,7 +219,8 @@ def test_FalconModel_inference( ): devices = get_devices_for_t3000(all_devices, num_devices) - model_config = get_model_config(model_config_str) + default_core_grid = get_falcon_default_core_grid(devices[0]) + model_config = get_model_config(model_config_str, default_core_grid) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_falcon_prefill_decode.py b/models/demos/falcon7b/tests/test_falcon_prefill_decode.py index 5a0a0842cf0..b0650dcd83f 100644 --- a/models/demos/falcon7b/tests/test_falcon_prefill_decode.py +++ b/models/demos/falcon7b/tests/test_falcon_prefill_decode.py @@ -12,9 +12,8 @@ ) from models.demos.falcon7b.tt.falcon_causallm import TtFalconCausalLM -from models.demos.falcon7b.tt.model_config import ( - get_model_config, -) +from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( comp_allclose, @@ -205,7 +204,8 @@ def test_FalconCausalLM_inference( get_tt_cache_path, device, ): - model_config = get_model_config(model_config_str) + default_core_grid = get_falcon_default_core_grid(device) + model_config = get_model_config(model_config_str, default_core_grid) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) diff --git a/models/demos/falcon7b/tests/test_perf_falcon.py b/models/demos/falcon7b/tests/test_perf_falcon.py index 7e036c3b239..9f267597150 100644 --- a/models/demos/falcon7b/tests/test_perf_falcon.py +++ b/models/demos/falcon7b/tests/test_perf_falcon.py @@ -19,9 +19,8 @@ PytorchFalconCausalLM, ) -from models.demos.falcon7b.tt.model_config import ( - get_model_config, -) +from models.demos.falcon7b.tt.model_config import get_model_config +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid from models.demos.falcon7b.tests.test_utils import get_rand_falcon_inputs, concat_device_out_layer_present from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( get_atol_rtol_pcc, @@ -426,7 +425,8 @@ def test_perf_gs_bare_metal( if model_config_str == "BFLOAT16-L1_SHARDED": pytest.skip("Sharded config is not supported on GS") - model_config = get_model_config(model_config_str) + default_core_grid = get_falcon_default_core_grid(device) + model_config = get_model_config(model_config_str, default_core_grid) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) @@ -478,7 +478,8 @@ def run_perf_wh_bare_metal( # Enable Async Mode for device in devices: device.enable_async(async_mode) - model_config = get_model_config(model_config_str) + default_core_grid = get_falcon_default_core_grid(device) + model_config = get_model_config(model_config_str, default_core_grid) tt_cache_path = get_tt_cache_path( model_version, model_subdir="Falcon", default_dir=model_config["DEFAULT_CACHE_PATH"] ) @@ -511,14 +512,14 @@ def run_perf_wh_bare_metal( ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.97, 0.99, 0.96, 0.1), ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.96, 0.18), ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.98, 0.99, 
0.96, 0.18), - ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.91, 0.92, 0.93, 0.15), - ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.91, 0.92, 0.93, 0.15), + ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.92, 0.94, 0.94, 0.15), + ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.92, 0.94, 0.94, 0.15), ("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.92, 0.95, 0.95, 0.1), - ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.86, 0.92, 0.92, 0.4), - ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.86, 0.92, 0.92, 0.35), - ("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.85, 0.93, 0.94, 0.1), - ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.88, 0.93, 0.93, 0.75), - ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.88, 0.93, 0.93, 0.6), + ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.90, 0.94, 0.94, 0.4), + ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.90, 0.94, 0.94, 0.35), + ("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.89, 0.95, 0.95, 0.1), + ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.89, 0.92, 0.93, 0.75), + ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.89, 0.92, 0.93, 0.6), ), ids=[ "prefill_seq128_bf16_dram", @@ -589,9 +590,9 @@ def test_perf_wh_bare_metal( "llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time, async_mode", ( ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.96, 0.18, False), # Issue 7816 Inference time - ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.91, 0.91, 0.21, False), + ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.89, 0.90, 0.21, False), ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.96, 0.18, True), - ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.91, 0.91, 0.09, True), + ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.89, 0.90, 0.09, True), ), ids=[ "prefill_seq256", diff --git a/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py b/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py index 9e3b00eb013..2bebbfed6d5 100644 --- a/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py +++ b/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py @@ -5,8 +5,12 @@ import pytest from loguru import logger +import ttnn import tt_lib as ttl from models.utility_functions import comp_pcc, tt2torch_tensor, torch2tt_tensor, skip_for_wormhole_b0 +from models.demos.falcon7b.tt.model_utils import get_falcon_default_core_grid +from models.demos.falcon7b.tt.falcon_mlp import falcon_dense_4h_to_h_matmul, falcon_dense_h_to_4h_matmul +from models.demos.falcon7b.tt.falcon_causallm import falcon_lm_head_matmul import torch import math @@ -26,15 +30,15 @@ def run_falcon_matmul_test( if out_dtype == ttl.tensor.DataType.BFLOAT8_B: pcc = 0.98 - if falcon_op == ttl.tensor.falcon_fused_qkv_matmul: + if falcon_op == "falcon_fused_qkv_matmul": a_shape = [1, 1, seq_len, 4544] b_shape = [1, 1, 4544, 4672] expected_output_shape = [1, 1, seq_len, 4672] - elif falcon_op == ttl.tensor.falcon_selfout_matmul: + elif falcon_op == "falcon_selfout_matmul": a_shape = [1, 1, seq_len, 4544] b_shape = [1, 1, 4544, 4544] expected_output_shape = [1, 1, seq_len, 4544] - elif falcon_op == ttl.tensor.falcon_dense_4h_to_h_matmul: + elif falcon_op == "falcon_dense_4h_to_h_matmul": a_shape = [1, 1, seq_len, 18176] b_shape = [1, 1, 18176, 4544] expected_output_shape = [1, 
1, seq_len, 4544] @@ -59,7 +63,7 @@ def run_falcon_matmul_test( out_mem_config = ttl.tensor.MemoryConfig( ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM ) - elif falcon_op == ttl.tensor.falcon_dense_h_to_4h_matmul: + elif falcon_op == "falcon_dense_h_to_4h_matmul": a_shape = [1, 1, seq_len, 4544] b_shape = [1, 1, 4544, 18176] expected_output_shape = [1, 1, seq_len, 18176] @@ -77,7 +81,7 @@ def run_falcon_matmul_test( out_mem_config = ttl.tensor.MemoryConfig( ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM ) - elif falcon_op == ttl.tensor.falcon_lm_head_matmul: + elif falcon_op == "falcon_lm_head_matmul": a_shape = [1, 1, seq_len, 4544] b_shape = [1, 1, 4544, 65024] expected_output_shape = [1, 1, seq_len, 65024] @@ -116,7 +120,40 @@ def run_falcon_matmul_test( b_t = ttl.tensor.Tensor(B, in1_dtype).to(ttl.tensor.Layout.TILE).to(device, in1_mem_config) bias_t = None - out = falcon_op(a_t, b_t, bias_t, output_mem_config=out_mem_config, output_dtype=out_dtype) + default_core_grid = get_falcon_default_core_grid(device) + if falcon_op in ("falcon_fused_qkv_matmul", "falcon_selfout_matmul"): + out = ttnn.matmul( + a_t, + b_t, + memory_config=out_mem_config, + dtype=out_dtype, + core_grid=default_core_grid, + use_1d_systolic_array=True, + ) + elif falcon_op == "falcon_dense_4h_to_h_matmul": + out = falcon_dense_4h_to_h_matmul( + a_t, + b_t, + core_grid=default_core_grid, + output_mem_config=out_mem_config, + output_dtype=out_dtype, + packer_l1_acc=True, + ) + elif falcon_op == "falcon_dense_h_to_4h_matmul": + out = falcon_dense_h_to_4h_matmul( + a_t, + b_t, + core_grid=default_core_grid, + fused_activation=None, + output_mem_config=out_mem_config, + output_dtype=out_dtype, + ) + elif falcon_op == "falcon_lm_head_matmul": + out = falcon_lm_head_matmul( + a_t, b_t, core_grid=default_core_grid, output_mem_config=out_mem_config, output_dtype=out_dtype + ) + else: + raise NotImplementedError(f"falcon matmul op is undefined!") # Check memory and dtype of inputs and outputs assert a_t.memory_config().buffer_type == in0_mem_config.buffer_type @@ -172,11 +209,11 @@ def run_falcon_matmul_test( @pytest.mark.parametrize( "falcon_op", ( - ttl.tensor.falcon_fused_qkv_matmul, - ttl.tensor.falcon_selfout_matmul, - ttl.tensor.falcon_dense_4h_to_h_matmul, - ttl.tensor.falcon_dense_h_to_4h_matmul, - ttl.tensor.falcon_lm_head_matmul, + "falcon_fused_qkv_matmul", + "falcon_selfout_matmul", + "falcon_dense_4h_to_h_matmul", + "falcon_dense_h_to_4h_matmul", + "falcon_lm_head_matmul", ), ids=["fused_qkv", "selfout", "dense_4h_to_h", "dense_h_to_4h", "lm_head"], ) @@ -199,7 +236,7 @@ def test_falcon_matmul( ): compute_grid_size = device.compute_with_storage_grid_size() is_e75_grid_size = (compute_grid_size.x * compute_grid_size.y) == 88 - if is_e75_grid_size and (seq_len == 512) and (falcon_op == ttl.tensor.falcon_lm_head_matmul): + if is_e75_grid_size and (seq_len == 512) and (falcon_op == "falcon_lm_head_matmul"): pytest.skip(f"LM Head does not work on E75 grid size {compute_grid_size}") run_falcon_matmul_test( diff --git a/models/demos/falcon7b/tt/falcon_attention.py b/models/demos/falcon7b/tt/falcon_attention.py index 06e027e9537..6f99c9efa9c 100644 --- a/models/demos/falcon7b/tt/falcon_attention.py +++ b/models/demos/falcon7b/tt/falcon_attention.py @@ -8,6 +8,7 @@ from typing import List, Optional, Tuple import tt_lib +import ttnn from models.utility_functions import ( torch2tt_tensor, @@ -231,11 +232,13 @@ def forward( fused_query_key_value = [] for i in 
range(self.num_devices): fused_query_key_value.append( - tt_lib.tensor.falcon_fused_qkv_matmul( + ttnn.matmul( hidden_states[i], self.query_key_value_weights[i], - output_mem_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) ) @@ -353,11 +356,13 @@ def forward( ) for i in range(self.num_devices): - attn_output[i] = tt_lib.tensor.falcon_selfout_matmul( + attn_output[i] = ttnn.matmul( attn_output[i], self.dense_weights[i], - output_mem_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) return attn_output, layer_present @@ -389,11 +394,13 @@ def _optimized_forward( ] else: fused_query_key_value = [ - tt_lib.tensor.falcon_fused_qkv_matmul( + ttnn.matmul( hidden_states[device_id], self.query_key_value_weights[device_id], - output_mem_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) for device_id in range(self.num_devices) ] @@ -545,11 +552,13 @@ def _optimized_forward( ] attn_outputs = [ - tt_lib.tensor.falcon_selfout_matmul( + ttnn.matmul( attn_outputs[device_id], self.dense_weights[device_id], - output_mem_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) for device_id in range(self.num_devices) ] @@ -662,11 +671,13 @@ def forward( fused_query_key_value = [] for i in range(self.num_devices): fused_query_key_value.append( - tt_lib.tensor.falcon_fused_qkv_matmul( + ttnn.matmul( hidden_states[i], self.query_key_value_weights[i], - output_mem_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["FUSED_QKV_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["FUSED_QKV_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) ) @@ -968,11 +979,13 @@ def forward( ) for i in range(self.num_devices): - attn_output[i] = tt_lib.tensor.falcon_selfout_matmul( + attn_output[i] = ttnn.matmul( attn_output[i], self.dense_weights[i], - output_mem_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], - output_dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + memory_config=self.model_config["SELFOUT_MM_OUTPUT_MEMCFG"], + dtype=self.model_config["SELFOUT_MM_OUTPUT_DTYPE"], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + use_1d_systolic_array=True, ) return attn_output, layer_present diff --git a/models/demos/falcon7b/tt/falcon_causallm.py b/models/demos/falcon7b/tt/falcon_causallm.py index 34f7539f0fc..a85a44f665b 100644 --- a/models/demos/falcon7b/tt/falcon_causallm.py +++ 
b/models/demos/falcon7b/tt/falcon_causallm.py @@ -6,12 +6,51 @@ import torch import tt_lib +import ttnn from models.demos.falcon7b.tt.falcon_lm_head import falcon_lm_head_matmul_2d from models.demos.falcon7b.tt.falcon_model import TtFalconModelShared from models.demos.falcon7b.tt.model_utils import get_weights_cached from models.utility_functions import torch_tensors_to_tt_tensors +def falcon_lm_head_matmul( + input_tensor_a, + input_tensor_b, + core_grid, + output_mem_config=ttnn.DRAM_MEMORY_CONFIG, + output_dtype=None, +): + seq_len = input_tensor_a.get_legacy_shape()[2] + if seq_len > 512: + # TODO: Review if this path is used? If not, we can delete + return ttnn.matmul(input_tensor_a, input_tensor_b, memory_config=output_mem_config, dtype=output_dtype) + else: + device_arch = input_tensor_a.device().arch() + if device_arch == ttnn.Arch.GRAYSKULL: + compute_kernel_config = ttnn.GrayskullComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, math_approx_mode=True + ) + elif device_arch == ttnn.Arch.WORMHOLE_B0: + compute_kernel_config = ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=True, + ) + else: + raise RuntimeError(f"Unsupported arch: {device_arch}") + + return ttnn.matmul( + input_tensor_a, + input_tensor_b, + memory_config=output_mem_config, + dtype=output_dtype, + core_grid=core_grid, + use_1d_systolic_array=True, + compute_kernel_config=compute_kernel_config, + ) + + class TtFalconCausalLM(TtFalconModelShared): def __init__( self, @@ -124,10 +163,10 @@ def forward( ] else: lm_logits = [ - tt_lib.tensor.falcon_lm_head_matmul( + falcon_lm_head_matmul( hidden_states[device_id], self.lm_head_weights[device_id], - bias=None, + core_grid=self.model_config["DEFAULT_CORE_GRID"], output_mem_config=self.model_config["LM_HEAD_MM_OUTPUT_MEMCFG"], output_dtype=self.model_config["LM_HEAD_MM_OUTPUT_DTYPE"], ) diff --git a/models/demos/falcon7b/tt/falcon_mlp.py b/models/demos/falcon7b/tt/falcon_mlp.py index 8cf59a9a040..9b5de6b8364 100644 --- a/models/demos/falcon7b/tt/falcon_mlp.py +++ b/models/demos/falcon7b/tt/falcon_mlp.py @@ -10,6 +10,81 @@ from torch import nn +def falcon_dense_4h_to_h_matmul( + input_tensor_a, + input_tensor_b, + core_grid, + output_mem_config=ttnn.DRAM_MEMORY_CONFIG, + output_dtype=None, + packer_l1_acc=False, +): + device_arch = input_tensor_a.device().arch() + if device_arch == ttnn.Arch.GRAYSKULL: + compute_kernel_config = ttnn.GrayskullComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, math_approx_mode=True + ) + elif device_arch == ttnn.Arch.WORMHOLE_B0: + compute_kernel_config = ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=packer_l1_acc, + ) + else: + raise RuntimeError(f"Unsupported arch: {device_arch}") + + return ttnn.matmul( + input_tensor_a, + input_tensor_b, + memory_config=output_mem_config, + dtype=output_dtype, + core_grid=core_grid, + use_1d_systolic_array=True, + compute_kernel_config=compute_kernel_config, + ) + + +def falcon_dense_h_to_4h_matmul( + input_tensor_a, + input_tensor_b, + core_grid, + fused_activation=None, + output_mem_config=ttnn.DRAM_MEMORY_CONFIG, + output_dtype=None, +): + seq_len = input_tensor_a.get_legacy_shape()[2] + if seq_len > 1024: + # TODO: Review if this path is used? 
If not, we can delete + assert fused_activation == None + return ttnn.matmul(input_tensor_a, input_tensor_b, memory_config=output_mem_config, dtype=output_dtype) + else: + device_arch = input_tensor_a.device().arch() + if device_arch == ttnn.Arch.GRAYSKULL: + compute_kernel_config = ttnn.GrayskullComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, math_approx_mode=True + ) + elif device_arch == ttnn.Arch.WORMHOLE_B0: + compute_kernel_config = ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=True, + ) + else: + raise RuntimeError(f"Unsupported arch: {device_arch}") + + return ttnn.matmul( + input_tensor_a, + input_tensor_b, + memory_config=output_mem_config, + dtype=output_dtype, + core_grid=core_grid, + activation=fused_activation, + use_1d_systolic_array=True, + compute_kernel_config=compute_kernel_config, + ) + + class TtFalconMLPPrefill(nn.Module): def __init__( self, @@ -190,19 +265,21 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor: hidden_states = [] for device_id in range(len(x)): hidden_states.append( - tt_lib.tensor.falcon_dense_h_to_4h_matmul( + falcon_dense_h_to_4h_matmul( x[device_id], self.dense_h_to_4h_weights[device_id], - fused_activation=[tt_lib.tensor.FusibleActivation.GELU, True], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + fused_activation="gelu", output_mem_config=self.model_config["DENSE_H_TO_4H_MM_OUTPUT_MEMCFG"], output_dtype=self.model_config["DENSE_H_TO_4H_MM_OUTPUT_DTYPE"], ) ) x[device_id].deallocate() for device_id in range(len(x)): - hidden_states[device_id] = tt_lib.tensor.falcon_dense_4h_to_h_matmul( + hidden_states[device_id] = falcon_dense_4h_to_h_matmul( hidden_states[device_id], self.dense_4h_to_h_weights[device_id], + core_grid=self.model_config["DEFAULT_CORE_GRID"], output_mem_config=self.model_config["DENSE_4H_TO_H_MM_OUTPUT_MEMCFG"], output_dtype=self.model_config["DENSE_4H_TO_H_MM_OUTPUT_DTYPE"], packer_l1_acc=True, @@ -300,19 +377,21 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor: [x[device_id], self.model_config["MLP_DECODE_PADDING_TENSORS"][device_id]], dim=3 ) hidden_states.append( - tt_lib.tensor.falcon_dense_h_to_4h_matmul( + falcon_dense_h_to_4h_matmul( x[device_id], self.dense_h_to_4h_weights[device_id], - fused_activation=[tt_lib.tensor.FusibleActivation.GELU, True], + core_grid=self.model_config["DEFAULT_CORE_GRID"], + fused_activation="gelu", output_mem_config=self.model_config["DENSE_H_TO_4H_MM_OUTPUT_MEMCFG"], output_dtype=self.model_config["DENSE_H_TO_4H_MM_OUTPUT_DTYPE"], ) ) x[device_id].deallocate() for device_id in range(len(x)): - hidden_states[device_id] = tt_lib.tensor.falcon_dense_4h_to_h_matmul( + hidden_states[device_id] = falcon_dense_4h_to_h_matmul( hidden_states[device_id], self.dense_4h_to_h_weights[device_id], + core_grid=self.model_config["DEFAULT_CORE_GRID"], output_mem_config=self.model_config["DENSE_4H_TO_H_MM_OUTPUT_MEMCFG"], output_dtype=self.model_config["DENSE_4H_TO_H_MM_OUTPUT_DTYPE"], packer_l1_acc=True, diff --git a/models/demos/falcon7b/tt/model_config.py b/models/demos/falcon7b/tt/model_config.py index f66125f3cc6..98539fddaa5 100644 --- a/models/demos/falcon7b/tt/model_config.py +++ b/models/demos/falcon7b/tt/model_config.py @@ -92,7 +92,7 @@ def pretty_print_model_config(model_config): return "\n".join(print_str) -def get_model_config(model_config_str, prefill_seq_len=0, optimized=False): +def get_model_config(model_config_str, default_core_grid, 
prefill_seq_len=0, optimized=False): assert model_config_str in ACCEPTABLE_MODEL_CONFIG_STRS DRAM_MEMCFG = ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM) L1_MEMCFG = ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1) @@ -115,6 +115,7 @@ def get_model_config(model_config_str, prefill_seq_len=0, optimized=False): "DEFAULT_MEMCFG": mem_config, "MOVE_DECODER_OUTPUT_BOOL": False, "DEFAULT_CACHE_PATH": Path(f"models/demos/falcon7b/datasets/"), + "DEFAULT_CORE_GRID": default_core_grid, } # DEFAULT_MEMCFG also used to determine banking for ttl.device.InitializeDevice model_config.update({f"{key}_MEMCFG": mem_config for key in OP_KEYS if key not in NO_MEMCFG}) model_config.update({f"{key}_DTYPE": dtype for key in OP_KEYS if key not in NO_DTYPE}) diff --git a/models/demos/falcon7b/tt/model_utils.py b/models/demos/falcon7b/tt/model_utils.py index 3ec7ea3aeb1..27a0f961d9e 100644 --- a/models/demos/falcon7b/tt/model_utils.py +++ b/models/demos/falcon7b/tt/model_utils.py @@ -4,6 +4,7 @@ import torch import tt_lib +import ttnn from models.utility_functions import torch2tt_tensor, pad_by_zero @@ -80,3 +81,11 @@ def get_weights_cached( tt_lib.tensor.dump_tensor(str(path), weights_host) return weights + + +# TODO: Remove this once there are no more hangs on 8x8 (Issue #6795) +def get_falcon_default_core_grid(device): + grid_size = device.compute_with_storage_grid_size() + if device.arch() == ttnn.Arch.WORMHOLE_B0 and grid_size.y >= 8: + grid_size.y = 7 + return ttnn.CoreGrid(y=grid_size.y, x=grid_size.x) diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp index 72013283fcf..154495882ce 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp @@ -529,100 +529,6 @@ tuple get_subblock_sizes(uint32_t m_tiles_per_core, uint32_t namespace tt { namespace tt_metal { -CoreCoord get_falcon_matmul_grid_size(Device *device){ - CoreCoord grid_size = device->compute_with_storage_grid_size(); - // TODO: Remove this once there are no more hangs on 8x8 (Issue #6795) - if (device->arch() == ARCH::WORMHOLE_B0 and grid_size.y >= 8){ - grid_size.y = 7; - } - return grid_size; -} - -/** - * Falcon matmuls using operations::primary::matmul + program_config - */ -Tensor falcon_fused_qkv_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config, std::optional output_dtype) { - CoreCoord grid_size = get_falcon_matmul_grid_size(input_tensor_a.device()); - auto program_config = bmm_op_utils::get_mcast_1d_config(input_tensor_a, input_tensor_b, true, std::nullopt, true, mem_config.is_sharded(), grid_size); - return operations::primary::matmul_1d(input_tensor_a, input_tensor_b, bias, program_config, mem_config, output_dtype); -} - -Tensor falcon_selfout_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config, std::optional output_dtype) { - CoreCoord grid_size = get_falcon_matmul_grid_size(input_tensor_a.device()); - auto program_config = bmm_op_utils::get_mcast_1d_config(input_tensor_a, input_tensor_b, true, std::nullopt, true, mem_config.is_sharded(), grid_size); - return operations::primary::matmul_1d(input_tensor_a, input_tensor_b, bias, program_config, mem_config, output_dtype); -} - -Tensor falcon_dense_4h_to_h_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config, 
std::optional output_dtype, std::optional packer_l1_acc) { - CoreCoord grid_size = get_falcon_matmul_grid_size(input_tensor_a.device()); - std::optional config = std::nullopt; - auto compute_kernel_config = init_device_compute_kernel_config(input_tensor_a.device()->arch(), config, MathFidelity::LoFi, true, false, packer_l1_acc.value_or(false)); - auto program_config = bmm_op_utils::get_mcast_1d_config( - input_tensor_a, input_tensor_b, true, std::nullopt, true, mem_config.is_sharded(), grid_size, compute_kernel_config); - return operations::primary::matmul_1d( - input_tensor_a, - input_tensor_b, - bias, - program_config, - mem_config, - output_dtype, - compute_kernel_config); -} - -Tensor falcon_dense_h_to_4h_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, std::optional fused_activation, const MemoryConfig& mem_config, std::optional output_dtype) { - auto seq_len = input_tensor_a.get_legacy_shape()[2]; - if (seq_len > 1024) { - TT_FATAL(not fused_activation.has_value()); - // TODO: Check support for seq_len == 128, 256, 512, ..., 2048 - TT_FATAL(seq_len % TILE_HEIGHT == 0, "Falcon mm's seq_len must be a multiple of 32!"); - TT_FATAL(seq_len >= 128, "Falcon mm's seq_len must be greater than 128!"); - TT_FATAL((input_tensor_a.get_legacy_shape() == Shape({1, 1, seq_len, 4544})), "Unsupported input shape"); - TT_FATAL((input_tensor_b.get_legacy_shape() == Shape({1, 1, 4544, 18176})), "Unsupported input shape"); - TT_FATAL(!fused_activation.has_value()); - return operation::run_with_autoformat(tt::operations::primary::Matmul{.program_config=tt::operations::primary::MatmulDefaultProgramConfig{}, .bcast_batch=true, .output_mem_config=mem_config, .output_dtype=output_dtype.value_or(input_tensor_a.get_dtype())}, {input_tensor_a, input_tensor_b}).at(0); - } else { - CoreCoord grid_size = get_falcon_matmul_grid_size(input_tensor_a.device()); - std::optional config = std::nullopt; - auto compute_kernel_config = init_device_compute_kernel_config(input_tensor_a.device()->arch(), config, MathFidelity::LoFi, true /* math_approx_mode */, false /* fp32_dest_acc_en */, true /* packer_l1_acc */); - auto program_config = bmm_op_utils::get_mcast_1d_config(input_tensor_a, input_tensor_b, true, fused_activation, true, mem_config.is_sharded(), grid_size, compute_kernel_config); - return operations::primary::matmul_1d(input_tensor_a, input_tensor_b, bias, program_config, mem_config, output_dtype, compute_kernel_config); - } -} - -Tensor falcon_lm_head_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config, std::optional output_dtype) { - auto seq_len = input_tensor_a.get_legacy_shape()[2]; - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a, input_tensor_b}, {bias}))}; - if (seq_len > 512) { - // TODO: Check support for seq_len == 128, 256, 512, ..., 2048 - operation::launch_with_autoformat( - [seq_len, mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { - auto& input_tensor_a = input_tensors.at(0); - auto& input_tensor_b = input_tensors.at(1); - auto& bias = optional_input_tensors.at(0); - TT_FATAL(seq_len % TILE_HEIGHT == 0, "Falcon mm's seq_len must be a multiple of 32!"); - TT_FATAL(seq_len >= 128, "Falcon mm's seq_len must be greater than 128!"); - TT_FATAL((input_tensor_a.get_legacy_shape() == Shape({1, 1, seq_len, 4544})), "Unsupported input 
shape"); - TT_FATAL((input_tensor_b.get_legacy_shape() == Shape({1, 1, 4544, 65024})), "Unsupported input shape"); - return operation::run_with_autoformat(tt::operations::primary::Matmul{.program_config=tt::operations::primary::MatmulDefaultProgramConfig{}, .bcast_batch=true, .output_mem_config=mem_config, .output_dtype=output_dtype.value_or(input_tensor_a.get_dtype())}, {input_tensor_a, input_tensor_b}, {bias}); - }, - {input_tensor_a, input_tensor_b}, output_tensors, {bias}); - - } else { - operation::launch_op( - [mem_config, output_dtype] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { - auto& input_tensor_a = input_tensors.at(0); - auto& input_tensor_b = input_tensors.at(1); - auto& bias = optional_input_tensors.at(0); - CoreCoord grid_size = get_falcon_matmul_grid_size(input_tensor_a.device()); - std::optional config = std::nullopt; - auto compute_kernel_config = init_device_compute_kernel_config(input_tensor_a.device()->arch(), config, MathFidelity::LoFi, true /* math_approx_mode */, false /* fp32_dest_acc_en */, true /* packer_l1_acc */); - auto program_config = bmm_op_utils::get_mcast_1d_config(input_tensor_a, input_tensor_b, true, std::nullopt, true, mem_config.is_sharded(), grid_size, compute_kernel_config); - return {operations::primary::matmul_1d(input_tensor_a, input_tensor_b, bias, program_config, mem_config, output_dtype, compute_kernel_config)}; - }, - {input_tensor_a, input_tensor_b}, output_tensors, {bias}); - } - return output_tensors.at(0); -} /** * Resnet50 matmul with fused batch diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp index bad8eeaddfd..ad435760bc0 100644 --- a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp +++ b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp @@ -44,15 +44,6 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized(const operation::ProgramWithCallbacks bmm_multi_core_reuse_optimized(const Tensor& input_tensor_a, const Tensor& input_tensor_b, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, tt::tt_metal::DataType output_dtype, DeviceComputeKernelConfig compute_kernel_config, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch, bool untilize_out); -/** - * Falcon matmuls using operations::primary::matmul + program_config - */ -Tensor falcon_fused_qkv_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt); -Tensor falcon_selfout_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt); -Tensor falcon_dense_4h_to_h_matmul(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt, std::optional packer_l1_acc = std::nullopt); -Tensor falcon_dense_h_to_4h_matmul (const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional bias, std::optional fused_activation = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt); -Tensor falcon_lm_head_matmul (const Tensor 
&input_tensor_a, const Tensor &input_tensor_b, std::optional bias, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional output_dtype=std::nullopt); - /** * Resnet matmul for linear */ diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_custom_bmm_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_custom_bmm_ops.cpp index db37c717ccd..e953f11b602 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_custom_bmm_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_custom_bmm_ops.cpp @@ -94,27 +94,6 @@ namespace tt::tt_metal::detail "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" )doc"); - // Custom Falcon matmuls/bmms - m_tensor.def("falcon_fused_qkv_matmul", &falcon_fused_qkv_matmul, - py::arg().noconvert(), py::arg().noconvert(), py::arg("bias").noconvert() = std::nullopt, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, py::arg("output_dtype").noconvert() = std::nullopt, R"doc( - Perform a falcon_fused_qkv non-batched matmul ``A x B`` with two tensors. - )doc"); - m_tensor.def("falcon_selfout_matmul", &falcon_selfout_matmul, - py::arg().noconvert(), py::arg().noconvert(), py::arg("bias").noconvert() = std::nullopt, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, py::arg("output_dtype").noconvert() = std::nullopt, R"doc( - Perform a falcon_selfout non-batched matmul ``A x B`` with two tensors. - )doc"); - m_tensor.def("falcon_dense_4h_to_h_matmul", &falcon_dense_4h_to_h_matmul, - py::arg().noconvert(), py::arg().noconvert(), py::arg("bias").noconvert() = std::nullopt, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, py::arg("output_dtype").noconvert() = std::nullopt, py::arg("packer_l1_acc").noconvert() = std::nullopt, R"doc( - Perform a falcon_dense_4h_to_h non-batched matmul ``A x B`` with two tensors. - )doc"); - m_tensor.def("falcon_dense_h_to_4h_matmul", &falcon_dense_h_to_4h_matmul, - py::arg().noconvert(), py::arg().noconvert(), py::arg("bias").noconvert() = std::nullopt, py::arg("fused_activation") = std::nullopt, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, py::arg("output_dtype").noconvert() = std::nullopt, R"doc( - Perform a falcon_dense_h_to_4h non-batched matmul ``A x B`` with two tensors. This invokes the MULTI_CORE matmul parallelization. This parallelization does not support bias option yet. - )doc"); - m_tensor.def("falcon_lm_head_matmul", &falcon_lm_head_matmul, - py::arg().noconvert(), py::arg().noconvert(), py::arg("bias").noconvert() = std::nullopt, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, py::arg("output_dtype").noconvert() = std::nullopt, R"doc( - Perform a falcon_lm_head non-batched matmul ``A x B`` with two tensors. This invokes the MULTI_CORE matmul parallelization. This parallelization does not support bias option yet. 
- )doc"); // Custom Generic NLP TMs // This op should support arbitrary B and S divisible by 32 on DRAM; on L1, might error out due to space diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index f7d8a76e73e..c08658ffb86 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -190,6 +190,7 @@ def manage_config(name, value): from ttnn.device import ( Device, + Arch, open_device, close_device, enable_program_cache, diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index c926936dc75..7d27f38a1c4 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -16,6 +16,7 @@ def get_device_core_grid(device): # TODO: Device = ttnn._ttnn.Device Device = ttl.device.Device Device.core_grid = property(get_device_core_grid) +Arch = ttl.device.Arch def open_device(device_id: int, l1_small_size: int = ttl.device.DEFAULT_L1_SMALL_SIZE): diff --git a/ttnn/ttnn/operations/matmul.py b/ttnn/ttnn/operations/matmul.py index fdf9e710113..78bf8b40f57 100644 --- a/ttnn/ttnn/operations/matmul.py +++ b/ttnn/ttnn/operations/matmul.py @@ -25,6 +25,7 @@ def matmul( dtype: Optional[ttnn.DataType] = None, core_grid: Optional[ttnn.CoreGrid] = None, program_config: Optional[MatmulProgramConfig] = None, + activation: Optional[str] = None, use_1d_systolic_array: Optional[bool] = None, compute_kernel_config: Optional[ttnn.DeviceComputeKernelConfig] = None, ) -> ttnn.Tensor: