#4463: add packer_l1_acc support for matmul
#0: comment out bert op test
yugaoTT committed Jan 11, 2024
1 parent e0a98a6 commit 02f47ff
Showing 7 changed files with 252 additions and 97 deletions.
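A quick sketch of what the new flag looks like at a call site, before the diffs. This is a minimal sketch, assuming a live device, a program_config, memory configs, and the torch2tt_tensor helper set up as in the test file below; the variable names are illustrative, not part of the commit.

import torch
import tt_lib as ttl

# Assumed to be defined as in the test below:
# device, program_config, interleaved_mem_config_DRAM, output_mem_config, torch2tt_tensor
in0 = torch.randn([1, 1, 2688, 1024]).bfloat16().float()
in1 = torch.randn([1, 1, 1024, 1024]).bfloat16().float()

in0_t = torch2tt_tensor(
    in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
)
in1_t = torch2tt_tensor(
    in1, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
)

# packer_l1_acc asks the packer to accumulate partial results in L1.
# It defaults to False, so existing call sites are unaffected.
output_t = ttl.operations.primary.matmul(
    in0_t,
    in1_t,
    program_config=program_config,
    output_mem_config=output_mem_config,
    math_fidelity=ttl.tensor.MathFidelity.LoFi,
    fp32_dest_acc_en=False,
    packer_l1_acc=True,
)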
30 changes: 11 additions & 19 deletions tests/tt_eager/python_api_testing/unit_testing/test_bert_ops.py
@@ -215,6 +215,7 @@ def test_bert_linear(


@pytest.mark.skipif(is_grayskull(), reason="not tested for GS")
@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"])
@pytest.mark.parametrize("fp32_acc_mode", [True, False], ids=["fp32", "no_fp32"])
@pytest.mark.parametrize(
"fidelity",
@@ -247,7 +248,7 @@ def test_bert_linear(
(True, True, False, 2688, 1024, 1024, None),
(True, False, True, 2688, 1024, 1024, None),
(True, False, False, 2688, 1024, 1024, None),
-# # # in1-L1-ff1
+# # # # in1-L1-ff1
(False, True, True, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)),
(False, True, False, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)),
(False, False, True, 2688, 1024, 4096, (ttl.tensor.FusibleActivation.GELU, True)),
@@ -288,6 +289,7 @@ def test_bert_linear_batch7(
in1_in_dram,
has_bias,
fp32_acc_mode,
+packer_l1_acc,
M,
K,
N,
@@ -298,7 +300,6 @@ def test_bert_linear_batch7(
in1_shape = [1, 1, K, N]
bias_shape = [1, 1, N]
grid_size = (8, 7)
-# grid_size = (2, 2)

in0_block_h = M // grid_size[1] // 32
in0_block_w = K // grid_size[0] // 32
@@ -340,23 +341,12 @@ def test_bert_linear_batch7(
in1 = torch.randn(in1_shape).bfloat16().float()
bias = torch.randn(bias_shape).bfloat16().float()

-if in0_sharded:
-    in0_t = torch2tt_tensor(
-        in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
-    )
-else:
-    in0_t = torch2tt_tensor(
-        in0, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
-    )
-
-if in1_in_dram:
-    in1_t = torch2tt_tensor(
-        in1, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
-    )
-else:
-    in1_t = torch2tt_tensor(
-        in1, device, tt_memory_config=interleaved_mem_config_L1, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
-    )
+in0_t = torch2tt_tensor(
+    in0, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
+)
+in1_t = torch2tt_tensor(
+    in1, device, tt_memory_config=interleaved_mem_config_DRAM, tt_dtype=ttl.tensor.DataType.BFLOAT8_B
+)

output_mem_config = sharded_mem_config if out_sharded else interleaved_mem_config_L1
bias_t = pad_by_zero(
@@ -392,6 +382,7 @@ def test_bert_linear_batch7(
output_mem_config=output_mem_config,
math_fidelity=fidelity,
fp32_dest_acc_en=fp32_acc_mode,
+packer_l1_acc=packer_l1_acc,
)
else:
output_t = ttl.operations.primary.matmul(
@@ -401,6 +392,7 @@ def test_bert_linear_batch7(
output_mem_config=output_mem_config,
math_fidelity=fidelity,
fp32_dest_acc_en=fp32_acc_mode,
+packer_l1_acc=packer_l1_acc,
)

if out_sharded:
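Net effect of the test changes: both inputs are now staged as DRAM-interleaved BFLOAT8_B tensors unconditionally, instead of choosing DRAM or L1 from the in0_sharded / in1_in_dram flags, and packer_l1_acc is swept over both values alongside fp32_acc_mode and fidelity, then forwarded to both matmul calls.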
21 changes: 11 additions & 10 deletions tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -366,7 +366,7 @@ operation::ProgramWithCallbacks Matmul::create_program(const std::vector<Tensor>
input_tensor_a, input_tensor_b, std::nullopt, output_tensor,
this->bcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, false, true,
+MathFidelity::HiFi4, false, true, false,
2, 4, 2,
16, 16, false, false, std::nullopt
);
@@ -376,7 +376,7 @@ operation::ProgramWithCallbacks Matmul::create_program(const std::vector<Tensor>
input_tensor_a, input_tensor_b, std::nullopt, output_tensor,
this->bcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, false, true,
+MathFidelity::HiFi4, false, true, false,
config.in0_block_w, config.out_subblock_h, config.out_subblock_w,
config.per_core_M, config.per_core_N, false, std::nullopt, true
);
@@ -386,7 +386,7 @@ operation::ProgramWithCallbacks Matmul::create_program(const std::vector<Tensor>
input_tensor_a, input_tensor_b, std::nullopt, output_tensor,
this->bcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, false, true,
+MathFidelity::HiFi4, false, true, false,
config.in0_block_w, config.out_subblock_h, config.out_subblock_w,
config.per_core_M, config.per_core_N, false, std::nullopt, false
);
@@ -886,6 +886,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
MathFidelity math_fidelity = this->math_fidelity;
bool fp32_dest_acc_en = this->fp32_dest_acc_en;
bool math_approx_mode = this->math_approx_mode;
+bool packer_l1_acc = this->packer_l1_acc;

bool fuse_batch = true;
bool broadcast_batch = input_tensor_b.shape()[0] * input_tensor_b.shape()[1] == 1;
@@ -905,7 +906,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
input_tensor_a, input_tensor_b, bias, output_tensor,
broadcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, fp32_dest_acc_en, math_approx_mode,
+math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc,
2, 4, 2,
16, 16, false, false, std::nullopt
);
@@ -914,7 +915,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
input_tensor_a, input_tensor_b, std::nullopt, output_tensor,
broadcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, fp32_dest_acc_en, math_approx_mode,
+math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc,
2, 4, 2,
16, 16, false, std::nullopt, true
);
@@ -923,7 +924,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
input_tensor_a, input_tensor_b, std::nullopt, output_tensor,
broadcast_batch,
input_tensor_a.device()->compute_with_storage_grid_size(),
-MathFidelity::HiFi4, fp32_dest_acc_en, math_approx_mode,
+math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc,
2, 4, 2,
16, 16, false, std::nullopt, false
);
@@ -953,7 +954,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
input_tensor_a, input_tensor_b, bias, output_tensor,
broadcast_batch,
program_config.compute_with_storage_grid_size,
-math_fidelity, fp32_dest_acc_en, math_approx_mode,
+math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc,
program_config.in0_block_w, program_config.out_subblock_h, program_config.out_subblock_w,
program_config.per_core_M, program_config.per_core_N, fuse_batch, program_config.transpose_mcast, program_config.fused_activation
);
@@ -963,7 +964,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
input_tensor_a, input_tensor_b, bias, output_tensor,
broadcast_batch,
program_config.compute_with_storage_grid_size,
-math_fidelity, fp32_dest_acc_en, math_approx_mode,
+math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc,
program_config.in0_block_w, program_config.out_subblock_h, program_config.out_subblock_w,
program_config.per_core_M, program_config.per_core_N, program_config.fuse_batch, program_config.fused_activation,
program_config.mcast_in0
@@ -1007,11 +1008,11 @@ MatmulParallelizationStrategy Matmul::get_parallelization_strategy(const std::ve
);
}

-Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional<const Tensor> bias, std::optional<MatmulMultiCoreReuseMultiCast1DProgramConfig> program_config, const MemoryConfig& mem_config, std::optional<const DataType> output_dtype, const MathFidelity math_fidelity, const bool fp32_dest_acc_en, const bool math_approx_mode) {
+Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional<const Tensor> bias, std::optional<MatmulMultiCoreReuseMultiCast1DProgramConfig> program_config, const MemoryConfig& mem_config, std::optional<const DataType> output_dtype, const MathFidelity math_fidelity, const bool fp32_dest_acc_en, const bool math_approx_mode, const bool packer_l1_acc) {
if (!program_config.has_value()) {
program_config = bmm_op_utils::get_mcast_1d_config(input_tensor_a, input_tensor_b);
}
-return operations::primary::matmul(input_tensor_a, input_tensor_b, bias, program_config.value(), mem_config, output_dtype, math_fidelity, fp32_dest_acc_en, math_approx_mode);
+return operations::primary::matmul(input_tensor_a, input_tensor_b, bias, program_config.value(), mem_config, output_dtype, math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc);
}

} // namespace primary
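Two things to note in this file: the legacy Matmul::create_program paths (the -366/-376/-386 hunks) pass a literal false for the new packer_l1_acc argument, keeping L1 accumulation off there, while the hunks from -886 onward forward the stored this->packer_l1_acc and also stop hardcoding MathFidelity::HiFi4 in favor of the stored math_fidelity.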
17 changes: 10 additions & 7 deletions tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
@@ -75,8 +75,8 @@ inline Tensor bmm (const Tensor &input_tensor_a, const Tensor &input_tensor_b
return operation::run_with_autoformat(Matmul{.bcast_batch=false, .output_mem_config=mem_config, .output_dtype=input_tensor_a.dtype()}, {input_tensor_a, input_tensor_b}, {std::nullopt}).at(0);
}

-operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const std::optional<const Tensor> bias, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, MathFidelity math_fidelity, bool fp32_dest_acc_en, bool math_approx_mode, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch, std::optional<UnaryWithParam> fused_activation, bool mcast_in0);
-operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const std::optional<const Tensor> bias, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, MathFidelity math_fidelity, bool fp32_dest_acc_en, bool math_approx_mode, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch, bool transpose_mcast, std::optional<UnaryWithParam> fused_activation);
+operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const std::optional<const Tensor> bias, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, MathFidelity math_fidelity, bool fp32_dest_acc_en, bool math_approx_mode, bool packer_l1_acc, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch, std::optional<UnaryWithParam> fused_activation, bool mcast_in0);
+operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized(const Tensor &input_tensor_a, const Tensor &input_tensor_b, const std::optional<const Tensor> bias, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, MathFidelity math_fidelity, bool fp32_dest_acc_en, bool math_approx_mode, bool packer_l1_acc, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch, bool transpose_mcast, std::optional<UnaryWithParam> fused_activation);
operation::ProgramWithCallbacks bmm_multi_core_reuse_optimized(const Tensor& input_tensor_a, const Tensor& input_tensor_b, Tensor &output_tensor, bool bcast_batch, CoreCoord compute_with_storage_grid_size, tt::tt_metal::DataType output_dtype, MathFidelity math_fidelity, bool fp32_dest_acc_en, bool math_approx_mode, uint32_t in0_block_w, uint32_t out_subblock_h, uint32_t out_subblock_w, uint32_t per_core_M, uint32_t per_core_N, bool fuse_batch);


@@ -291,6 +291,7 @@ struct Matmul {
const MathFidelity math_fidelity;
const bool fp32_dest_acc_en;
const bool math_approx_mode;
+const bool packer_l1_acc;

void validate(const std::vector<Tensor>& input_tensors, const std::vector<std::optional<const Tensor>>& optional_input_tensors) const;
std::vector<Shape> compute_output_shapes(const std::vector<Tensor>& input_tensors) const;
@@ -322,9 +323,10 @@ inline Tensor matmul(
std::optional<const DataType> output_dtype=std::nullopt,
const MathFidelity math_fidelity = MathFidelity::LoFi,
const bool fp32_dest_acc_en = false,
-const bool math_approx_mode = true
+const bool math_approx_mode = true,
+const bool packer_l1_acc = false
) {
-return operation::run(Matmul{program_config, mem_config, output_dtype.value_or(input_tensor_a.dtype()), math_fidelity, fp32_dest_acc_en, math_approx_mode}, {input_tensor_a, input_tensor_b}, {std::nullopt}).at(0);
+return operation::run(Matmul{program_config, mem_config, output_dtype.value_or(input_tensor_a.dtype()), math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc}, {input_tensor_a, input_tensor_b}, {std::nullopt}).at(0);
}

inline Tensor matmul(
@@ -335,12 +337,13 @@ inline Tensor matmul(
std::optional<const DataType> output_dtype=std::nullopt,
const MathFidelity math_fidelity = MathFidelity::LoFi,
const bool fp32_dest_acc_en = false,
-const bool math_approx_mode = true
+const bool math_approx_mode = true,
+const bool packer_l1_acc = false
) {
-return operation::run(Matmul{program_config, mem_config, output_dtype.value_or(input_tensor_a.dtype()), math_fidelity, fp32_dest_acc_en, math_approx_mode}, {input_tensor_a, input_tensor_b}, {bias}).at(0);
+return operation::run(Matmul{program_config, mem_config, output_dtype.value_or(input_tensor_a.dtype()), math_fidelity, fp32_dest_acc_en, math_approx_mode, packer_l1_acc}, {input_tensor_a, input_tensor_b}, {bias}).at(0);
}

-Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional<const Tensor> bias, std::optional<MatmulMultiCoreReuseMultiCast1DProgramConfig> program_config = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional<const DataType> output_dtype=std::nullopt, const MathFidelity math_fidelity = MathFidelity::LoFi, const bool fp32_dest_acc_en = false, const bool math_approx_mode = true);
+Tensor matmul_1d(const Tensor &input_tensor_a, const Tensor &input_tensor_b, std::optional<const Tensor> bias, std::optional<MatmulMultiCoreReuseMultiCast1DProgramConfig> program_config = std::nullopt, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional<const DataType> output_dtype=std::nullopt, const MathFidelity math_fidelity = MathFidelity::LoFi, const bool fp32_dest_acc_en = false, const bool math_approx_mode = true, const bool packer_l1_acc = false);

} // namespace primary

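Because packer_l1_acc is appended with a default of false in both inline matmul overloads and in matmul_1d, existing callers compile and behave exactly as before. A hedged Python-side illustration of that equivalence, reusing the assumed names from the sketch above:

# Omitting the new argument is the same as passing packer_l1_acc=False,
# which matches the behavior prior to this commit.
out_default = ttl.operations.primary.matmul(
    in0_t,
    in1_t,
    program_config=program_config,
    output_mem_config=output_mem_config,
    math_fidelity=ttl.tensor.MathFidelity.LoFi,
    fp32_dest_acc_en=False,
)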
(4 more changed files not shown)
