-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#4681: Add new group_attn_matmul (uplift + optimizations of attn_matmul)
- Same as attn_matmul, but kv_heads can be greater than 1
- kv_heads is multicast by 32 cores to all q_head cores
- Supports interleaved and height-sharded (row or col) layouts for any mix of in0, in1, or output
- This op is fully dynamic across input shapes, similar to eltwise_binary
- Adds unit tests for group_attn_matmul and program caching
- Loading branch information
1 parent
16d4df0
commit 8a5cc63
Showing
9 changed files
with
1,280 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 46 additions & 0 deletions
46
tt_eager/tt_dnn/kernels/dataflow/writer_transformer_group_attn_matmul.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include "dataflow_api.h" | ||
|
||
// Writer kernel: takes output tiles from circular buffer cb_id_out and writes them to the destination buffer. | ||
// | ||
// Runtime args: | ||
//   0: has_work  - the kernel returns immediately when this is 0 | ||
//   1: dst_addr  - used as bank_base_address of the destination address generator | ||
//   2: num_tiles - number of tiles to wait for / write | ||
//   3: start_id  - id of the first tile that is written | ||
// Compile-time args: | ||
//   0: cb_id_out   - circular buffer the tiles are read from | ||
//   1: dst_is_dram - true when the arg equals 1; template argument of InterleavedAddrGenFast | ||
// Build-time defines: | ||
//   OUT_SHARDED - only wait for num_tiles tiles on cb_id_out; no NoC writes are issued | ||
//   BACKWARDS   - walk tile ids downwards from start_id instead of upwards | ||
void kernel_main() { | ||
uint32_t has_work = get_arg_val<uint32_t>(0); | ||
// Nothing to do on this core: leave before reading the remaining args. | ||
if (has_work == 0) return; | ||
uint32_t dst_addr = get_arg_val<uint32_t>(1); | ||
uint32_t num_tiles = get_arg_val<uint32_t>(2); | ||
uint32_t start_id = get_arg_val<uint32_t>(3); | ||
|
||
constexpr uint32_t cb_id_out = get_compile_time_arg_val(0); | ||
constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; | ||
|
||
#ifdef OUT_SHARDED | ||
// Sharded output path: no data is moved here, the kernel only waits until num_tiles tiles are present in cb_id_out. | ||
// NOTE(review): presumably the CB is backed by the output shard itself - confirm against the host-side program setup. | ||
cb_wait_front(cb_id_out, num_tiles); | ||
#else | ||
|
||
// single-tile ublocks | ||
constexpr uint32_t onetile = 1; | ||
const uint32_t tile_bytes = get_tile_size(cb_id_out); | ||
const DataFormat data_format = get_dataformat(cb_id_out); | ||
|
||
// Address generator for the destination; page size and data format are taken from cb_id_out. | ||
const InterleavedAddrGenFast<dst_is_dram> s = { | ||
.bank_base_address = dst_addr, | ||
.page_size = tile_bytes, | ||
.data_format = data_format | ||
}; | ||
|
||
#ifdef BACKWARDS | ||
// Descending ids: start_id, start_id - 1, ... for num_tiles iterations. | ||
// The != comparison keeps the loop correct when end_id wraps around below zero (unsigned arithmetic). | ||
uint32_t end_id = start_id - num_tiles; | ||
for (uint32_t i = start_id; i != end_id; -- i) { | ||
#else | ||
// Ascending ids: start_id, start_id + 1, ..., start_id + num_tiles - 1. | ||
uint32_t end_id = start_id + num_tiles; | ||
for (uint32_t i = start_id; i < end_id; ++ i) { | ||
#endif | ||
// One tile per iteration: wait for it, write it to tile id i, then free its CB slot. | ||
cb_wait_front(cb_id_out, onetile); | ||
uint32_t l1_read_addr = get_read_ptr(cb_id_out); | ||
noc_async_write_tile(i, s, l1_read_addr); | ||
// NOTE(review): presumably blocks until the write has completed so the slot can be released safely - confirm. | ||
noc_async_write_barrier(); | ||
cb_pop_front(cb_id_out, onetile); | ||
} | ||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
tt_eager/tt_dnn/op_library/transformer_tms/kernels/compute/transformer_group_attn_matmul.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <cstdint> | ||
#include "compute_kernel_api/tile_move_copy.h" | ||
#include "compute_kernel_api/matmul.h" | ||
#include "compute_kernel_api/tilize.h" | ||
#include "compute_kernel_api/untilize.h" | ||
|
||
using std::uint32_t; | ||
|
||
// matmul C=A*B using dims MK*KN = MN (row major order) | ||
// | ||
// Compute kernel for group_attn_matmul. | ||
// | ||
// Compile-time args: | ||
//   0: transpose_hw - forwarded to mm_init, matmul_tiles and mm_init_short | ||
// Runtime args: | ||
//   0: has_work - the kernel returns immediately when this is 0 | ||
//   1: batch, 2: Mt, 3: Kt, 4: Nt - loop bounds (presumably tile counts of the batch/M/K/N dims - TODO confirm) | ||
// | ||
// Data flow per (batch, mt_C): Kt tiles of cb_in0 are held at the front of the CB while Nt output tiles are produced. | ||
// Per output tile: num_rows_in_one_tile x Kt tiles are consumed from cb_in1, num_rows_in_one_tile untilized tiles | ||
// are pushed to cb_intermed1, and afterwards one tile from cb_intermed2 is tilized into out_cb_id. | ||
namespace NAMESPACE { | ||
void MAIN { | ||
|
||
constexpr uint32_t onetile = 1; | ||
|
||
constexpr uint32_t transpose_hw = get_compile_time_arg_val(0); | ||
|
||
uint32_t has_work = get_arg_val<uint32_t>(0); | ||
// Nothing to do on this core: leave before reading the remaining args. | ||
if (has_work == 0) return; | ||
uint32_t batch = get_arg_val<uint32_t>(1); | ||
uint32_t Mt = get_arg_val<uint32_t>(2); | ||
uint32_t Kt = get_arg_val<uint32_t>(3); | ||
uint32_t Nt = get_arg_val<uint32_t>(4); | ||
|
||
// Circular buffer indices used by this kernel (hard-coded, must match the host-side program setup). | ||
constexpr uint32_t cb_in0 = 0; | ||
constexpr uint32_t cb_in1 = 1; | ||
constexpr uint32_t cb_intermed0 = 24; | ||
constexpr uint32_t cb_intermed1 = 25; | ||
constexpr uint32_t cb_intermed2 = 26; | ||
constexpr uint32_t out_cb_id = 16; | ||
|
||
// Number of matmul + untilize passes performed per output tile. | ||
// NOTE(review): presumably one pass per row of a 32-row tile - confirm against the reader kernel. | ||
constexpr uint32_t num_rows_in_one_tile = 32; | ||
|
||
mm_init(cb_in0, cb_in1, cb_intermed0, transpose_hw); | ||
|
||
for (uint32_t nb = 0; nb < batch; ++nb) { | ||
for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) { // output tile of C | ||
// The Kt in0 tiles stay at the front of cb_in0 for every nt_C iteration; they are popped after the Nt loop. | ||
cb_wait_front(cb_in0, Kt); | ||
for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) { // output tile index of C | ||
for (uint32_t tile_row_id = 0; tile_row_id < num_rows_in_one_tile; ++tile_row_id) { | ||
// Accumulate over K: in0 tile kt is combined with the tile currently at the front of cb_in1 (index 0), | ||
// always targeting destination index 0. Each step consumes one fresh cb_in1 tile. | ||
tile_regs_acquire(); | ||
for (uint32_t kt = 0; kt < Kt; ++kt) { | ||
cb_wait_front(cb_in1, onetile); | ||
|
||
matmul_tiles(cb_in0, cb_in1, kt, 0, 0, transpose_hw); | ||
|
||
cb_pop_front(cb_in1, onetile); | ||
} | ||
tile_regs_commit(); | ||
|
||
// Pack the accumulated tile (destination index 0) into cb_intermed0. | ||
cb_reserve_back(cb_intermed0, onetile); | ||
tile_regs_wait(); | ||
pack_tile(0, cb_intermed0); | ||
tile_regs_release(); | ||
cb_push_back(cb_intermed0, onetile); | ||
|
||
// untilize tile and write to CB::c_intermed1 | ||
unpack_reconfig_data_format_srca(cb_in1, cb_intermed0); | ||
cb_wait_front(cb_intermed0, onetile); | ||
untilize_init_short(cb_intermed0); | ||
cb_reserve_back(cb_intermed1, 1); | ||
untilize_block(cb_intermed0, 1, cb_intermed1); | ||
cb_push_back(cb_intermed1, 1); | ||
|
||
cb_pop_front(cb_intermed0, 1); | ||
untilize_uninit(cb_intermed0); | ||
|
||
// Switch the unpacker back from cb_intermed0 to cb_in1 and re-initialise matmul for the next pass. | ||
unpack_reconfig_data_format_srca(cb_intermed0, cb_in1); | ||
mm_init_short(transpose_hw); | ||
} | ||
|
||
// cb_intermed2 comes from reader; untilized row-major tile | ||
unpack_reconfig_data_format_srca(cb_in1, cb_intermed2); | ||
pack_reconfig_data_format(cb_intermed1, out_cb_id); | ||
cb_wait_front(cb_intermed2, 1); | ||
cb_reserve_back(out_cb_id, onetile); | ||
|
||
// tilize CB::intermed2 and write to CB::c_out0 | ||
tilize_init_short(cb_intermed2, 1); | ||
tilize_block(cb_intermed2, 1, out_cb_id); | ||
cb_push_back(out_cb_id, 1); | ||
|
||
cb_pop_front(cb_intermed2, 1); | ||
tilize_uninit(); | ||
|
||
// Hangs when in0 is BFLOAT8_B if we don't force the reconfig | ||
unpack_reconfig_data_format_srca(cb_in1); | ||
// Restore the packer format for cb_intermed0 and re-initialise matmul before the next output tile. | ||
pack_reconfig_data_format(out_cb_id, cb_intermed0); | ||
mm_init_short(transpose_hw); | ||
} // Nt | ||
|
||
// All Nt output tiles for this in0 row are done; release the Kt in0 tiles. | ||
cb_pop_front(cb_in0, Kt); | ||
} // Mt | ||
} // batch | ||
|
||
} | ||
} // NAMESPACE |
Oops, something went wrong.