Skip to content

Commit

Permalink
#10107: Fix hangs w/ launch_msg size >32bytes
Browse files Browse the repository at this point in the history
Change the default MMIO TLB ordering from posted to strict (this will have a
negative performance impact).
Split the launch message into kernel_config and go parts.
Write these separately from the host, with an sfence between the two writes.
  • Loading branch information
pgkeller committed Jul 11, 2024
1 parent 41284be commit 83d596e
Show file tree
Hide file tree
Showing 21 changed files with 162 additions and 120 deletions.
3 changes: 2 additions & 1 deletion tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ void kernel_main() {
#else
tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
#endif
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x),
NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false);
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ void MAIN {
#else
tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE);
#endif
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
uint64_t dispatch_addr =
NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x),
NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
}
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty
// We should be able to find the expected watcher error in the log as well,
// expected error message depends on the risc we're running on.
string kernel = "tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp";
int line_num = 55;
int line_num = 57;

string expected = fmt::format(
"Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.",
Expand Down
29 changes: 16 additions & 13 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ inline void deassert_ncrisc_trisc() {
// Below sets ncrisc to go so we can wait until it is cleared on first iteration
mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE;

uint16_t fw_size16 = mailboxes->launch.ncrisc_kernel_size16;
uint16_t fw_size16 = mailboxes->launch.kernel_config.ncrisc_kernel_size16;
ncrisc_kernel_start_offset16 = fw_size16;

// Copies from L1 to IRAM on chips where NCRISC has IRAM
Expand Down Expand Up @@ -354,42 +354,44 @@ int main() {
// Wait for ncrisc to halt
wait_for_ncrisc_to_halt();

mailboxes->launch.run = RUN_MSG_DONE;
mailboxes->launch.go.run = RUN_MSG_DONE;

while (1) {
init_sync_registers();
reset_ncrisc_with_iram();

DEBUG_STATUS("GW");
while (mailboxes->launch.run != RUN_MSG_GO);
while (mailboxes->launch.go.run != RUN_MSG_GO);
DEBUG_STATUS("GD");

{
DeviceZoneScopedMainN("BRISC-FW");

// Copies from L1 to IRAM on chips where NCRISC has IRAM
l1_to_ncrisc_iram_copy(mailboxes->launch.ncrisc_kernel_size16, ncrisc_kernel_start_offset16);
l1_to_ncrisc_iram_copy(mailboxes->launch.kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16);

// Invalidate the i$ now the kernels have loaded and before running
volatile tt_reg_ptr uint32_t* cfg_regs = core.cfg_regs_base(0);
cfg_regs[RISCV_IC_INVALIDATE_InvalidateAll_ADDR32] = RISCV_IC_BRISC_MASK | RISCV_IC_TRISC_ALL_MASK | RISCV_IC_NCRISC_MASK;

enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch.enables;
enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch.kernel_config.enables;
run_triscs(enables);

noc_index = mailboxes->launch.brisc_noc_id;
noc_index = mailboxes->launch.kernel_config.brisc_noc_id;

setup_cb_read_write_interfaces(0, num_cbs_to_early_init, true, true);
finish_ncrisc_copy_and_run(enables);

// Run the BRISC kernel
DEBUG_STATUS("R");
uint32_t kernel_config_base = mailboxes->launch.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM0].crta_offset);
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM0].crta_offset);

if (enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) {
setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.max_cb_index, true, true);
setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true);
kernel_init();
} else {
// This was not initialized in kernel_init
Expand All @@ -399,12 +401,13 @@ int main() {

wait_ncrisc_trisc();

mailboxes->launch.run = RUN_MSG_DONE;
mailboxes->launch.go.run = RUN_MSG_DONE;

// Notify dispatcher core that it has completed
if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
if (mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) {
uint64_t dispatch_addr =
NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x),
NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
noc_fast_atomic_increment(
noc_index,
Expand Down
10 changes: 6 additions & 4 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,14 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) {

while (routing_info->routing_enabled) {
// FD: assume that no more host -> remote writes are pending
if (mailboxes->launch.run == RUN_MSG_GO) {
if (mailboxes->launch.go.run == RUN_MSG_GO) {
DeviceZoneScopedMainN("ERISC-FW");
DEBUG_STATUS("R");
uint32_t kernel_config_base = mailboxes->launch.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset);
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset);

kernel_init();
} else {
Expand Down
8 changes: 5 additions & 3 deletions tt_metal/hw/firmware/src/erisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ void __attribute__((section("erisc_l1_code"))) kernel_launch() {
rtos_context_switch_ptr = (void (*)())RtosTable[0];

kernel_main();
mailboxes->launch.run = RUN_MSG_DONE;
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
if (routing_info->routing_enabled and mailboxes->launch.mode == DISPATCH_MODE_DEV) {
mailboxes->launch.go.run = RUN_MSG_DONE;
uint64_t dispatch_addr =
NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x),
NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
if (routing_info->routing_enabled and mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) {
internal_::notify_dispatch_core_done(dispatch_addr);
}
}
24 changes: 14 additions & 10 deletions tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,15 @@ int main() {
//device_setup();
noc_init();

mailboxes->launch.run = RUN_MSG_DONE;
mailboxes->launch.go.run = RUN_MSG_DONE;

// Clean up the profiler buffer in case we never get the go message
while (1) {

init_sync_registers();
// Wait...
DEBUG_STATUS("GW");
while (mailboxes->launch.run != RUN_MSG_GO)
while (mailboxes->launch.go.run != RUN_MSG_GO)
{
RISC_POST_HEARTBEAT(heartbeat);
};
Expand All @@ -109,7 +109,7 @@ int main() {
{
DeviceZoneScopedMainN("ERISC-IDLE-FW");

noc_index = mailboxes->launch.brisc_noc_id;
noc_index = mailboxes->launch.kernel_config.brisc_noc_id;

//UC FIXME: do i need this?
setup_cb_read_write_interfaces(0, num_cbs_to_early_init, true, true);
Expand All @@ -118,10 +118,12 @@ int main() {
DEBUG_STATUS("R");
//if (mailboxes->launch.enable_brisc) {
//UC FIXME: do i need this?
setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.max_cb_index, true, true);
uint32_t kernel_config_base = mailboxes->launch.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset);
setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true);
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset);

kernel_init();
//} else {
Expand All @@ -130,12 +132,14 @@ int main() {
//}
DEBUG_STATUS("D");

mailboxes->launch.run = RUN_MSG_DONE;
mailboxes->launch.go.run = RUN_MSG_DONE;


// Notify dispatcher core that it has completed
if (mailboxes->launch.mode == DISPATCH_MODE_DEV) {
uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
if (mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) {
uint64_t dispatch_addr =
NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x),
NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR);
DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/);
}
Expand Down
10 changes: 6 additions & 4 deletions tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,13 @@ int main(int argc, char *argv[]) {
notify_brisc_and_wait();
DeviceZoneScopedMainN("NCRISC-FW");

setup_cb_read_write_interfaces(0, mailboxes->launch.max_cb_index, true, true);
setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, true, true);

uint32_t kernel_config_base = mailboxes->launch.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM1].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM1].crta_offset);
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM1].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM1].crta_offset);

DEBUG_STATUS("R");
kernel_init();
Expand Down
10 changes: 6 additions & 4 deletions tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,14 @@ int main(int argc, char *argv[]) {
DeviceZoneScopedMainN("TRISC-FW");

#if !defined(UCK_CHLKC_MATH)
setup_cb_read_write_interfaces(0, mailboxes->launch.max_cb_index, cb_init_read, cb_init_write);
setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, cb_init_read, cb_init_write);
#endif

uint32_t kernel_config_base = mailboxes->launch.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset);
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset);
crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset);

DEBUG_STATUS("R");
kernel_init();
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/inc/debug/assert.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void assert_and_hang(uint32_t line_num) {

// Update launch msg to show that we've exited.
tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
launch_msg->run = RUN_MSG_DONE;
launch_msg->go.run = RUN_MSG_DONE;

// Hang, or in the case of erisc, early exit.
#if defined(COMPILE_FOR_ERISC)
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/inc/debug/sanitize_noc.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ inline void debug_sanitize_post_noc_addr_and_hang(

// Update launch msg to show that we've exited.
tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
launch_msg->run = RUN_MSG_DONE;
launch_msg->go.run = RUN_MSG_DONE;

#if defined(COMPILE_FOR_ERISC)
// For erisc, we can't hang the kernel/fw, because the core doesn't get restarted when a new
Expand Down
15 changes: 12 additions & 3 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct dyn_mem_map_t {
volatile uint16_t crta_offset;
};

struct launch_msg_t { // must be cacheline aligned
struct kernel_config_msg_t {
volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX];
volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units

Expand All @@ -91,7 +91,16 @@ struct launch_msg_t { // must be cacheline aligned
volatile uint8_t dispatch_core_x;
volatile uint8_t dispatch_core_y;
volatile uint8_t exit_erisc_kernel;
volatile uint8_t run; // must be in last cacheline of this msg
volatile uint8_t pad1;
} __attribute__((packed));

// Go/done handshake part of the launch message.
// The firmware shown in this change spins until `run` equals RUN_MSG_GO and
// stores RUN_MSG_DONE into `run` once the launched work has completed.
// `run` is a full 32-bit word and the struct is packed, so it occupies
// exactly sizeof(uint32_t) bytes.
struct go_msg_t {
volatile uint32_t run; // must be in last cacheline of this msg
} __attribute__((packed));

// Launch mailbox message, composed of two parts:
//   kernel_config - the kernel configuration fields read by the firmware
//   go            - the run/done handshake word
// This commit's description states that the host writes the two parts
// separately with an sfence between the writes.
// NOTE(review): presumably kernel_config is written first so that it is
// visible before go.run is set - confirm against the host-side writer.
struct launch_msg_t { // must be cacheline aligned
kernel_config_msg_t kernel_config;
go_msg_t go;
} __attribute__((packed));

struct slave_sync_msg_t {
Expand Down Expand Up @@ -194,7 +203,7 @@ struct mailboxes_t {
struct debug_insert_delays_msg_t debug_insert_delays;
};

static_assert(sizeof(launch_msg_t) % sizeof(uint32_t) == 0);
static_assert(sizeof(kernel_config_msg_t) % sizeof(uint32_t) == 0);

#ifndef TENSIX_FIRMWARE
// Validate assumptions on mailbox layout on host compile
Expand Down
Loading

0 comments on commit 83d596e

Please sign in to comment.