From 87bd9a213d70d367d86adc26164b025893fbfaa1 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Tue, 12 Nov 2024 19:58:44 +0000 Subject: [PATCH] #0: Preload kernel before starting --- tt_metal/hostdevcommon/dprint_common.h | 3 ++- tt_metal/hw/firmware/src/brisc.cc | 3 ++- tt_metal/hw/firmware/src/brisck.cc | 5 +++++ tt_metal/hw/firmware/src/ncrisck.cc | 15 +++++++++++---- tt_metal/hw/firmware/src/trisck.cc | 5 +++++ tt_metal/hw/inc/dev_msgs.h | 5 +++++ tt_metal/hw/inc/grayskull/noc_nonblocking_api.h | 1 + tt_metal/impl/dispatch/command_queue.cpp | 5 +++++ 8 files changed, 36 insertions(+), 6 deletions(-) diff --git a/tt_metal/hostdevcommon/dprint_common.h b/tt_metal/hostdevcommon/dprint_common.h index d555cf6c254..b9134778bc9 100644 --- a/tt_metal/hostdevcommon/dprint_common.h +++ b/tt_metal/hostdevcommon/dprint_common.h @@ -18,7 +18,8 @@ #if !defined(KERNEL_BUILD) && !defined(FW_BUILD) // SW #include "common/tt_backend_api_types.hpp" typedef tt::DataFormat CommonDataFormat; -#else // HW already includes tensix_types.h +#else +#include "tensix_types.h" typedef DataFormat CommonDataFormat; #endif diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index c019e5a4764..74dc1bc1f72 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -369,7 +369,8 @@ int main() { WAYPOINT("GW"); uint8_t go_message_signal = RUN_MSG_DONE; - while ((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) { + while (((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) && + !(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables & DISPATCH_ENABLE_FLAG_PRELOAD)) { // While the go signal for kernel execution is not sent, check if the worker was signalled // to reset its launch message read pointer. 
if (go_message_signal == RUN_MSG_RESET_READ_PTR) { diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc index f9f04eec011..75e93e9be42 100644 --- a/tt_metal/hw/firmware/src/brisck.cc +++ b/tt_metal/hw/firmware/src/brisck.cc @@ -22,6 +22,11 @@ extern uint32_t __kernel_init_local_l1_base[]; extern uint32_t __fw_export_end_text[]; void kernel_launch(uint32_t kernel_base_addr) { + uint8_t go_message_signal; + tt_l1_ptr mailboxes_t *const mailboxes = (tt_l1_ptr mailboxes_t *)(MEM_MAILBOX_BASE); + + while ((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) { + } #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) #ifdef KERNEL_RUN_TIME diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc index 6f24d5b107b..b32309bf3af 100644 --- a/tt_metal/hw/firmware/src/ncrisck.cc +++ b/tt_metal/hw/firmware/src/ncrisck.cc @@ -2,13 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "risc_common.h" -#include "tensix.h" -#include "tensix_types.h" +#include + #include "noc.h" -#include "noc_overlay_parameters.h" #include "noc_nonblocking_api.h" +#include "noc_overlay_parameters.h" +#include "risc_attribs.h" +#include "risc_common.h" #include "stream_io_map.h" +#include "tensix.h" +#include "tensix_types.h" #ifdef PERF_DUMP #include "risc_perf.h" #endif @@ -30,7 +33,11 @@ extern uint32_t __kernel_init_local_l1_base[]; extern uint32_t __fw_export_end_text[]; void kernel_launch(uint32_t kernel_base_addr) { + uint8_t go_message_signal; + tt_l1_ptr mailboxes_t *const mailboxes = (tt_l1_ptr mailboxes_t *)(MEM_MAILBOX_BASE); + while ((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) { + } DeviceZoneScopedMainChildN("NCRISC-KERNEL"); #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) #ifdef KERNEL_RUN_TIME diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc index 862c2964808..8b6875bf639 100644 --- a/tt_metal/hw/firmware/src/trisck.cc +++ 
b/tt_metal/hw/firmware/src/trisck.cc @@ -38,6 +38,11 @@ extern uint32_t __fw_export_end_text[]; void kernel_launch(uint32_t kernel_base_addr) { + uint8_t go_message_signal; + tt_l1_ptr mailboxes_t *const mailboxes = (tt_l1_ptr mailboxes_t *)(MEM_MAILBOX_BASE); + + while ((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) { + } DeviceZoneScopedMainChildN("TRISC-KERNEL"); #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) #ifdef KERNEL_RUN_TIME diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 4c3f883b61b..0904ff88ed9 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -89,6 +89,10 @@ struct rta_offset_t { volatile uint16_t crta_offset; }; +enum dispatchenable_flags { + DISPATCH_ENABLE_FLAG_PRELOAD = 1 << 7, +}; + struct kernel_config_msg_t { volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units @@ -107,6 +111,7 @@ struct kernel_config_msg_t { volatile uint8_t brisc_noc_mode; volatile uint8_t max_cb_index; volatile uint8_t exit_erisc_kernel; + // Bitwise OR of dispatchenable_flags and dispatch_core_processor_masks. 
volatile uint8_t enables; volatile uint8_t pad2[12]; } __attribute__((packed)); diff --git a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h index 8fe58acf1a6..84368ac96f1 100644 --- a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h @@ -8,6 +8,7 @@ #include "noc_parameters.h" #include "dev_msgs.h" +#include "risc_attribs.h" //// diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index c23ea335737..5120d8d664f 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1020,6 +1020,7 @@ void EnqueueProgramCommand::assemble_device_commands( uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + kernel_group.launch_msg.kernel_config.enables |= DISPATCH_ENABLE_FLAG_PRELOAD; for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; } @@ -1052,6 +1053,7 @@ void EnqueueProgramCommand::assemble_device_commands( if (programmable_core_index != -1) { for (KernelGroup& kernel_group : program.get_kernel_groups(programmable_core_index)) { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + kernel_group.launch_msg.kernel_config.enables |= DISPATCH_ENABLE_FLAG_PRELOAD; for (uint32_t i = 0; i < kernel_config_addrs.size(); i++) { kernel_group.launch_msg.kernel_config.kernel_config_base[i] = kernel_config_addrs[i].addr; } @@ -1084,6 +1086,8 @@ void EnqueueProgramCommand::assemble_device_commands( // if not, check if the program is active on workers. 
If active, have dispatch_d issue a write barrier cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + // Wait for everything to be written to before launch. + cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; @@ -1201,6 +1205,7 @@ void EnqueueProgramCommand::assemble_device_commands( kernel_bins_prefetch_subcmds[i].size()); } + device_command_sequence.add_dispatch_wait(/*barrier*/ true, 0, 0, false, false, /*do_wait */ false); // Go Signals program_command_sequence.go_signals.reserve( multicast_go_signal_sub_cmds.size() + unicast_go_signal_sub_cmds.size());