From 59ffa8d234aa46e1c840ac2bfa4602565b54930b Mon Sep 17 00:00:00 2001 From: Mo Memarian Date: Thu, 9 May 2024 20:18:46 +0000 Subject: [PATCH] #8223: No hangs on dispatch cores when profiling them --- .../perf_microbenchmark/dispatch/common.h | 1 + tt_metal/hw/firmware/src/brisc.cc | 9 ++ tt_metal/hw/firmware/src/erisc.cc | 2 + tt_metal/hw/firmware/src/idle_erisc.cc | 2 + tt_metal/hw/firmware/src/ncrisc.cc | 2 + tt_metal/hw/inc/debug/dprint.h | 2 +- tt_metal/impl/debug/dprint_server.cpp | 10 +- tt_metal/impl/device/device.cpp | 8 ++ .../impl/dispatch/kernels/cq_dispatch.cpp | 56 +++++---- .../impl/dispatch/kernels/cq_prefetch.cpp | 23 ++-- tt_metal/jit_build/genfiles.cpp | 10 +- tt_metal/llrt/llrt.cpp | 3 + tt_metal/llrt/rtoptions.cpp | 2 +- .../test_custom_cycle_count.cpp | 4 +- tt_metal/tools/profiler/kernel_profiler.hpp | 118 ++++++++++++------ tt_metal/tools/profiler/profiler.cpp | 11 +- 16 files changed, 177 insertions(+), 86 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 02a362eb035..8c59fe7aa24 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -458,6 +458,7 @@ void configure_kernel_variant( CoreCoord phys_downstream_core) { std::map defines = { + {"DISPATCH_KERNEL", "1"}, {"MY_NOC_X", std::to_string(phys_my_core.x)}, {"MY_NOC_Y", std::to_string(phys_my_core.y)}, {"UPSTREAM_NOC_X", std::to_string(phys_upstream_core.x)}, diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 4695b1fee8d..1582bd15a93 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -67,6 +67,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } void enable_power_management() { @@ -342,7 +344,14 @@ int main() { } DEBUG_STATUS("D"); + int i = my_x[0]; + int j = my_y[0]; + DPRINT << i << "NC WAIT" << j << ENDL(); wait_ncrisc_trisc(); + DPRINT << i << "NC DONE" << j << ENDL(); + DPRINT << i << "NC DONE" << j << ENDL(); + DPRINT << i << "NC DONE" << j << ENDL(); + DPRINT << i << "NC DONE" << j << ENDL(); mailboxes->launch.run = RUN_MSG_DONE; diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index ab722594826..09d13944ffa 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -29,6 +29,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } uint8_t noc_index = 0; // TODO: remove hardcoding diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index e8fc6889016..0b2cb9bd6a9 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -53,6 +53,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } //inline void RISC_POST_STATUS(uint32_t status) { diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 1193c57a736..b91359aab12 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -36,6 +36,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } extern "C" void ncrisc_resume(void); diff --git a/tt_metal/hw/inc/debug/dprint.h b/tt_metal/hw/inc/debug/dprint.h index b30043de6e2..cfe16836d18 100644 --- a/tt_metal/hw/inc/debug/dprint.h +++ b/tt_metal/hw/inc/debug/dprint.h @@ -261,7 +261,7 @@ void debug_print(DebugPrinter &dp, DebugPrintData data) { template __attribute__((__noinline__)) DebugPrinter operator <<(DebugPrinter dp, T val) { -#if defined(DEBUG_PRINT_ENABLED) && !defined(PROFILE_KERNEL) +#if defined(DEBUG_PRINT_ENABLED) DebugPrintData data{ .sz = DebugPrintTypeToSize(val), // includes terminating 0 for char* .data_ptr = DebugPrintTypeAddr(&val), diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index dea20343f66..3867fdac9b7 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -955,10 +955,10 @@ void DprintServerAttach(Device* device) { // Skip if DPRINT not enabled, and make sure profiler is not running. if (!tt::llrt::OptionsG.get_dprint_enabled()) return; - TT_FATAL( - DebugPrintServerContext::ProfilerIsRunning == false, - "Device side profiler is running, cannot start print server" - ); + //TT_FATAL( + //DebugPrintServerContext::ProfilerIsRunning == false, + //"Device side profiler is running, cannot start print server" + //); // If no server is running, create one if (!DprintServerIsRunning()) @@ -981,7 +981,7 @@ void DprintServerDetach(Device* device) { } void DprintServerSetProfilerState(bool profile_device) { - DebugPrintServerContext::ProfilerIsRunning = profile_device; + //DebugPrintServerContext::ProfilerIsRunning = profile_device; } bool DprintServerIsRunning() { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 9788155b973..3963c830d58 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -398,6 +398,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(0)}, {"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)}, {"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)}, + {"PROFILE_KERNEL", "1"}, }; std::vector prefetch_compile_args = { @@ -456,6 +457,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(prefetch_physical_core.y)}, {"DOWNSTREAM_NOC_X", std::to_string(0)}, {"DOWNSTREAM_NOC_Y", std::to_string(0)}, + {"PROFILE_KERNEL", "1"}, }; std::vector dispatch_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, @@ -583,6 +585,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(0)}, {"DOWNSTREAM_NOC_X", std::to_string(mux_physical_core.x)}, {"DOWNSTREAM_NOC_Y", std::to_string(mux_physical_core.y)}, + {"PROFILE_KERNEL", "1"}, }; std::vector prefetch_compile_args = { @@ -865,6 +868,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(demux_physical_core.y)}, {"DOWNSTREAM_NOC_X", std::to_string(0xffffffff)}, {"DOWNSTREAM_NOC_Y", std::to_string(0xffffffff)}, + {"PROFILE_KERNEL", "1"}, }; log_debug(LogDevice, "run dispatch_h at {}", dispatch_location.str()); @@ -1106,6 +1110,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(demux_d_physical_core.y)}, {"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)}, {"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)}, + {"PROFILE_KERNEL", "1"}, }; std::vector prefetch_d_compile_args = { @@ -1169,6 +1174,7 @@ void Device::compile_command_queue_programs() { {"UPSTREAM_NOC_Y", std::to_string(prefetch_d_physical_core.y)}, {"DOWNSTREAM_NOC_X", std::to_string(mux_d_physical_core.x)}, {"DOWNSTREAM_NOC_Y", std::to_string(mux_d_physical_core.y)}, + {"PROFILE_KERNEL", "1"}, }; std::vector dispatch_d_compile_args = { dispatch_constants::DISPATCH_BUFFER_BASE, @@ -1431,6 +1437,8 @@ bool Device::initialize(size_t l1_small_size, const std::vector &l1_ba log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache.is_enabled() ? "": "NOT "); this->initialize_cluster(); this->initialize_allocator(l1_small_size, l1_bank_remap); + if (minimal) return true; + log_info(tt::LogMetal, "Initializing device {}. Program cache is NOT enabled", this->id_); this->initialize_build(); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); tt::tt_metal::device_pool::devices.resize(num_devices, nullptr); diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index fd9acf99005..3f4fe9797c6 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -74,6 +74,10 @@ static uint32_t cmd_ptr; // walks through pages in cb cmd by cmd static uint32_t downstream_cb_data_ptr = downstream_cb_base; +namespace kernel_profiler { + uint32_t nocWriteSize __attribute__((used)); +} + FORCE_INLINE volatile uint32_t* get_cq_completion_read_ptr() { return reinterpret_cast(CQ_COMPLETION_READ_PTR); } @@ -849,33 +853,37 @@ void kernel_main() { } bool done = false; while (!done) { - if (cmd_ptr == cb_fence) { - get_cb_page< - dispatch_cb_base, - dispatch_cb_blocks, - dispatch_cb_log_page_size, - my_noc_xy, - my_dispatch_cb_sem_id>(cmd_ptr, - cb_fence, - block_noc_writes_to_clear, - block_next_start_addr, - rd_block_idx); - } + { + DeviceZoneScopedMainN("CQ-DISPATCH"); + if (cmd_ptr == cb_fence) { + get_cb_page< + dispatch_cb_base, + dispatch_cb_blocks, + dispatch_cb_log_page_size, + my_noc_xy, + my_dispatch_cb_sem_id>(cmd_ptr, + cb_fence, + block_noc_writes_to_clear, + block_next_start_addr, + rd_block_idx); + } - done = is_d_variant ? - process_cmd_d(cmd_ptr) : - process_cmd_h(cmd_ptr); + done = is_d_variant ? + process_cmd_d(cmd_ptr) : + process_cmd_h(cmd_ptr); - // Move to next page - cmd_ptr = round_up_pow2(cmd_ptr, dispatch_cb_page_size); + // Move to next page + cmd_ptr = round_up_pow2(cmd_ptr, dispatch_cb_page_size); - // XXXXX move this inside while loop waiting for get_dispatch_cb_page above - // XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping - cb_block_release_pages(block_noc_writes_to_clear, - wr_block_idx); + // XXXXX move this inside while loop waiting for get_dispatch_cb_page above + // XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping + cb_block_release_pages(block_noc_writes_to_clear, + wr_block_idx); + } + block_noc_writes_to_clear[rd_block_idx] += DeviceProfilerNOCWriteBlockCountAndReset(); } noc_async_write_barrier(); diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 5e3fc910bee..6ee58f419af 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -81,6 +81,10 @@ static struct PrefetchExecBufState { uint32_t length; } exec_buf_state; +namespace kernel_profiler { + uint32_t nocWriteSize __attribute__((used)); +} + static_assert((downstream_cb_base & (downstream_cb_page_size - 1)) == 0); template(fence, cmd_ptr, pcie_read_ptr); DEBUG_STATUS("HQD"); @@ -743,6 +747,7 @@ uint32_t process_exec_buf_cmd(uint32_t cmd_ptr_outer, bool done = false; while (!done) { + //DeviceZoneScopedMainN("PROC-MAIN"); uint32_t cmd_ptr = cmddat_q_base; paged_read_into_cmddat_q(cmd_ptr); @@ -801,7 +806,7 @@ bool process_cmd(uint32_t& cmd_ptr, break; case CQ_PREFETCH_CMD_RELAY_INLINE: - DPRINT << "relay inline" << ENDL(); + //DPRINT << "relay inline" << ENDL(); if (exec_buf) { stride = process_relay_inline_exec_buf_cmd(cmd_ptr, downstream_data_ptr); } else { @@ -827,7 +832,7 @@ bool process_cmd(uint32_t& cmd_ptr, break; case CQ_PREFETCH_CMD_STALL: - DPRINT << "stall" << ENDL(); + //DPRINT << "stall" << ENDL(); stride = process_stall(cmd_ptr); break; @@ -973,6 +978,7 @@ void kernel_main_h() { bool done = false; while (!done) { + //DeviceZoneScopedMainN("KERNEL-MAIN-H"); fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr); volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader)); @@ -984,7 +990,7 @@ void kernel_main_h() { DPRINT << "exec buf\n"; process_exec_buf_cmd_h(); } else if (cmd_id == CQ_PREFETCH_CMD_TERMINATE) { - DPRINT << "prefetch terminating_" << is_h_variant << is_d_variant << ENDL();; + DPRINT << "prefetch 2 terminating_" << is_h_variant << is_d_variant << ENDL();; done = true; } #if defined(COMPILE_FOR_IDLE_ERISC) @@ -1010,6 +1016,7 @@ void kernel_main_d() { bool done = false; while (!done) { + //DeviceZoneScopedMainN("KERNEL-MAIN-D"); // cmds come in packed batches based on HostQ reads in prefetch_h // once a packed batch ends, we need to jump to the next page uint32_t length = relay_cb_get_cmds(fence, cmd_ptr); @@ -1059,9 +1066,9 @@ void kernel_main_hd() { uint32_t cmd_ptr = cmddat_q_base; uint32_t fence = cmddat_q_base; - bool done = false; while (!done) { + DeviceZoneScopedMainN("KERNEL-MAIN-HD"); constexpr uint32_t preamble_size = 0; fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr); diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index ac403b211ac..0bead689105 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -414,10 +414,12 @@ std::string generate_bank_to_noc_coord_descriptor_string( ss << endl; ss << "extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS];" << endl; ss << "extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS];" << endl; - ss << "extern int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl; ss << "extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS];" << endl; ss << "extern int32_t bank_to_l1_offset[NUM_L1_BANKS];" << endl; + ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl; + ss << "extern uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl; ss << "extern uint16_t profiler_core_count_per_dram;" << endl; + ss << "#endif" << endl; ss << endl; ss << "#else // !KERNEL_BUILD (FW_BUILD)" << endl; @@ -454,17 +456,18 @@ std::string generate_bank_to_noc_coord_descriptor_string( * For DRAM banks in particular, integer division of flat_id/core_count_per_dram gives the dram bank id and the modulo * is the offset. * */ + ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl; ss << "uint16_t profiler_core_count_per_dram __attribute__((used)) = "; ss << core_count_per_dram << ";" << endl; ss << endl; - ss << "int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl; + ss << "uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl; for (unsigned int x = 0; x < grid_size.x; x++) { ss << " {" << endl; for (unsigned int y = 0; y < grid_size.y; y++) { CoreCoord core = {x,y}; if (profiler_flat_id_map.find(core) == profiler_flat_id_map.end()){ - ss << " " << -1 << "," << endl; + ss << " " << 255 << "," << endl; } else{ ss << " " << profiler_flat_id_map.at(core) << "," << endl; @@ -474,6 +477,7 @@ std::string generate_bank_to_noc_coord_descriptor_string( } ss << "};" << endl; ss << endl; + ss << "#endif" << endl; #endif diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 79ef4cbf57b..2a9b31532fd 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -305,6 +305,9 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC bool is_active_eth_core = false; bool is_inactive_eth_core = false; + //if (core.x == 7 && core.y ==11 ) return true; + //log_info("checking: chip {}, device {} {}", chip_id, core.x, core.y); + // Determine whether an ethernet core is active or idle. Their host handshake interfaces are different. if (is_eth_core) { auto active_eth_cores = tt::Cluster::instance().get_active_ethernet_cores(chip_id); diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index aaf6250b4a4..bb5057d6a08 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -41,7 +41,7 @@ RunTimeOptions::RunTimeOptions() { profiler_enabled = true; } #endif - TT_FATAL(!(get_dprint_enabled() && get_profiler_enabled()), "Cannot enable both debug printing and profiling"); + //TT_FATAL(!(get_dprint_enabled() && get_profiler_enabled()), "Cannot enable both debug printing and profiling"); null_kernels = (std::getenv("TT_METAL_NULL_KERNELS") != nullptr); diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp index 35ae809be4f..a7a5be7cbde 100644 --- a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp +++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp @@ -7,7 +7,7 @@ using namespace tt; -bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile = false) +bool RunCustomCycle(tt_metal::Device *device, int loop_count) { bool pass = true; @@ -43,7 +43,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile = ); EnqueueProgram(device->command_queue(), program, false); - tt_metal::DumpDeviceProfileResults(device, program); + tt_metal::detail::DumpDeviceProfileResults(device); return pass; } diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 4dab65d6805..bee68b90133 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -8,7 +8,7 @@ #include -#if defined(COMPILE_FOR_NCRISC) | defined(COMPILE_FOR_BRISC) | defined(COMPILE_FOR_ERISC) +#if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) #include "risc_common.h" #include "dataflow_api.h" #else @@ -18,6 +18,7 @@ #include "hostdevcommon/profiler_common.h" #include "risc_attribs.h" +#include "dprint.h" #ifdef PROFILER_KERNEL_FORCE_INLINE #define PROFILER_INLINE inline __attribute__((always_inline)) #else @@ -33,11 +34,14 @@ #define PROFILER_MSG __FILE__ "," $Line ",KERNEL_PROFILER" #define PROFILER_MSG_NAME( name ) name "," PROFILER_MSG -#ifdef PROFILE_KERNEL +#ifdef PROFILE_KERNEL namespace kernel_profiler{ extern uint32_t wIndex; extern uint32_t stackSize; +#if defined(DISPATCH_KERNEL) + extern uint32_t nocWriteSize; +#endif extern uint32_t sums[SUM_COUNT]; extern uint32_t sumIDs[SUM_COUNT]; @@ -46,16 +50,20 @@ namespace kernel_profiler{ constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_BR; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif defined(COMPILE_FOR_ERISC) constexpr uint32_t profilerBuffer = eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_ER; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif defined(COMPILE_FOR_NCRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_NC; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_NC; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif COMPILE_FOR_TRISC == 0 constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T0; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T0; @@ -72,6 +80,7 @@ namespace kernel_profiler{ inline __attribute__((always_inline)) void init_profiler(uint16_t briscKernelID = 0, uint16_t ncriscKernelID = 0, uint16_t triscsKernelID = 0) { + while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); wIndex = CUSTOM_MARKERS; stackSize = 0; @@ -81,8 +90,9 @@ namespace kernel_profiler{ sums[i] = 0; } -#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC)) uint32_t runCounter = profiler_control_buffer[RUN_COUNTER]; + resultsPushed = false; #if defined(COMPILE_FOR_ERISC) volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); @@ -114,7 +124,11 @@ namespace kernel_profiler{ eriscBuffer [ID_LL] = runCounter; #endif //ERISC_INIT -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC)) + +#if defined(DISPATCH_KERNEL) + nocWriteSize = 0; +#endif volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); @@ -233,8 +247,8 @@ namespace kernel_profiler{ inline __attribute__((always_inline)) void finish_profiler() { risc_finished_profiling(); -#if (defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC)) - +#if (defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC))) + if (resultsPushed) return; uint32_t pageSize = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram; @@ -272,51 +286,63 @@ namespace kernel_profiler{ profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; } #endif -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC)) int hostIndex; int deviceIndex; - for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; - (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); - hostIndex++, deviceIndex++) +#if defined(COMPILE_FOR_BRISC) + volatile tt_l1_ptr uint32_t *buffer =\ + reinterpret_cast(PROFILER_L1_BUFFER_BR); + if (buffer[kernel_profiler::GUARANTEED_MARKER_1_L] != 0x80000000) +#endif { - if (profiler_control_buffer[deviceIndex]) + for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; + (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); + hostIndex++, deviceIndex++) { - uint32_t currEndIndex = - profiler_control_buffer[deviceIndex] + - profiler_control_buffer[hostIndex]; - - uint32_t dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - profiler_control_buffer[hostIndex] * sizeof(uint32_t); - const InterleavedAddrGen s = { - .bank_base_address = dram_profiler_address, - .page_size = pageSize - }; - - if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + if (profiler_control_buffer[deviceIndex]) { - uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + uint32_t currEndIndex = + profiler_control_buffer[deviceIndex] + + profiler_control_buffer[hostIndex]; + + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[hostIndex] * sizeof(uint32_t); + + const InterleavedAddrGen s = { + .bank_base_address = dram_profiler_address, + .page_size = pageSize + }; + + if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + { + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + + noc_async_write( + PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, + dram_bank_dst_noc_addr, + profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); +#if defined(DISPATCH_KERNEL) + nocWriteSize += profiler_control_buffer[deviceIndex] * sizeof(uint32_t); +#endif - noc_async_write( - PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, - dram_bank_dst_noc_addr, - profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); + profiler_control_buffer[hostIndex] = currEndIndex; + } + else + { + profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; + } - profiler_control_buffer[hostIndex] = currEndIndex; + profiler_control_buffer[deviceIndex] = 0; } - else - { - profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; - } - - profiler_control_buffer[deviceIndex] = 0; } } #endif noc_async_write_barrier(); profiler_control_buffer[RUN_COUNTER] ++; + resultsPushed = true; #endif } @@ -400,9 +426,19 @@ namespace kernel_profiler{ sums[index] += (((uint64_t)p_reg[1] << 32) | p_reg[0]) - start_time; } }; + uint32_t get_and_reset_noc_write_size() + { + uint32_t ret = 0; +#if defined(DISPATCH_KERNEL) + ret = kernel_profiler::nocWriteSize; + kernel_profiler::nocWriteSize = 0; +#endif + return ret; + } } + #define DeviceZoneScopedN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScope zone = kernel_profiler::profileScope(); #define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); @@ -413,6 +449,8 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate zone = kernel_profiler::profileScopeAccumulate(); +#define DeviceProfilerNOCWriteBlockCountAndReset() ((kernel_profiler::get_and_reset_noc_write_size() + NOC_MAX_BURST_SIZE -1 )/NOC_MAX_BURST_SIZE); + #else #define DeviceZoneScopedMainN( name ) @@ -425,4 +463,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) +#define DeviceProfilerNOCWriteBlockCountAndReset() 0 + #endif diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 4bc38a7086f..59fdd893a20 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -66,8 +66,8 @@ void DeviceProfiler::readRiscProfilerResults( riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_ER); } - if ((control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_BR] == 0) && + (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_NC] == 0) && (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_ER] == 0)) { return; @@ -180,11 +180,13 @@ void DeviceProfiler::readRiscProfilerResults( riscNum ++; } - std::vector zero_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer_reset(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + control_buffer_reset[kernel_profiler::DRAM_PROFILER_ADDRESS] = output_dram_buffer->address(); + tt::llrt::write_hex_vec_to_core( device_id, worker_core, - zero_buffer, + control_buffer_reset, PROFILER_L1_BUFFER_CONTROL); } @@ -405,6 +407,9 @@ void DeviceProfiler::dumpResults ( void DeviceProfiler::pushTracyDeviceResults() { #if defined(PROFILER) && defined(TRACY_ENABLE) + ZoneScoped; + std::set> device_cores_set; + std::vector> device_cores; for (auto& event: device_events) { std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}};