From a877879d1768b93ac7deb7acffe50b1c58667166 Mon Sep 17 00:00:00 2001 From: Mo Memarian Date: Thu, 9 May 2024 20:18:46 +0000 Subject: [PATCH] #8223: Profiling dispatch cores --- conftest.py | 12 +- tt_eager/tracy.py | 2 + tt_eager/tt_lib/csrc/tt_lib_bindings.cpp | 4 +- tt_metal/detail/tt_metal.hpp | 12 +- tt_metal/hostdevcommon/profiler_common.h | 5 +- tt_metal/hw/firmware/src/brisc.cc | 12 +- tt_metal/hw/firmware/src/erisc.cc | 2 + tt_metal/hw/firmware/src/idle_erisc.cc | 4 +- tt_metal/hw/firmware/src/ncrisc.cc | 15 +- tt_metal/hw/firmware/src/trisc.cc | 10 +- tt_metal/impl/device/device.cpp | 9 +- .../impl/dispatch/kernels/cq_dispatch.cpp | 18 +- .../impl/dispatch/kernels/cq_prefetch.cpp | 13 +- tt_metal/jit_build/build.cpp | 7 +- tt_metal/llrt/llrt.cpp | 2 +- tt_metal/llrt/rtoptions.cpp | 5 + tt_metal/llrt/rtoptions.hpp | 2 + .../test_custom_cycle_count.cpp | 2 +- .../profiler/test_multi_op/test_multi_op.cpp | 2 +- tt_metal/third_party/tracy | 2 +- tt_metal/tools/profiler/kernel_profiler.hpp | 253 ++++++++++++++---- tt_metal/tools/profiler/process_device_log.py | 7 +- tt_metal/tools/profiler/profiler.cpp | 75 ++++-- tt_metal/tools/profiler/profiler.hpp | 2 +- tt_metal/tools/profiler/tt_metal_profiler.cpp | 47 ++-- 25 files changed, 364 insertions(+), 160 deletions(-) diff --git a/conftest.py b/conftest.py index 93cb05b6084..d718db6bc1d 100644 --- a/conftest.py +++ b/conftest.py @@ -283,7 +283,7 @@ def device(device_l1_small_size): device = ttl.device.GetDefaultDevice() yield device - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) @@ -299,7 +299,7 @@ def pcie_devices(request): yield [devices[i] for i in range(num_devices)] for device in devices.values(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttl.device.CloseDevices(devices) @@ -317,7 +317,7 @@ def all_devices(request): yield [devices[i] for i in range(num_devices)] for device in devices.values(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttl.device.CloseDevices(devices) @@ -344,7 +344,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) @@ -374,7 +374,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) @@ -404,7 +404,7 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) diff --git a/tt_eager/tracy.py b/tt_eager/tracy.py index d92a5c05e83..ab7814f5613 100644 --- a/tt_eager/tracy.py +++ b/tt_eager/tracy.py @@ -300,6 +300,8 @@ def main(): testCommand = f"python -m tracy {osCmd}" envVars = dict(os.environ) + # No Dispatch cores for op_report + envVars["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "0" if options.device: envVars["TT_METAL_DEVICE_PROFILER"] = "1" else: diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 545a130cd7f..c5deb51a9f2 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -196,14 +196,14 @@ void DeviceModule(py::module &m_device) { the FinishCommand. Once set to false, all subsequent commands will immediately notify the device that the write pointer has been updated. )doc"); - m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("free_buffers") = false, R"doc( + m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("last_dump") = false, R"doc( Dump device side profiling data. +------------------+----------------------------------+-----------------------+-------------+----------+ | Argument | Description | Data type | Valid range | Required | +==================+==================================+=======================+=============+==========+ | device | Device to dump profiling data of | tt_lib.device.Device | | Yes | - | free_buffers | Option to free buffer | bool | | No | + | last_dump | Last dump before process dies | bool | | No | +------------------+----------------------------------+-----------------------+-------------+----------+ )doc"); m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc( diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index bcc80005d87..0a93802bb1a 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -165,9 +165,9 @@ namespace tt::tt_metal{ * |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------| * | device | The device holding the program being profiled. | Device * | | True | * | core_coords | The logical core coordinates being profiled. | const std::unordered_map> & | | True | - * | free_buffers | Free up the profiler buffer spaces for the device | bool | | False | + * | last_dump | Last dump before process dies | bool | | False | * */ - void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool free_buffers = false); + void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool last_dump = false); /** * Traverse all cores and read device side profiler data and dump results into device side CSV log @@ -177,9 +177,9 @@ namespace tt::tt_metal{ * | Argument | Description | Type | Valid Range | Required | * |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------| * | device | The device holding the program being profiled. | Device * | | True | - * | free_buffers | Free up the profiler buffer spaces for the device | bool | | False | + * | last_dump | Last dump before process dies | bool | | False | * */ - void DumpDeviceProfileResults(Device *device, bool free_buffers = false); + void DumpDeviceProfileResults(Device *device, bool last_dump = false); /** * Set the directory for device-side CSV logs produced by the profiler instance in the tt-metal module @@ -333,9 +333,9 @@ namespace tt::tt_metal{ DispatchStateCheck(true); LAZY_COMMAND_QUEUE_MODE = lazy; } - inline void DumpDeviceProfiler(Device * device, bool free_buffers) + inline void DumpDeviceProfiler(Device * device, bool last_dump) { - tt::tt_metal::detail::DumpDeviceProfileResults(device, free_buffers); + tt::tt_metal::detail::DumpDeviceProfileResults(device, last_dump); } void AllocateBuffer(Buffer* buffer, bool bottom_up); diff --git a/tt_metal/hostdevcommon/profiler_common.h b/tt_metal/hostdevcommon/profiler_common.h index d1b8ca63fe2..db075241368 100644 --- a/tt_metal/hostdevcommon/profiler_common.h +++ b/tt_metal/hostdevcommon/profiler_common.h @@ -4,6 +4,8 @@ #pragma once +#define PROFILER_OPT_DO_DISPATCH_CORES 2 + namespace kernel_profiler{ constexpr static uint32_t PADDING_MARKER = ((1<<16) - 1); @@ -40,7 +42,8 @@ namespace kernel_profiler{ RUN_COUNTER, NOC_X, NOC_Y, - FLAT_ID + FLAT_ID, + DROPPED_ZONES, }; diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 52a779a02bf..aee422570cd 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -66,11 +66,13 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); #define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12) namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); +} void enable_power_management() { // Mask and Hyst taken from tb_tensix math_tests diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index ab722594826..09d13944ffa 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -29,6 +29,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } uint8_t noc_index = 0; // TODO: remove hardcoding diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index e8fc6889016..a5e02f05672 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -53,6 +53,8 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } //inline void RISC_POST_STATUS(uint32_t status) { @@ -101,7 +103,7 @@ int main() { DEBUG_STATUS("GD"); { - DeviceZoneScopedMainN("ERISC-FW"); + DeviceZoneScopedMainN("ERISC-IDLE-FW"); noc_index = mailboxes->launch.brisc_noc_id; diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index c3442f54053..ef8ae9fbf9a 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -34,11 +34,16 @@ uint32_t atomic_ret_val __attribute__((section("l1_data"))) __attribute__((used) CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + bool resultsPushed __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); + uint32_t nocWriteSize __attribute__((used)); + uint32_t *nocWriteBuffer __attribute__((used)); + uint32_t *nocWriteIndex __attribute__((used)); +} extern "C" void ncrisc_resume(void); extern "C" void notify_brisc_and_halt(uint32_t status); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 32d2019fd37..badf07a9408 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -17,11 +17,11 @@ // clang-format on namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); +} namespace ckernel { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index e73c9efbdbb..e34df0be357 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1289,8 +1289,6 @@ bool Device::close() { if (not this->initialized_) { TT_THROW("Cannot close device {} that has not been initialized!", this->id_); } - this->deallocate_buffers(); - watcher_detach(this); for (const std::unique_ptr &hw_command_queue : hw_command_queues_) { if (hw_command_queue->manager.get_bypass_mode()) { @@ -1298,9 +1296,16 @@ bool Device::close() { } hw_command_queue->terminate(); } + + tt_metal::detail::DumpDeviceProfileResults(this, true); + this->trace_buffer_pool_.clear(); detail::EnableAllocs(this); + this->deallocate_buffers(); + watcher_detach(this); + + std::unordered_set not_done_dispatch_cores; std::unordered_set cores_to_skip; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index a506c16df3e..bf405c4fa30 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -80,6 +80,7 @@ constexpr uint32_t l1_cache_size = ((max_write_packed_cores + l1_to_local_cache_ static uint32_t l1_cache[l1_cache_size]; + // NOTE: CAREFUL USING THIS FUNCTION // It is call "careful_copy" because you need to be careful... // It copies beyond count by up to 5 elements make sure src and dst addresses are safe @@ -886,6 +887,7 @@ void kernel_main() { } bool done = false; while (!done) { + DeviceZoneScopedND("CQ-DISPATCH", block_noc_writes_to_clear, rd_block_idx ); if (cmd_ptr == cb_fence) { get_cb_page< dispatch_cb_base, @@ -893,10 +895,10 @@ void kernel_main() { dispatch_cb_log_page_size, my_noc_xy, my_dispatch_cb_sem_id>(cmd_ptr, - cb_fence, - block_noc_writes_to_clear, - block_next_start_addr, - rd_block_idx); + cb_fence, + block_noc_writes_to_clear, + block_next_start_addr, + rd_block_idx); } done = is_d_variant ? @@ -909,10 +911,10 @@ void kernel_main() { // XXXXX move this inside while loop waiting for get_dispatch_cb_page above // XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping cb_block_release_pages(block_noc_writes_to_clear, - wr_block_idx); + upstream_dispatch_cb_sem_id, + dispatch_cb_blocks, + dispatch_cb_pages_per_block>(block_noc_writes_to_clear, + wr_block_idx); } noc_async_write_barrier(); diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index ef200e75630..6eb25023f12 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -83,7 +83,6 @@ static struct PrefetchExecBufState { // Feature to stall the prefetcher, mainly for ExecBuf impl which reuses CmdDataQ static enum StallState { STALL_NEXT = 2, STALLED = 1, NOT_STALLED = 0} stall_state = NOT_STALLED; - static_assert((downstream_cb_base & (downstream_cb_page_size - 1)) == 0); template(fence, cmd_ptr, pcie_read_ptr); DEBUG_STATUS("HQD"); @@ -295,6 +294,7 @@ static uint32_t process_relay_inline_cmd(uint32_t cmd_ptr, // Assume the downstream buffer is big relative to cmddat command size that we can // grab what we need in one chunk + cb_acquire_pages(npages); uint32_t remaining = cmddat_q_end - data_ptr; @@ -804,6 +804,7 @@ bool process_cmd(uint32_t& cmd_ptr, uint32_t& downstream_data_ptr, uint32_t& stride) { + DeviceZoneScopedND("PROCESS-CMD", block_noc_writes_to_clear, rd_block_idx ); volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; bool done = false; @@ -830,7 +831,7 @@ bool process_cmd(uint32_t& cmd_ptr, break; case CQ_PREFETCH_CMD_RELAY_INLINE: - DPRINT << "relay inline" << ENDL(); + DPRINT << "relay inline" << ENDL(); if (exec_buf) { stride = process_relay_inline_exec_buf_cmd(cmd_ptr, downstream_data_ptr); } else { @@ -860,7 +861,7 @@ bool process_cmd(uint32_t& cmd_ptr, break; case CQ_PREFETCH_CMD_STALL: - DPRINT << "stall" << ENDL(); + DPRINT << "stall" << ENDL(); stride = process_stall(cmd_ptr); break; @@ -1093,9 +1094,9 @@ void kernel_main_hd() { uint32_t cmd_ptr = cmddat_q_base; uint32_t fence = cmddat_q_base; - bool done = false; while (!done) { + DeviceZoneScopedND("KERNEL-MAIN-HD", block_noc_writes_to_clear, rd_block_idx ); constexpr uint32_t preamble_size = 0; fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index ea931df4b93..a3c54e390d0 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -89,7 +89,12 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) { this->defines_ += "-DTENSIX_FIRMWARE -DLOCAL_MEM_EN=0 "; if (tt::tt_metal::getDeviceProfilerState()) { - this->defines_ += "-DPROFILE_KERNEL=1 "; + if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + //TODO(MO): Standard bit mask for device side profiler options + this->defines_ += "-DPROFILE_KERNEL=2 "; + } else { + this->defines_ += "-DPROFILE_KERNEL=1 "; + } } if (tt::llrt::OptionsG.get_watcher_enabled()) { diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 79ef4cbf57b..82c1bf28070 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -305,7 +305,7 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC bool is_active_eth_core = false; bool is_inactive_eth_core = false; - // Determine whether an ethernet core is active or idle. Their host handshake interfaces are different. + // Determine whether an ethernet core is active or idle. Their host handshake interfaces are different. if (is_eth_core) { auto active_eth_cores = tt::Cluster::instance().get_active_ethernet_cores(chip_id); auto inactive_eth_cores = tt::Cluster::instance().get_inactive_ethernet_cores(chip_id); diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 6e36559ca88..1026749baa8 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -47,10 +47,15 @@ RunTimeOptions::RunTimeOptions() { test_mode_enabled = false; profiler_enabled = false; + profile_dispatch_cores = false; #if defined(PROFILER) const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER"); if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') { profiler_enabled = true; + const char *profile_dispatch_str = std::getenv("TT_METAL_DEVICE_PROFILER_DISPATCH"); + if (profile_dispatch_str != nullptr && profile_dispatch_str[0] == '1') { + profile_dispatch_cores = true; + } } #endif TT_FATAL( diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 2004949da94..1defdde4fce 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -86,6 +86,7 @@ class RunTimeOptions { bool test_mode_enabled = false; bool profiler_enabled = false; + bool profile_dispatch_cores = false; bool null_kernels = false; @@ -213,6 +214,7 @@ class RunTimeOptions { inline void set_test_mode_enabled(bool enable) { test_mode_enabled = enable; } inline bool get_profiler_enabled() { return profiler_enabled; } + inline bool get_profiler_do_dispatch_cores() { return profile_dispatch_cores; } inline void set_kernels_nullified(bool v) { null_kernels = v; } inline bool get_kernels_nullified() { return null_kernels; } diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp index 35ae809be4f..587782c90ea 100644 --- a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp +++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp @@ -7,7 +7,7 @@ using namespace tt; -bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile = false) +bool RunCustomCycle(tt_metal::Device *device, int loop_count) { bool pass = true; diff --git a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp index b0424fb2fe5..98b821ebb9f 100644 --- a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp +++ b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp @@ -58,7 +58,7 @@ int main(int argc, char **argv) { // Run 2 RunCustomCycle(device, PROFILER_OP_SUPPORT_COUNT); - tt_metal::detail::DumpDeviceProfileResults(device); + Finish(device->command_queue()); pass &= tt_metal::CloseDevice(device); diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 555d4cb56cf..77f94cbb6f6 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 555d4cb56cf07c1cb651da16e2addb0d6304a5b1 +Subproject commit 77f94cbb6f6725b6768668b5907a95e9e1e8d6ab diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 4dab65d6805..cd9b2bfeb86 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -8,7 +8,7 @@ #include -#if defined(COMPILE_FOR_NCRISC) | defined(COMPILE_FOR_BRISC) | defined(COMPILE_FOR_ERISC) +#if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) #include "risc_common.h" #include "dataflow_api.h" #else @@ -18,6 +18,7 @@ #include "hostdevcommon/profiler_common.h" #include "risc_attribs.h" +#include "dprint.h" #ifdef PROFILER_KERNEL_FORCE_INLINE #define PROFILER_INLINE inline __attribute__((always_inline)) #else @@ -33,7 +34,7 @@ #define PROFILER_MSG __FILE__ "," $Line ",KERNEL_PROFILER" #define PROFILER_MSG_NAME( name ) name "," PROFILER_MSG -#ifdef PROFILE_KERNEL +#if defined(PROFILE_KERNEL) && ( !defined(DISPATCH_KERNEL) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES))) namespace kernel_profiler{ extern uint32_t wIndex; @@ -42,20 +43,32 @@ namespace kernel_profiler{ extern uint32_t sums[SUM_COUNT]; extern uint32_t sumIDs[SUM_COUNT]; +#if (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)) + extern uint32_t nocWriteSize; + extern uint32_t *nocWriteBuffer; + extern uint32_t *nocWriteIndex; +#endif + + constexpr uint32_t QUICK_PUSH_MARKER_COUNT = 2; + #if defined(COMPILE_FOR_BRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_BR; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif defined(COMPILE_FOR_ERISC) constexpr uint32_t profilerBuffer = eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_ER; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif defined(COMPILE_FOR_NCRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_NC; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_NC; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + extern uint16_t core_flat_id; + extern bool resultsPushed; #elif COMPILE_FOR_TRISC == 0 constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T0; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T0; @@ -70,8 +83,21 @@ namespace kernel_profiler{ volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); #endif + constexpr uint32_t Hash32_CT( const char * str, size_t n, uint32_t basis = UINT32_C( 2166136261 ) ) { + return n == 0 ? basis : Hash32_CT( str + 1, n - 1, ( basis ^ str[ 0 ] ) * UINT32_C( 16777619 ) ); + } + + template< size_t N > + constexpr uint32_t Hash16_CT( const char ( &s )[ N ] ) { + auto res = Hash32_CT( s, N - 1 ); + return ((res & 0xFFFF) ^ ((res & 0xFFFF0000) >> 16)) & 0xFFFF; + } + +#define SrcLocNameToHash( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME( name )); + inline __attribute__((always_inline)) void init_profiler(uint16_t briscKernelID = 0, uint16_t ncriscKernelID = 0, uint16_t triscsKernelID = 0) { + while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); wIndex = CUSTOM_MARKERS; stackSize = 0; @@ -81,8 +107,13 @@ namespace kernel_profiler{ sums[i] = 0; } -#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) +#if (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)) + nocWriteSize = 0; +#endif + +#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) uint32_t runCounter = profiler_control_buffer[RUN_COUNTER]; + resultsPushed = false; #if defined(COMPILE_FOR_ERISC) volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); @@ -114,7 +145,8 @@ namespace kernel_profiler{ eriscBuffer [ID_LL] = runCounter; #endif //ERISC_INIT -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); @@ -186,6 +218,22 @@ namespace kernel_profiler{ buffer[index+1] = p_reg[0]; } + inline __attribute__((always_inline)) void mark_start_at_index_inlined(uint32_t index) + { + volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); + volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); + buffer[index+1] = p_reg[0]; + } + + inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id) + { + volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); + volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); + buffer[index] = 0x80000000 | ((timer_id_s & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF); + buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF); + buffer[index+3] = p_reg[0]; + } + PROFILER_INLINE void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) @@ -206,6 +254,13 @@ namespace kernel_profiler{ profiler_control_buffer[FW_RESET_H] = time_H; } + + inline __attribute__((always_inline)) void mark_dropped_timestamps(uint32_t index) + { + uint32_t curr = profiler_control_buffer[DROPPED_ZONES]; + profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; + } + inline __attribute__((always_inline)) void risc_finished_profiling() { for (int i = 0; i < SUM_COUNT; i ++) @@ -227,14 +282,13 @@ namespace kernel_profiler{ mark_padding(); } profiler_control_buffer[kernel_profiler::deviceBufferEndIndex] = wIndex; - } inline __attribute__((always_inline)) void finish_profiler() { risc_finished_profiling(); -#if (defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC)) - +#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) + if (resultsPushed) return; uint32_t pageSize = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram; @@ -269,74 +323,142 @@ namespace kernel_profiler{ } else { - profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; + mark_dropped_timestamps(hostIndex); } #endif -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) int hostIndex; int deviceIndex; - for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; - (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); - hostIndex++, deviceIndex++) - { - if (profiler_control_buffer[deviceIndex]) - { - uint32_t currEndIndex = - profiler_control_buffer[deviceIndex] + - profiler_control_buffer[hostIndex]; + for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; + (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); + hostIndex++, deviceIndex++) + { + if (profiler_control_buffer[deviceIndex]) + { + uint32_t currEndIndex = + profiler_control_buffer[deviceIndex] + + profiler_control_buffer[hostIndex]; + + if (currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + { + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[hostIndex] * sizeof(uint32_t); + + const InterleavedAddrGen s = { + .bank_base_address = dram_profiler_address, + .page_size = pageSize + }; + + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + + noc_async_write( + PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, + dram_bank_dst_noc_addr, + profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); + + profiler_control_buffer[hostIndex] = currEndIndex; + } + else if (profiler_control_buffer[RUN_COUNTER] < 1) + { + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * + PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC; + + const InterleavedAddrGen s = { + .bank_base_address = dram_profiler_address, + .page_size = pageSize + }; + + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + + noc_async_write( + PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, + dram_bank_dst_noc_addr, + CUSTOM_MARKERS * sizeof(uint32_t)); + mark_dropped_timestamps(hostIndex); + } + else{ + mark_dropped_timestamps(hostIndex); + } + profiler_control_buffer[deviceIndex] = 0; + } + } +#endif + noc_async_write_barrier(); + profiler_control_buffer[RUN_COUNTER] ++; + resultsPushed = true; +#endif + } + + inline __attribute__((always_inline)) void quick_push () + { +#if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) + SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); + mark_time_at_index_inlined(wIndex, hash); + core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; - uint32_t dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - profiler_control_buffer[hostIndex] * sizeof(uint32_t); + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + HOST_BUFFER_END_INDEX_NC * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] * sizeof(uint32_t); - const InterleavedAddrGen s = { - .bank_base_address = dram_profiler_address, - .page_size = pageSize - }; + const InterleavedAddrGen s = { + .bank_base_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS], + .page_size = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram + }; - if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) - { - uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - noc_async_write( - PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, - dram_bank_dst_noc_addr, - profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); + mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash)); + wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE; - profiler_control_buffer[hostIndex] = currEndIndex; - } - else - { - profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; - } + uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; + + if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + { + noc_async_write( + PROFILER_L1_BUFFER_NC, + dram_bank_dst_noc_addr, + wIndex * sizeof(uint32_t)); + + nocWriteSize += (wIndex * sizeof(uint32_t)); + + profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] = currEndIndex; - profiler_control_buffer[deviceIndex] = 0; - } } -#endif - noc_async_write_barrier(); - profiler_control_buffer[RUN_COUNTER] ++; -#endif - } + else + { + mark_dropped_timestamps(HOST_BUFFER_END_INDEX_NC); + } - constexpr uint32_t Hash32_CT( const char * str, size_t n, uint32_t basis = UINT32_C( 2166136261 ) ) { - return n == 0 ? basis : Hash32_CT( str + 1, n - 1, ( basis ^ str[ 0 ] ) * UINT32_C( 16777619 ) ); - } + wIndex = CUSTOM_MARKERS; - template< size_t N > - constexpr uint32_t Hash16_CT( const char ( &s )[ N ] ) { - auto res = Hash32_CT( s, N - 1 ); - return ((res & 0xFFFF) ^ ((res & 0xFFFF0000) >> 16)) & 0xFFFF; + nocWriteBuffer[(*nocWriteIndex)] = nocWriteBuffer[(*nocWriteIndex)] + (( nocWriteSize + NOC_MAX_BURST_SIZE -1 )/NOC_MAX_BURST_SIZE); + nocWriteSize = 0; +#endif } - template + + template struct profileScope { bool start_marked = false; - PROFILER_INLINE profileScope () + inline __attribute__((always_inline)) profileScope () { - if (wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize)) + bool bufferHasRoom = false; + if constexpr (dispatch) + { + bufferHasRoom = wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize - (QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE)); + } + else + { + bufferHasRoom = wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize); + } + + if (bufferHasRoom) { stackSize += PROFILER_L1_MARKER_UINT32_SIZE; start_marked = true; @@ -345,7 +467,7 @@ namespace kernel_profiler{ } } - PROFILER_INLINE ~profileScope () + inline __attribute__((always_inline)) ~profileScope () { if (start_marked) { @@ -354,6 +476,14 @@ namespace kernel_profiler{ start_marked = false; stackSize -= PROFILER_L1_MARKER_UINT32_SIZE; } + + if constexpr (dispatch) + { + if (wIndex >= (PROFILER_L1_VECTOR_SIZE - (QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE))) + { + quick_push(); + } + } } }; @@ -403,8 +533,11 @@ namespace kernel_profiler{ } + #define DeviceZoneScopedN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScope zone = kernel_profiler::profileScope(); +#define DeviceZoneScopedND( name , nocBuffer, nocIndex ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScope zone = kernel_profiler::profileScope(); kernel_profiler::nocWriteBuffer = nocBuffer; kernel_profiler::nocWriteIndex = &nocIndex; + #define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); #define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); @@ -425,4 +558,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) +#define DeviceZoneScopedND( name , nocBuffer, nocIndex ) + #endif diff --git a/tt_metal/tools/profiler/process_device_log.py b/tt_metal/tools/profiler/process_device_log.py index 65a47046be0..ac481d8edf6 100755 --- a/tt_metal/tools/profiler/process_device_log.py +++ b/tt_metal/tools/profiler/process_device_log.py @@ -302,9 +302,10 @@ def is_new_op_device(tsCore, coreOpMap): if (risc == "BRISC" and timerID["zone_name"] == "BRISC-FW" and timerID["zone_phase"] == "begin") or ( risc == "ERISC" and timerID["zone_name"] == "ERISC-FW" and timerID["zone_phase"] == "begin" ): - assert ( - core not in coreOpMap.keys() - ), f"Unexpected BRISC start in {tsCore} {coreOpMap[core]}, this could be caused by soft resets" + assert core not in coreOpMap.keys(), ( + f"Unexpected BRISC start in {tsCore} {coreOpMap[core]}, this could be caused by soft resets \n\n" + + "\n".join([f"{core} - {data}" for core, data in coreOpMap.items()]) + ) if not coreOpMap: isNewOp = True coreOpMap[core] = (tsValue,) diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 4bc38a7086f..f444731c287 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -7,7 +7,6 @@ #include #include - #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tools/profiler/profiler.hpp" @@ -66,8 +65,8 @@ void DeviceProfiler::readRiscProfilerResults( riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_ER); } - if ((control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_BR] == 0) && + (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_NC] == 0) && (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_ER] == 0)) { return; @@ -79,10 +78,11 @@ void DeviceProfiler::readRiscProfilerResults( if (bufferEndIndex > 0) { uint32_t bufferRiscShift = riscNum * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC + startIndex; - if (bufferEndIndex > PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + if ((control_buffer[kernel_profiler::DROPPED_ZONES] >> riscEndIndex) & 1) { - log_warning("Profiler DRAM buffers were full, markers were dropped! device {}, worker core {}, {}, Risc {}, bufferEndIndex = {}, host_size = {}", device_id, worker_core.x, worker_core.y, tracy::riscName[riscEndIndex], bufferEndIndex , PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC ); - bufferEndIndex = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; + std::string warningMsg = fmt::format("Profiler DRAM buffers were full, markers were dropped! device {}, worker core {}, {}, Risc {}, bufferEndIndex = {}", device_id, worker_core.x, worker_core.y, tracy::riscName[riscEndIndex], bufferEndIndex); + TracyMessageC(warningMsg.c_str(), warningMsg.size(), tracy::Color::Tomato3); + log_warning(warningMsg.c_str()); } uint32_t riscNumRead = 0; @@ -180,11 +180,13 @@ void DeviceProfiler::readRiscProfilerResults( riscNum ++; } - std::vector zero_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer_reset(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + control_buffer_reset[kernel_profiler::DRAM_PROFILER_ADDRESS] = output_dram_buffer->address(); + tt::llrt::write_hex_vec_to_core( device_id, worker_core, - zero_buffer, + control_buffer_reset, PROFILER_L1_BUFFER_CONTROL); } @@ -236,7 +238,9 @@ void DeviceProfiler::dumpResultToFile( tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); - device_events.push_back(event); + auto ret = device_events.insert(event); + + if (!ret.second) return; firstTimestamp(timestamp); @@ -286,6 +290,7 @@ DeviceProfiler::~DeviceProfiler() { #if defined(PROFILER) ZoneScoped; + pushTracyDeviceResults(); for (auto tracyCtx : device_tracy_contexts) { TracyTTDestroy(tracyCtx.second); @@ -375,25 +380,6 @@ void DeviceProfiler::dumpResults ( } - for (const auto &worker_core : worker_cores) { - std::pair device_core = {device_id, worker_core}; - if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end()) - { - auto tracyCtx = TracyTTContext(); - std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y); - TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency); - TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size()); - - device_tracy_contexts.emplace( - device_core, - tracyCtx - ); - } - } - - std::sort (device_events.begin(), device_events.end()); - - pushTracyDeviceResults(); } else { @@ -405,6 +391,41 @@ void DeviceProfiler::dumpResults ( void DeviceProfiler::pushTracyDeviceResults() { #if defined(PROFILER) && defined(TRACY_ENABLE) + ZoneScoped; + std::set> device_cores_set; + std::vector> device_cores; + for (auto& event: device_events) + { + std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}}; + auto ret = device_cores_set.insert(device_core); + if (ret.second ) + { + device_cores.push_back(device_core); + } + } + + for (auto& device_core: device_cores) + { + int device_id = device_core.first; + CoreCoord worker_core = device_core.second; + + + if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end()) + { + auto tracyCtx = TracyTTContext(); + std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y); + + TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency); + + TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size()); + + device_tracy_contexts.emplace( + device_core, + tracyCtx + ); + } + } + for (auto& event: device_events) { std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}}; diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 189782d186a..129c4e6cc85 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -51,7 +51,7 @@ class DeviceProfiler { std::map, TracyTTCtx> device_tracy_contexts; // Device-Core tracy context - std::vector device_events; + std::set device_events; // Hash to zone source locations std::unordered_map hash_to_zone_src_locations; diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 05653e0b778..9d106b0e214 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -106,7 +106,7 @@ void InitDeviceProfiler(Device *device){ #endif } -void DumpDeviceProfileResults(Device *device, bool free_buffers) { +void DumpDeviceProfileResults(Device *device, bool lastDump) { #if defined(PROFILER) std::vector workerCores; auto device_id = device->id(); @@ -115,36 +115,47 @@ void DumpDeviceProfileResults(Device *device, bool free_buffers) { const CoreCoord curr_core = device->worker_core_from_logical_core(core); workerCores.push_back(curr_core); } - for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs)) { - CoreType dispatch_core_type = tt::get_dispatch_core_type(device_id, device_num_hw_cqs); - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); - workerCores.push_back(curr_core); - } - for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) - { - workerCores.push_back(core); + for (const CoreCoord& core : device->get_active_ethernet_cores(true)){ + auto physicalCore = device->physical_core_from_logical_core(core, CoreType::ETH); + workerCores.push_back(physicalCore); } - DumpDeviceProfileResults(device, workerCores, free_buffers); + DumpDeviceProfileResults(device, workerCores, lastDump); #endif } -void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool free_buffers){ +void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool lastDump){ #if defined(PROFILER) ZoneScoped; + + if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + auto device_id = device->id(); + auto device_num_hw_cqs = device->num_hw_cqs(); + for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs)) { + CoreType dispatch_core_type = tt::get_dispatch_core_type(device_id, device_num_hw_cqs); + const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + worker_cores.push_back(curr_core); + } + for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores){ + worker_cores.push_back(core); + } + } if (getDeviceProfilerState()) { - const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; - if (USE_FAST_DISPATCH) - { - Finish(device->command_queue()); - } - TT_FATAL(DprintServerIsRunning() == false, "Debug print server is running, cannot dump device profiler data"); + if (!lastDump) + { + const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; + if (USE_FAST_DISPATCH) + { + Finish(device->command_queue()); + } + } + TT_FATAL(DprintServerIsRunning() == false, "Debug print server is running, cannot dump device profiler data"); auto device_id = device->id(); if (tt_metal_device_profiler_map.find(device_id) != tt_metal_device_profiler_map.end()) { tt_metal_device_profiler_map.at(device_id).setDeviceArchitecture(device->arch()); tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores); - if (free_buffers) + if (lastDump) { // Process is ending, no more device dumps are coming, reset your ref on the buffer so deallocate is the last // owner.