diff --git a/conftest.py b/conftest.py index 514c9ed2e36..344247a7226 100644 --- a/conftest.py +++ b/conftest.py @@ -273,7 +273,7 @@ def device(request, device_params): yield device - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttl.device.Synchronize(device) @@ -292,7 +292,7 @@ def pcie_devices(request, device_params): yield [devices[i] for i in range(num_devices)] for device in devices.values(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttl.device.CloseDevices(devices) @@ -310,7 +310,7 @@ def all_devices(request, device_params): yield [devices[i] for i in range(num_devices)] for device in devices.values(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttl.device.CloseDevices(devices) @@ -334,7 +334,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) @@ -361,7 +361,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) @@ -388,7 +388,7 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0): import tt_lib as ttl for device in device_mesh.get_devices(): - ttl.device.DumpDeviceProfiler(device, True) + ttl.device.DumpDeviceProfiler(device) ttl.device.DeallocateBuffers(device) ttnn.close_device_mesh(device_mesh) diff --git a/tests/scripts/run_profiler_regressions.sh b/tests/scripts/run_profiler_regressions.sh index df0d6e58433..7f275dc5a1d 100755 --- a/tests/scripts/run_profiler_regressions.sh +++ b/tests/scripts/run_profiler_regressions.sh @@ -61,12 +61,7 @@ run_profiling_test(){ run_async_mode_T3000_test - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_custom_cycle_count -vvv - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_full_buffer -vvv - #TODO(MO): Needed until #6560 is fixed. - if [ "$ARCH_NAME" != "grayskull" ]; then - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op -vvv - fi + TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py remove_default_log_locations @@ -92,7 +87,7 @@ run_profiling_no_reset_test(){ source python_env/bin/activate export PYTHONPATH=$TT_METAL_HOME - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op -vvv + TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler_gs_no_reset.py remove_default_log_locations } diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index b6bf8e0e3d6..1e6705abb40 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -16,6 +16,8 @@ clear_profiler_runtime_artifacts, ) +from models.utility_functions import skip_for_grayskull + PROG_EXMP_DIR = "programming_examples/profiler" @@ -53,6 +55,7 @@ def get_function_name(): return frame.f_code.co_name +@skip_for_grayskull() def test_multi_op(): OP_COUNT = 1000 RUN_COUNT = 2 @@ -129,3 +132,42 @@ def test_full_buffer(): assert stats[statNameEth]["stats"]["Count"] % (OP_COUNT * ZONE_COUNT) == 0, "Wrong Eth Marker Repeat count" else: assert stats[statName]["stats"]["Count"] in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong Marker Repeat count" + + +def test_dispatch_cores(): + OP_COUNT = 1 + RISC_COUNT = 1 + ZONE_COUNT = 37 + REF_COUNT_DICT = { + "grayskull": { + "Tensix CQ Dispatch": 37, + "Tensix CQ Prefetch": 44, + }, + "wormhole_b0": { + "Tensix CQ Dispatch": 37, + "Tensix CQ Prefetch": 44, + }, + } + + ENV_VAR_ARCH_NAME = os.getenv("ARCH_NAME") + assert ENV_VAR_ARCH_NAME in REF_COUNT_DICT.keys() + + os.environ["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "1" + + devicesData = run_device_profiler_test(setup=True) + + stats = devicesData["data"]["devices"]["0"]["cores"]["DEVICE"]["analysis"] + + verifiedStat = [] + for stat in REF_COUNT_DICT[ENV_VAR_ARCH_NAME].keys(): + if stat in stats.keys(): + verifiedStat.append(stat) + assert stats[stat]["stats"]["Count"] == REF_COUNT_DICT[ENV_VAR_ARCH_NAME][stat], "Wrong Dispatch zone count" + + statTypes = ["Dispatch", "Prefetch"] + statTypesSet = set(statTypes) + for statType in statTypes: + for stat in verifiedStat: + if statType in stat: + statTypesSet.remove(statType) + assert len(statTypesSet) == 0 diff --git a/tests/tt_metal/tools/profiler/test_device_profiler_gs_no_reset.py b/tests/tt_metal/tools/profiler/test_device_profiler_gs_no_reset.py new file mode 100644 index 00000000000..75bc7162bdf --- /dev/null +++ b/tests/tt_metal/tools/profiler/test_device_profiler_gs_no_reset.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from tests.tt_metal.tools.profiler import test_device_profiler + + +def test_multi_op_gs_no_reset(): + test_device_profiler.test_multi_op() diff --git a/tt_eager/tracy.py b/tt_eager/tracy.py index d92a5c05e83..ab7814f5613 100644 --- a/tt_eager/tracy.py +++ b/tt_eager/tracy.py @@ -300,6 +300,8 @@ def main(): testCommand = f"python -m tracy {osCmd}" envVars = dict(os.environ) + # No Dispatch cores for op_report + envVars["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "0" if options.device: envVars["TT_METAL_DEVICE_PROFILER"] = "1" else: diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 2d15d354531..bebe8306c01 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -198,14 +198,14 @@ void DeviceModule(py::module &m_device) { the FinishCommand. Once set to false, all subsequent commands will immediately notify the device that the write pointer has been updated. )doc"); - m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("free_buffers") = false, R"doc( + m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("last_dump") = false, R"doc( Dump device side profiling data. +------------------+----------------------------------+-----------------------+-------------+----------+ | Argument | Description | Data type | Valid range | Required | +==================+==================================+=======================+=============+==========+ | device | Device to dump profiling data of | tt_lib.device.Device | | Yes | - | free_buffers | Option to free buffer | bool | | No | + | last_dump | Last dump before process dies | bool | | No | +------------------+----------------------------------+-----------------------+-------------+----------+ )doc"); m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc( diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index bcc80005d87..0a93802bb1a 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -165,9 +165,9 @@ namespace tt::tt_metal{ * |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------| * | device | The device holding the program being profiled. | Device * | | True | * | core_coords | The logical core coordinates being profiled. | const std::unordered_map> & | | True | - * | free_buffers | Free up the profiler buffer spaces for the device | bool | | False | + * | last_dump | Last dump before process dies | bool | | False | * */ - void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool free_buffers = false); + void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool last_dump = false); /** * Traverse all cores and read device side profiler data and dump results into device side CSV log @@ -177,9 +177,9 @@ namespace tt::tt_metal{ * | Argument | Description | Type | Valid Range | Required | * |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------| * | device | The device holding the program being profiled. | Device * | | True | - * | free_buffers | Free up the profiler buffer spaces for the device | bool | | False | + * | last_dump | Last dump before process dies | bool | | False | * */ - void DumpDeviceProfileResults(Device *device, bool free_buffers = false); + void DumpDeviceProfileResults(Device *device, bool last_dump = false); /** * Set the directory for device-side CSV logs produced by the profiler instance in the tt-metal module @@ -333,9 +333,9 @@ namespace tt::tt_metal{ DispatchStateCheck(true); LAZY_COMMAND_QUEUE_MODE = lazy; } - inline void DumpDeviceProfiler(Device * device, bool free_buffers) + inline void DumpDeviceProfiler(Device * device, bool last_dump) { - tt::tt_metal::detail::DumpDeviceProfileResults(device, free_buffers); + tt::tt_metal::detail::DumpDeviceProfileResults(device, last_dump); } void AllocateBuffer(Buffer* buffer, bool bottom_up); diff --git a/tt_metal/hostdevcommon/profiler_common.h b/tt_metal/hostdevcommon/profiler_common.h index d1b8ca63fe2..ce4a7325f18 100644 --- a/tt_metal/hostdevcommon/profiler_common.h +++ b/tt_metal/hostdevcommon/profiler_common.h @@ -4,6 +4,8 @@ #pragma once +#define PROFILER_OPT_DO_DISPATCH_CORES 2 + namespace kernel_profiler{ constexpr static uint32_t PADDING_MARKER = ((1<<16) - 1); @@ -40,7 +42,9 @@ namespace kernel_profiler{ RUN_COUNTER, NOC_X, NOC_Y, - FLAT_ID + FLAT_ID, + DROPPED_ZONES, + PROFILER_DONE, }; diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 41af5976a17..98e5817e547 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -65,12 +65,15 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); #define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12) +#if defined(PROFILE_KERNEL) namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); +} +#endif void enable_power_management() { // Mask and Hyst taken from tb_tensix math_tests diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index ab722594826..d48627c3e79 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -24,12 +24,15 @@ void ApplicationHandler(void) __attribute__((__section__(".init"))); } #endif +#if defined(PROFILE_KERNEL) namespace kernel_profiler { uint32_t wIndex __attribute__((used)); uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } +#endif uint8_t noc_index = 0; // TODO: remove hardcoding uint8_t my_x[NUM_NOCS] __attribute__((used)); diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index e8fc6889016..abd593271e5 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -48,12 +48,15 @@ constexpr uint32_t num_cbs_to_early_init = 4; // safe small number to overlap w CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); +#if defined(PROFILE_KERNEL) namespace kernel_profiler { uint32_t wIndex __attribute__((used)); uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); } +#endif //inline void RISC_POST_STATUS(uint32_t status) { // volatile uint32_t* ptr = (volatile uint32_t*)(NOC_CFG(ROUTER_CFG_2)); @@ -101,7 +104,7 @@ int main() { DEBUG_STATUS("GD"); { - DeviceZoneScopedMainN("ERISC-FW"); + DeviceZoneScopedMainN("ERISC-IDLE-FW"); noc_index = mailboxes->launch.brisc_noc_id; diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index c3442f54053..5160d355fe1 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -33,12 +33,18 @@ uint32_t atomic_ret_val __attribute__((section("l1_data"))) __attribute__((used) CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); +#if defined(PROFILE_KERNEL) namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); + uint16_t core_flat_id __attribute__((used)); + uint32_t nocWriteSize __attribute__((used)); + uint32_t *nocWriteBuffer __attribute__((used)); + uint32_t *nocWriteIndex __attribute__((used)); +} +#endif extern "C" void ncrisc_resume(void); extern "C" void notify_brisc_and_halt(uint32_t status); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 32d2019fd37..61541c598d6 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -16,12 +16,14 @@ #include "circular_buffer.h" // clang-format on +#if defined(PROFILE_KERNEL) namespace kernel_profiler { -uint32_t wIndex __attribute__((used)); -uint32_t stackSize __attribute__((used)); -uint32_t sums[SUM_COUNT] __attribute__((used)); -uint32_t sumIDs[SUM_COUNT] __attribute__((used)); -} // namespace kernel_profiler + uint32_t wIndex __attribute__((used)); + uint32_t stackSize __attribute__((used)); + uint32_t sums[SUM_COUNT] __attribute__((used)); + uint32_t sumIDs[SUM_COUNT] __attribute__((used)); +} +#endif namespace ckernel { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 6e9892c130c..4520d575474 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1307,8 +1307,6 @@ bool Device::close() { if (not this->initialized_) { TT_THROW("Cannot close device {} that has not been initialized!", this->id_); } - this->deallocate_buffers(); - watcher_detach(this); for (const std::unique_ptr &hw_command_queue : hw_command_queues_) { if (hw_command_queue->manager.get_bypass_mode()) { @@ -1316,9 +1314,16 @@ bool Device::close() { } hw_command_queue->terminate(); } + + tt_metal::detail::DumpDeviceProfileResults(this, true); + this->trace_buffer_pool_.clear(); detail::EnableAllocs(this); + this->deallocate_buffers(); + watcher_detach(this); + + std::unordered_set not_done_dispatch_cores; std::unordered_set cores_to_skip; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 07bf38efdb2..efa9ef809a0 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -863,6 +863,7 @@ void kernel_main() { } bool done = false; while (!done) { + DeviceZoneScopedND("CQ-DISPATCH", block_noc_writes_to_clear, rd_block_idx ); if (cmd_ptr == cb_fence) { get_cb_page< dispatch_cb_base, diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 6c6a6c5d8d6..defedc88699 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -812,6 +812,7 @@ bool process_cmd(uint32_t& cmd_ptr, uint32_t& downstream_data_ptr, uint32_t& stride) { + DeviceZoneScopedND("PROCESS-CMD", block_noc_writes_to_clear, rd_block_idx ); volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; bool done = false; @@ -1101,9 +1102,9 @@ void kernel_main_hd() { uint32_t cmd_ptr = cmddat_q_base; uint32_t fence = cmddat_q_base; - bool done = false; while (!done) { + DeviceZoneScopedND("KERNEL-MAIN-HD", block_noc_writes_to_clear, rd_block_idx ); constexpr uint32_t preamble_size = 0; fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 681b118f2ce..a2fc03a7275 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -86,7 +86,12 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) { this->defines_ += "-DTENSIX_FIRMWARE -DLOCAL_MEM_EN=0 "; if (tt::tt_metal::getDeviceProfilerState()) { - this->defines_ += "-DPROFILE_KERNEL=1 "; + if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + //TODO(MO): Standard bit mask for device side profiler options + this->defines_ += "-DPROFILE_KERNEL=2 "; + } else { + this->defines_ += "-DPROFILE_KERNEL=1 "; + } } if (tt::llrt::OptionsG.get_watcher_enabled()) { diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 6e36559ca88..1026749baa8 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -47,10 +47,15 @@ RunTimeOptions::RunTimeOptions() { test_mode_enabled = false; profiler_enabled = false; + profile_dispatch_cores = false; #if defined(PROFILER) const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER"); if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') { profiler_enabled = true; + const char *profile_dispatch_str = std::getenv("TT_METAL_DEVICE_PROFILER_DISPATCH"); + if (profile_dispatch_str != nullptr && profile_dispatch_str[0] == '1') { + profile_dispatch_cores = true; + } } #endif TT_FATAL( diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 2004949da94..1defdde4fce 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -86,6 +86,7 @@ class RunTimeOptions { bool test_mode_enabled = false; bool profiler_enabled = false; + bool profile_dispatch_cores = false; bool null_kernels = false; @@ -213,6 +214,7 @@ class RunTimeOptions { inline void set_test_mode_enabled(bool enable) { test_mode_enabled = enable; } inline bool get_profiler_enabled() { return profiler_enabled; } + inline bool get_profiler_do_dispatch_cores() { return profile_dispatch_cores; } inline void set_kernels_nullified(bool v) { null_kernels = v; } inline bool get_kernels_nullified() { return null_kernels; } diff --git a/tt_metal/programming_examples/profiler/CMakeLists.txt b/tt_metal/programming_examples/profiler/CMakeLists.txt index a153ef8b787..091890d580d 100644 --- a/tt_metal/programming_examples/profiler/CMakeLists.txt +++ b/tt_metal/programming_examples/profiler/CMakeLists.txt @@ -3,6 +3,7 @@ set(PROFILER_EXAMPLES_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_custom_cycle_count/test_custom_cycle_count ${CMAKE_CURRENT_SOURCE_DIR}/test_full_buffer/test_full_buffer ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_op/test_multi_op + ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_cores/test_dispatch_cores ) CREATE_PGM_EXAMPLES_EXE("${PROFILER_EXAMPLES_SRCS}" "profiler") diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp index 35ae809be4f..587782c90ea 100644 --- a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp +++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp @@ -7,7 +7,7 @@ using namespace tt; -bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile = false) +bool RunCustomCycle(tt_metal::Device *device, int loop_count) { bool pass = true; diff --git a/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp new file mode 100644 index 00000000000..7db7a983293 --- /dev/null +++ b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" + +using namespace tt; + +void RunCustomCycle(tt_metal::Device *device, int loop_count) +{ + CoreCoord compute_with_storage_size = device->compute_with_storage_grid_size(); + CoreCoord start_core = {0, 0}; + CoreCoord end_core = {compute_with_storage_size.x - 1, compute_with_storage_size.y - 1}; + CoreRange all_cores(start_core, end_core); + + tt_metal::Program program = tt_metal::CreateProgram(); + + constexpr int loop_size = 50; + constexpr bool profile_device = true; + std::map kernel_defines = { + {"LOOP_COUNT", std::to_string(loop_count)}, + {"LOOP_SIZE", std::to_string(loop_size)} + }; + + tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( + program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count.cpp", + all_cores, + tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .defines = kernel_defines}); + + tt_metal::KernelHandle ncrisc_kernel = tt_metal::CreateKernel( + program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count.cpp", + all_cores, + tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines}); + + vector trisc_kernel_args = {}; + tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel( + program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count_compute.cpp", + all_cores, + tt_metal::ComputeConfig{.compile_args = trisc_kernel_args, .defines = kernel_defines} + ); + + EnqueueProgram(device->command_queue(), program, false); + tt_metal::DumpDeviceProfileResults(device, program); +} + +int main(int argc, char **argv) { + bool pass = true; + + try { + //////////////////////////////////////////////////////////////////////////// + // Device Setup + //////////////////////////////////////////////////////////////////////////// + int device_id = 0; + tt_metal::Device *device = + tt_metal::CreateDevice(device_id); + + int loop_count = 2000; + RunCustomCycle(device, loop_count); + + pass &= tt_metal::CloseDevice(device); + + } catch (const std::exception &e) { + pass = false; + // Capture the exception error message + log_error(LogTest, "{}", e.what()); + // Capture system call errors that may have returned from driver/kernel + log_error(LogTest, "System error message: {}", std::strerror(errno)); + } + + if (pass) { + log_info(LogTest, "Test Passed"); + } else { + TT_THROW("Test Failed"); + } + + TT_FATAL(pass); + + return 0; +} diff --git a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp index b0424fb2fe5..98b821ebb9f 100644 --- a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp +++ b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp @@ -58,7 +58,7 @@ int main(int argc, char **argv) { // Run 2 RunCustomCycle(device, PROFILER_OP_SUPPORT_COUNT); - tt_metal::detail::DumpDeviceProfileResults(device); + Finish(device->command_queue()); pass &= tt_metal::CloseDevice(device); diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 555d4cb56cf..77f94cbb6f6 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 555d4cb56cf07c1cb651da16e2addb0d6304a5b1 +Subproject commit 77f94cbb6f6725b6768668b5907a95e9e1e8d6ab diff --git a/tt_metal/tools/profiler/device_post_proc_config.py b/tt_metal/tools/profiler/device_post_proc_config.py index bdb362f5f78..7d285eae486 100644 --- a/tt_metal/tools/profiler/device_post_proc_config.py +++ b/tt_metal/tools/profiler/device_post_proc_config.py @@ -161,6 +161,36 @@ class test_full_buffer(default_setup): detectOps = False +class test_dispatch_cores(default_setup): + timerAnalysis = { + "Tensix CQ Dispatch": { + "across": "core", + "type": "adjacent", + "start": {"risc": "NCRISC", "zone_name": "CQ-DISPATCH"}, + "end": {"risc": "NCRISC", "zone_name": "CQ-DISPATCH"}, + }, + "Tensix CQ Prefetch": { + "across": "core", + "type": "adjacent", + "start": {"risc": "NCRISC", "zone_name": "KERNEL-MAIN-HD"}, + "end": {"risc": "NCRISC", "zone_name": "KERNEL-MAIN-HD"}, + }, + "Ethernet CQ Dispatch": { + "across": "core", + "type": "adjacent", + "start": {"risc": "ERISC", "zone_name": "CQ-DISPATCH"}, + "end": {"risc": "ERISC", "zone_name": "CQ-DISPATCH"}, + }, + "Ethernet CQ Prefetch": { + "across": "core", + "type": "adjacent", + "start": {"risc": "ERISC", "zone_name": "KERNEL-MAIN-HD"}, + "end": {"risc": "ERISC", "zone_name": "KERNEL-MAIN-HD"}, + }, + } + detectOps = False + + class test_noc(default_setup): timerAnalysis = { "NoC For Loop": { diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 4dab65d6805..366c49933cf 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -8,7 +8,7 @@ #include -#if defined(COMPILE_FOR_NCRISC) | defined(COMPILE_FOR_BRISC) | defined(COMPILE_FOR_ERISC) +#if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) #include "risc_common.h" #include "dataflow_api.h" #else @@ -33,7 +33,7 @@ #define PROFILER_MSG __FILE__ "," $Line ",KERNEL_PROFILER" #define PROFILER_MSG_NAME( name ) name "," PROFILER_MSG -#ifdef PROFILE_KERNEL +#if defined(PROFILE_KERNEL) && ( !defined(DISPATCH_KERNEL) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES))) namespace kernel_profiler{ extern uint32_t wIndex; @@ -42,20 +42,29 @@ namespace kernel_profiler{ extern uint32_t sums[SUM_COUNT]; extern uint32_t sumIDs[SUM_COUNT]; +#if (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)) + extern uint32_t nocWriteSize; + extern uint32_t *nocWriteBuffer; + extern uint32_t *nocWriteIndex; +#endif + + constexpr uint32_t QUICK_PUSH_MARKER_COUNT = 2; + #if defined(COMPILE_FOR_BRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_BR; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_ERISC) constexpr uint32_t profilerBuffer = eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_ER; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL); - uint16_t core_flat_id; + extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_NCRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_NC; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_NC; volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + extern uint16_t core_flat_id; #elif COMPILE_FOR_TRISC == 0 constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T0; constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T0; @@ -70,6 +79,18 @@ namespace kernel_profiler{ volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); #endif + constexpr uint32_t Hash32_CT( const char * str, size_t n, uint32_t basis = UINT32_C( 2166136261 ) ) { + return n == 0 ? basis : Hash32_CT( str + 1, n - 1, ( basis ^ str[ 0 ] ) * UINT32_C( 16777619 ) ); + } + + template< size_t N > + constexpr uint32_t Hash16_CT( const char ( &s )[ N ] ) { + auto res = Hash32_CT( s, N - 1 ); + return ((res & 0xFFFF) ^ ((res & 0xFFFF0000) >> 16)) & 0xFFFF; + } + +#define SrcLocNameToHash( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME( name )); + inline __attribute__((always_inline)) void init_profiler(uint16_t briscKernelID = 0, uint16_t ncriscKernelID = 0, uint16_t triscsKernelID = 0) { wIndex = CUSTOM_MARKERS; @@ -81,8 +102,13 @@ namespace kernel_profiler{ sums[i] = 0; } -#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) +#if (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)) + nocWriteSize = 0; +#endif + +#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) uint32_t runCounter = profiler_control_buffer[RUN_COUNTER]; + profiler_control_buffer[PROFILER_DONE] = 0; #if defined(COMPILE_FOR_ERISC) volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); @@ -114,7 +140,8 @@ namespace kernel_profiler{ eriscBuffer [ID_LL] = runCounter; #endif //ERISC_INIT -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); @@ -186,6 +213,22 @@ namespace kernel_profiler{ buffer[index+1] = p_reg[0]; } + inline __attribute__((always_inline)) void mark_start_at_index_inlined(uint32_t index) + { + volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); + volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); + buffer[index+1] = p_reg[0]; + } + + inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id) + { + volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); + volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); + buffer[index] = 0x80000000 | ((timer_id_s & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF); + buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF); + buffer[index+3] = p_reg[0]; + } + PROFILER_INLINE void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) @@ -206,6 +249,13 @@ namespace kernel_profiler{ profiler_control_buffer[FW_RESET_H] = time_H; } + + inline __attribute__((always_inline)) void mark_dropped_timestamps(uint32_t index) + { + uint32_t curr = profiler_control_buffer[DROPPED_ZONES]; + profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; + } + inline __attribute__((always_inline)) void risc_finished_profiling() { for (int i = 0; i < SUM_COUNT; i ++) @@ -227,17 +277,19 @@ namespace kernel_profiler{ mark_padding(); } profiler_control_buffer[kernel_profiler::deviceBufferEndIndex] = wIndex; - } inline __attribute__((always_inline)) void finish_profiler() { risc_finished_profiling(); -#if (defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC)) - +#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC) + if (profiler_control_buffer[PROFILER_DONE] == 1){ + return; + } uint32_t pageSize = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram; + while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); uint32_t dram_profiler_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS]; #if defined(COMPILE_FOR_ERISC) @@ -269,74 +321,143 @@ namespace kernel_profiler{ } else { - profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; + mark_dropped_timestamps(hostIndex); } #endif -#if defined(COMPILE_FOR_BRISC) +#if defined(COMPILE_FOR_BRISC) int hostIndex; int deviceIndex; - for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; - (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); - hostIndex++, deviceIndex++) - { - if (profiler_control_buffer[deviceIndex]) - { - uint32_t currEndIndex = - profiler_control_buffer[deviceIndex] + - profiler_control_buffer[hostIndex]; - - uint32_t dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - profiler_control_buffer[hostIndex] * sizeof(uint32_t); + for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; + (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); + hostIndex++, deviceIndex++) + { + if (profiler_control_buffer[deviceIndex]) + { + uint32_t currEndIndex = + profiler_control_buffer[deviceIndex] + + profiler_control_buffer[hostIndex]; + + if (currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + { + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[hostIndex] * sizeof(uint32_t); + + const InterleavedAddrGen s = { + .bank_base_address = dram_profiler_address, + .page_size = pageSize + }; + + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + + noc_async_write( + PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, + dram_bank_dst_noc_addr, + profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); + + profiler_control_buffer[hostIndex] = currEndIndex; + } + else if (profiler_control_buffer[RUN_COUNTER] < 1) + { + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * + PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC; + + const InterleavedAddrGen s = { + .bank_base_address = dram_profiler_address, + .page_size = pageSize + }; - const InterleavedAddrGen s = { - .bank_base_address = dram_profiler_address, - .page_size = pageSize - }; - - if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) - { uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); noc_async_write( PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, dram_bank_dst_noc_addr, - profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); - - profiler_control_buffer[hostIndex] = currEndIndex; + CUSTOM_MARKERS * sizeof(uint32_t)); + mark_dropped_timestamps(hostIndex); + } + else{ + mark_dropped_timestamps(hostIndex); } - else - { - profiler_control_buffer[hostIndex] = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC+1; - } - - profiler_control_buffer[deviceIndex] = 0; - } - } + profiler_control_buffer[deviceIndex] = 0; + } + } #endif noc_async_write_barrier(); profiler_control_buffer[RUN_COUNTER] ++; + profiler_control_buffer[PROFILER_DONE] = 1; #endif } - constexpr uint32_t Hash32_CT( const char * str, size_t n, uint32_t basis = UINT32_C( 2166136261 ) ) { - return n == 0 ? basis : Hash32_CT( str + 1, n - 1, ( basis ^ str[ 0 ] ) * UINT32_C( 16777619 ) ); - } + inline __attribute__((always_inline)) void quick_push () + { +#if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) + SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); + mark_time_at_index_inlined(wIndex, hash); + core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; - template< size_t N > - constexpr uint32_t Hash16_CT( const char ( &s )[ N ] ) { - auto res = Hash32_CT( s, N - 1 ); - return ((res & 0xFFFF) ^ ((res & 0xFFFF0000) >> 16)) & 0xFFFF; + uint32_t dram_offset = + (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + HOST_BUFFER_END_INDEX_NC * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] * sizeof(uint32_t); + + while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); + const InterleavedAddrGen s = { + .bank_base_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS], + .page_size = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram + }; + + uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); + + mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash)); + wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE; + + uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; + + if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + { + noc_async_write( + PROFILER_L1_BUFFER_NC, + dram_bank_dst_noc_addr, + wIndex * sizeof(uint32_t)); + + nocWriteSize += (wIndex * sizeof(uint32_t)); + + profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] = currEndIndex; + + } + else + { + mark_dropped_timestamps(HOST_BUFFER_END_INDEX_NC); + } + + wIndex = CUSTOM_MARKERS; + + nocWriteBuffer[(*nocWriteIndex)] = nocWriteBuffer[(*nocWriteIndex)] + (( nocWriteSize + NOC_MAX_BURST_SIZE -1 )/NOC_MAX_BURST_SIZE); + nocWriteSize = 0; +#endif } - template + + template struct profileScope { bool start_marked = false; - PROFILER_INLINE profileScope () + inline __attribute__((always_inline)) profileScope () { - if (wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize)) + bool bufferHasRoom = false; + if constexpr (dispatch) + { + bufferHasRoom = wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize - (QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE)); + } + else + { + bufferHasRoom = wIndex < (PROFILER_L1_VECTOR_SIZE - stackSize); + } + + if (bufferHasRoom) { stackSize += PROFILER_L1_MARKER_UINT32_SIZE; start_marked = true; @@ -345,7 +466,7 @@ namespace kernel_profiler{ } } - PROFILER_INLINE ~profileScope () + inline __attribute__((always_inline)) ~profileScope () { if (start_marked) { @@ -354,6 +475,14 @@ namespace kernel_profiler{ start_marked = false; stackSize -= PROFILER_L1_MARKER_UINT32_SIZE; } + + if constexpr (dispatch) + { + if (wIndex >= (PROFILER_L1_VECTOR_SIZE - (QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE))) + { + quick_push(); + } + } } }; @@ -403,8 +532,11 @@ namespace kernel_profiler{ } + #define DeviceZoneScopedN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScope zone = kernel_profiler::profileScope(); +#define DeviceZoneScopedND( name , nocBuffer, nocIndex ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScope zone = kernel_profiler::profileScope(); kernel_profiler::nocWriteBuffer = nocBuffer; kernel_profiler::nocWriteIndex = &nocIndex; + #define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); #define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); @@ -425,4 +557,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) +#define DeviceZoneScopedND( name , nocBuffer, nocIndex ) + #endif diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 4bc38a7086f..f444731c287 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -7,7 +7,6 @@ #include #include - #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tools/profiler/profiler.hpp" @@ -66,8 +65,8 @@ void DeviceProfiler::readRiscProfilerResults( riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_ER); } - if ((control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_BR] == 0) && + (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_NC] == 0) && (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_ER] == 0)) { return; @@ -79,10 +78,11 @@ void DeviceProfiler::readRiscProfilerResults( if (bufferEndIndex > 0) { uint32_t bufferRiscShift = riscNum * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC + startIndex; - if (bufferEndIndex > PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) + if ((control_buffer[kernel_profiler::DROPPED_ZONES] >> riscEndIndex) & 1) { - log_warning("Profiler DRAM buffers were full, markers were dropped! device {}, worker core {}, {}, Risc {}, bufferEndIndex = {}, host_size = {}", device_id, worker_core.x, worker_core.y, tracy::riscName[riscEndIndex], bufferEndIndex , PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC ); - bufferEndIndex = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; + std::string warningMsg = fmt::format("Profiler DRAM buffers were full, markers were dropped! device {}, worker core {}, {}, Risc {}, bufferEndIndex = {}", device_id, worker_core.x, worker_core.y, tracy::riscName[riscEndIndex], bufferEndIndex); + TracyMessageC(warningMsg.c_str(), warningMsg.size(), tracy::Color::Tomato3); + log_warning(warningMsg.c_str()); } uint32_t riscNumRead = 0; @@ -180,11 +180,13 @@ void DeviceProfiler::readRiscProfilerResults( riscNum ++; } - std::vector zero_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer_reset(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + control_buffer_reset[kernel_profiler::DRAM_PROFILER_ADDRESS] = output_dram_buffer->address(); + tt::llrt::write_hex_vec_to_core( device_id, worker_core, - zero_buffer, + control_buffer_reset, PROFILER_L1_BUFFER_CONTROL); } @@ -236,7 +238,9 @@ void DeviceProfiler::dumpResultToFile( tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); - device_events.push_back(event); + auto ret = device_events.insert(event); + + if (!ret.second) return; firstTimestamp(timestamp); @@ -286,6 +290,7 @@ DeviceProfiler::~DeviceProfiler() { #if defined(PROFILER) ZoneScoped; + pushTracyDeviceResults(); for (auto tracyCtx : device_tracy_contexts) { TracyTTDestroy(tracyCtx.second); @@ -375,25 +380,6 @@ void DeviceProfiler::dumpResults ( } - for (const auto &worker_core : worker_cores) { - std::pair device_core = {device_id, worker_core}; - if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end()) - { - auto tracyCtx = TracyTTContext(); - std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y); - TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency); - TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size()); - - device_tracy_contexts.emplace( - device_core, - tracyCtx - ); - } - } - - std::sort (device_events.begin(), device_events.end()); - - pushTracyDeviceResults(); } else { @@ -405,6 +391,41 @@ void DeviceProfiler::dumpResults ( void DeviceProfiler::pushTracyDeviceResults() { #if defined(PROFILER) && defined(TRACY_ENABLE) + ZoneScoped; + std::set> device_cores_set; + std::vector> device_cores; + for (auto& event: device_events) + { + std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}}; + auto ret = device_cores_set.insert(device_core); + if (ret.second ) + { + device_cores.push_back(device_core); + } + } + + for (auto& device_core: device_cores) + { + int device_id = device_core.first; + CoreCoord worker_core = device_core.second; + + + if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end()) + { + auto tracyCtx = TracyTTContext(); + std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y); + + TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency); + + TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size()); + + device_tracy_contexts.emplace( + device_core, + tracyCtx + ); + } + } + for (auto& event: device_events) { std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}}; diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 189782d186a..129c4e6cc85 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -51,7 +51,7 @@ class DeviceProfiler { std::map, TracyTTCtx> device_tracy_contexts; // Device-Core tracy context - std::vector device_events; + std::set device_events; // Hash to zone source locations std::unordered_map hash_to_zone_src_locations; diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 05653e0b778..1237ca64c59 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -2,6 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 +#include +#include + #include "tt_metal/host_api.hpp" #include "impl/debug/dprint_server.hpp" @@ -106,7 +109,7 @@ void InitDeviceProfiler(Device *device){ #endif } -void DumpDeviceProfileResults(Device *device, bool free_buffers) { +void DumpDeviceProfileResults(Device *device, bool lastDump) { #if defined(PROFILER) std::vector workerCores; auto device_id = device->id(); @@ -115,36 +118,111 @@ void DumpDeviceProfileResults(Device *device, bool free_buffers) { const CoreCoord curr_core = device->worker_core_from_logical_core(core); workerCores.push_back(curr_core); } - for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs)) { - CoreType dispatch_core_type = tt::get_dispatch_core_type(device_id, device_num_hw_cqs); - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); - workerCores.push_back(curr_core); - } - for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) - { - workerCores.push_back(core); + for (const CoreCoord& core : device->get_active_ethernet_cores(true)){ + auto physicalCore = device->physical_core_from_logical_core(core, CoreType::ETH); + workerCores.push_back(physicalCore); } - DumpDeviceProfileResults(device, workerCores, free_buffers); + DumpDeviceProfileResults(device, workerCores, lastDump); #endif } -void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool free_buffers){ + +void DumpDeviceProfileResults(Device *device, std::vector &worker_cores, bool lastDump){ #if defined(PROFILER) ZoneScoped; + + if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + auto device_id = device->id(); + auto device_num_hw_cqs = device->num_hw_cqs(); + for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs)) { + CoreType dispatch_core_type = tt::get_dispatch_core_type(device_id, device_num_hw_cqs); + const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + worker_cores.push_back(curr_core); + } + for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores){ + worker_cores.push_back(core); + } + } if (getDeviceProfilerState()) { - const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; - if (USE_FAST_DISPATCH) + if (!lastDump) + { + const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; + if (USE_FAST_DISPATCH) + { + Finish(device->command_queue()); + } + } + else { - Finish(device->command_queue()); + if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) + { + bool waitForDispatch = true; + uint8_t loopCount = 0; + CoreCoord unfinishedCore = {0,0}; + constexpr uint8_t maxLoopCount = 10; + constexpr uint32_t loopDuration_us = 10000; + while (waitForDispatch) + { + waitForDispatch = false; + std::this_thread::sleep_for(std::chrono::microseconds(loopDuration_us)); + auto device_id = device->id(); + auto device_num_hw_cqs = device->num_hw_cqs(); + loopCount++; + if (loopCount > maxLoopCount) + { + std::string msg = fmt::format( + "Device profiling never finished on device {}, worker core {}, {}", + device_id, unfinishedCore.x, unfinishedCore.y); + TracyMessageC(msg.c_str(), msg.size(), tracy::Color::Tomato3); + log_warning(msg.c_str()); + break; + } + for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs)) + { + CoreType dispatch_core_type = tt::get_dispatch_core_type(device_id, device_num_hw_cqs); + const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + vector control_buffer = tt::llrt::read_hex_vec_from_core( + device_id, + curr_core, + PROFILER_L1_BUFFER_CONTROL, + PROFILER_L1_CONTROL_BUFFER_SIZE); + if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) + { + unfinishedCore = curr_core; + waitForDispatch = true; + continue; + } + } + if (waitForDispatch) + { + continue; + } + for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) + { + vector control_buffer = tt::llrt::read_hex_vec_from_core( + device_id, + core, + eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL, + PROFILER_L1_CONTROL_BUFFER_SIZE); + if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) + { + unfinishedCore = core; + waitForDispatch = true; + continue; + } + } + + } + } } - TT_FATAL(DprintServerIsRunning() == false, "Debug print server is running, cannot dump device profiler data"); + TT_FATAL(DprintServerIsRunning() == false, "Debug print server is running, cannot dump device profiler data"); auto device_id = device->id(); if (tt_metal_device_profiler_map.find(device_id) != tt_metal_device_profiler_map.end()) { tt_metal_device_profiler_map.at(device_id).setDeviceArchitecture(device->arch()); tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores); - if (free_buffers) + if (lastDump) { // Process is ending, no more device dumps are coming, reset your ref on the buffer so deallocate is the last // owner.