From 374d29e987886b57fe187b3294e1b4631962efaa Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 26 Jun 2024 21:29:06 +0000 Subject: [PATCH] #3764: Device side opID support --- tests/scripts/run_profiler_regressions.sh | 43 +++++++------- tt_eager/tt_dnn/op_library/run_operation.cpp | 9 +++ tt_metal/hw/firmware/src/brisc.cc | 1 + tt_metal/hw/firmware/src/erisc.cc | 1 + tt_metal/hw/firmware/src/idle_erisc.cc | 1 + tt_metal/hw/inc/dev_msgs.h | 3 + tt_metal/impl/dispatch/command_queue.cpp | 6 +- tt_metal/impl/program/program.cpp | 4 +- tt_metal/impl/program/program.hpp | 3 + tt_metal/third_party/tracy | 2 +- tt_metal/tools/profiler/kernel_profiler.hpp | 58 ++++++++++++------- tt_metal/tools/profiler/process_device_log.py | 10 +++- tt_metal/tools/profiler/process_ops_logs.py | 4 ++ tt_metal/tools/profiler/profiler.cpp | 14 +++-- tt_metal/tools/profiler/profiler.hpp | 1 + tt_metal/tt_metal.cpp | 2 +- ttnn/cpp/ttnn/device_operation.hpp | 2 + 17 files changed, 111 insertions(+), 53 deletions(-) diff --git a/tests/scripts/run_profiler_regressions.sh b/tests/scripts/run_profiler_regressions.sh index f8fc6a69759..fc8fa936a59 100755 --- a/tests/scripts/run_profiler_regressions.sh +++ b/tests/scripts/run_profiler_regressions.sh @@ -4,26 +4,6 @@ source scripts/tools_setup_common.sh set -eo pipefail -run_additional_T3000_test(){ - remove_default_log_locations - mkdir -p $PROFILER_ARTIFACTS_DIR - - ./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log - - cat $PROFILER_ARTIFACTS_DIR/test_out.log - - if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" - then - echo "No verification as test was skipped" - else - echo "Verifying test results" - runDate=$(ls $PROFILER_OUTPUT_DIR/) - LINE_COUNT=9 #1 header + 8 devices - res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") - echo $res - fi -} - run_async_mode_T3000_test(){ #Some tests here do not skip grayskull if [ "$ARCH_NAME" != "grayskull" ]; then @@ -47,6 +27,27 @@ run_async_mode_T3000_test(){ fi } +run_additional_T3000_test(){ + remove_default_log_locations + mkdir -p $PROFILER_ARTIFACTS_DIR + + ./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log + + cat $PROFILER_ARTIFACTS_DIR/test_out.log + + if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" + then + echo "No verification as test was skipped" + else + echo "Verifying test results" + runDate=$(ls $PROFILER_OUTPUT_DIR/) + LINE_COUNT=9 #1 header + 8 devices + res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") + echo $res + run_async_mode_T3000_test + fi +} + run_profiling_test(){ if [[ -z "$ARCH_NAME" ]]; then echo "Must provide ARCH_NAME in environment" 1>&2 @@ -60,8 +61,6 @@ run_profiling_test(){ run_additional_T3000_test - run_async_mode_T3000_test - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py remove_default_log_locations diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp index 2c6085773b3..cc6954969e9 100644 --- a/tt_eager/tt_dnn/op_library/run_operation.cpp +++ b/tt_eager/tt_dnn/op_library/run_operation.cpp @@ -224,6 +224,15 @@ OutputTensors run_device_operation( operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors); uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id(); + if (std::holds_alternative>(program)) + { + std::get>(program).get().set_global_id(op_id); + } + else + { + std::get>(program)->set_global_id(op_id); + } + // Enqueue or Launch Program std::visit( [&operation, &input_tensors, &optional_input_tensors, &output_tensors, queue](auto&& program) { diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 41134ccaf90..739828edc20 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -366,6 +366,7 @@ int main() { { DeviceZoneScopedMainN("BRISC-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); // Copies from L1 to IRAM on chips where NCRISC has IRAM l1_to_ncrisc_iram_copy(mailboxes->launch.kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 4523be27cfa..c3ebdb826a1 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -79,6 +79,7 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { // FD: assume that no more host -> remote writes are pending if (mailboxes->launch.go.run == RUN_MSG_GO) { DeviceZoneScopedMainN("ERISC-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); DEBUG_STATUS("R"); uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index bf7b746cb71..3d825bb98ba 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -108,6 +108,7 @@ int main() { { DeviceZoneScopedMainN("ERISC-IDLE-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); noc_index = mailboxes->launch.kernel_config.brisc_noc_id; diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index c6a6cde74a2..87fe99896ba 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -80,6 +80,8 @@ struct kernel_config_msg_t { volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units + volatile uint16_t host_assigned_op_id; + // Ring buffer of kernel configuration data volatile uint32_t kernel_config_base; dyn_mem_map_t mem_map[DISPATCH_CLASS_MAX]; @@ -92,6 +94,7 @@ struct kernel_config_msg_t { volatile uint8_t dispatch_core_y; volatile uint8_t exit_erisc_kernel; volatile uint8_t pad1; + volatile uint16_t pad2; } __attribute__((packed)); struct go_msg_t { diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 2f36d29a638..c4fa3c5f62b 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -651,6 +651,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { } void EnqueueProgramCommand::assemble_device_commands() { + ZoneScoped; auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id]; if (!program.is_finalized()) { // Calculate size of command and fill program indices of data to update @@ -895,6 +896,7 @@ void EnqueueProgramCommand::assemble_device_commands() { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { CoreCoord physical_start = @@ -917,11 +919,11 @@ void EnqueueProgramCommand::assemble_device_commands() { this->packed_write_max_unicast_sub_cmds, multicast_go_signals_payload); } - for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { @@ -1113,6 +1115,7 @@ void EnqueueProgramCommand::assemble_device_commands() { } } else { uint32_t i = 0; + ZoneScopedN("program_loaded_on_device"); for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; for (const shared_ptr& cb : cbs_on_core_range) { @@ -1134,6 +1137,7 @@ void EnqueueProgramCommand::assemble_device_commands() { for (auto& go_signal : cached_program_command_sequence.go_signals) { go_signal->kernel_config.dispatch_core_x = this->dispatch_core.x; go_signal->kernel_config.dispatch_core_y = this->dispatch_core.y; + go_signal->kernel_config.host_assigned_op_id = program.get_global_id(); } } } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index a355e2dbe21..614700e7186 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -81,7 +81,7 @@ void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } std::atomic Program::program_counter = 0; Program::Program() : - id(program_counter++), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { + id(program_counter++), global_id(0), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { std::set supported_core_types = {CoreType::WORKER, CoreType::ETH}; for (const auto &core_type : supported_core_types) { kernels_.insert({core_type, {}}); @@ -934,5 +934,7 @@ void Program::compile(Device *device) { this->loaded_onto_device = false; } +void Program::set_global_id(uint64_t id) { this->global_id = id; } + Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 48cb966b976..ea19daff48d 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -80,11 +80,13 @@ class Program { Program(Program &&other) = default; Program& operator=(Program &&other) = default; + void set_global_id(uint64_t id); ~Program(); void construct_core_range_set_for_worker_cores(); const uint64_t get_id() const { return this->id; } + const uint64_t get_global_id() const { return this->global_id; } size_t num_kernels() const { size_t count = 0; @@ -177,6 +179,7 @@ class Program { }; uint64_t id; // Need to make non-const due to move constructor + uint64_t global_id; // Need to make non-const due to move constructor static std::atomic program_counter; std::unordered_map >> kernels_; std::unordered_map grid_extent_; diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 2591e70eaca..71d4c8d378b 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 2591e70eaca0a12705ea23cbe4059e086c9a2a9f +Subproject commit 71d4c8d378b52af7da7012b9b595a61e9304f0bb diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 343dcbbd3c5..b307be780eb 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -111,7 +111,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { eriscBuffer[i] = 0; @@ -124,14 +123,13 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { //TODO(MO): Clean up magic numbers eriscBuffer[i] = 0x80000000; } - eriscBuffer [ID_LL] = runCounter; + eriscBuffer [ID_LL] = (runCounter & 0xFFFF) | (eriscBuffer [ID_LL] & 0xFFFF0000); #endif //ERISC_INIT #if defined(COMPILE_FOR_BRISC) @@ -146,7 +144,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { briscBuffer[i] = 0; @@ -167,10 +164,9 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { - //TODO(MO): Clean up magic numbers + //TODO(MO): Clean up magic numbers briscBuffer[i] = 0x80000000; ncriscBuffer[i] = 0x80000000; trisc0Buffer[i] = 0x80000000; @@ -178,12 +174,11 @@ namespace kernel_profiler{ trisc2Buffer[i] = 0x80000000; } - //TODO(MO): Clean up magic numbers - briscBuffer [ID_LL] = runCounter; - ncriscBuffer[ID_LL] = runCounter; - trisc0Buffer[ID_LL] = runCounter; - trisc1Buffer[ID_LL] = runCounter; - trisc2Buffer[ID_LL] = runCounter; + briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000); + ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000); + trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000); + trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000); + trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000); #endif //BRISC_INIT @@ -208,14 +203,6 @@ namespace kernel_profiler{ buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; } - inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id) - { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); - volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); - buffer[index+3] = p_reg[WALL_CLOCK_LOW_INDEX]; - } - inline __attribute__((always_inline)) void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) @@ -233,6 +220,28 @@ namespace kernel_profiler{ profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; } + inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue) + { +#if defined(COMPILE_FOR_ERISC) + volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); + + eriscBuffer[ID_LL] = (counterValue << 16) | (eriscBuffer[ID_LL] & 0xFFFF); +#endif //ERISC_INIT + +#if defined(COMPILE_FOR_BRISC) + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); + volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); + volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); + volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T1); + volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T2); + + briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF); + ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF); + trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF); + trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF); + trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF); +#endif //ERISC_INIT + } inline __attribute__((always_inline)) void risc_finished_profiling() { @@ -374,6 +383,7 @@ namespace kernel_profiler{ #if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); mark_time_at_index_inlined(wIndex, hash); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; uint32_t dram_offset = @@ -389,8 +399,8 @@ namespace kernel_profiler{ uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash)); - wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE; + mark_time_at_index_inlined(wIndex, get_end_timer_id(hash)); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; @@ -523,6 +533,8 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate zone = kernel_profiler::profileScopeAccumulate(); +#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter); + #else #define DeviceZoneScopedMainN( name ) @@ -537,4 +549,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedND( name , nocBuffer, nocIndex ) +#define DeviceZoneSetCounter( counter ) + #endif diff --git a/tt_metal/tools/profiler/process_device_log.py b/tt_metal/tools/profiler/process_device_log.py index 65a47046be0..8495150c608 100755 --- a/tt_metal/tools/profiler/process_device_log.py +++ b/tt_metal/tools/profiler/process_device_log.py @@ -209,7 +209,15 @@ def import_device_profile_log(logPath): timerID = {"id": int(row[4].strip()), "zone_name": "", "zone_phase": "", "src_line": "", "src_file": ""} timeData = int(row[5].strip()) statData = 0 - if len(row) > 6: + if len(row) == 13: + statData = int(row[6].strip()) + timerID["run_id"] = int(row[7].strip()) + timerID["op_id"] = int(row[8].strip()) + timerID["zone_name"] = row[9].strip() + timerID["zone_phase"] = row[10].strip() + timerID["src_line"] = int(row[11].strip()) + timerID["src_file"] = row[12].strip() + elif len(row) == 12: statData = int(row[6].strip()) timerID["run_id"] = int(row[7].strip()) timerID["zone_name"] = row[8].strip() diff --git a/tt_metal/tools/profiler/process_ops_logs.py b/tt_metal/tools/profiler/process_ops_logs.py index 5a0259a5f9b..6b531be4ced 100755 --- a/tt_metal/tools/profiler/process_ops_logs.py +++ b/tt_metal/tools/profiler/process_ops_logs.py @@ -197,6 +197,10 @@ def append_device_data(ops, deviceLogFolder): cores = set() for timeID, ts, statData, risc, core in deviceOpTime["timeseries"]: if "zone_name" in timeID.keys() and "FW" in timeID["zone_name"]: + if "op_id" in timeID.keys(): + assert ( + timeID["op_id"] == deviceOp["global_call_count"] + ), f"op id {timeID['op_id']} reproted by device is not matching assigned op id {deviceOp['global_call_count']}" if core not in cores: cores.add(core) deviceOp["core_usage"] = {"count": len(cores), "cores": [str(core) for core in cores]} diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index ed8000cf91f..b5e64d7143b 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -88,6 +88,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t riscNumRead = 0; uint32_t coreFlatIDRead = 0; uint32_t runCounterRead = 0; + uint32_t runHostCounterRead = 0; bool newRunStart = false; @@ -108,7 +109,8 @@ void DeviceProfiler::readRiscProfilerResults( //TODO(MO): Cleanup magic numbers riscNumRead = profile_buffer[index] & 0x7; coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF; - runCounterRead = profile_buffer[index + 1]; + runCounterRead = profile_buffer[index + 1] & 0xFFFF; + runHostCounterRead = (profile_buffer[index + 1] >> 16 ) & 0xFFFF; } else @@ -149,6 +151,7 @@ void DeviceProfiler::readRiscProfilerResults( dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -167,6 +170,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_L = opTime_L; dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -201,6 +205,7 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp) void DeviceProfiler::dumpResultToFile( uint32_t run_id, + uint32_t run_host_id, int device_id, CoreCoord core, int core_flat, @@ -237,7 +242,7 @@ void DeviceProfiler::dumpResultToFile( source_line = stoi(source_line_str); } - tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); + tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_host_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); auto ret = device_events.insert(event); @@ -249,7 +254,7 @@ void DeviceProfiler::dumpResultToFile( { log_file.open(log_path); log_file << "ARCH: " << get_string_lowercase(device_architecture) << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl; - log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, Run ID, zone name, zone phase, source line, source file" << std::endl; + log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, run ID, run host ID, zone name, zone phase, source line, source file" << std::endl; new_log = false; } else @@ -258,7 +263,7 @@ void DeviceProfiler::dumpResultToFile( } //log_file << fmt::format("{:4},{:3},{:3},{:>7},{:7},{:15},{:15},{:5},{:>25},{:>6},{:6},{}", - log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{}", + log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{},{}", device_id, core.x, core.y, @@ -267,6 +272,7 @@ void DeviceProfiler::dumpResultToFile( timestamp, stat_value, run_id, + run_host_id, zone_name, magic_enum::enum_name(zone_phase), source_line, diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 79bbf2a3777..8370e3ec97c 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -72,6 +72,7 @@ class DeviceProfiler { // Dumping profile result to file void dumpResultToFile( uint32_t runID, + uint32_t runHostID, int device_id, CoreCoord core, int core_flat, diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 54d60c44523..c2d9dc1f18a 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -532,7 +532,7 @@ void LaunchProgram(Device *device, Program &program, bool wait_until_cores_done) for (const auto &[core_type, logical_cores] : logical_cores_used_in_program) { for (const auto &logical_core : logical_cores) { launch_msg_t *msg = &program.kernels_on_core(logical_core, core_type)->launch_msg; - + msg->kernel_config.host_assigned_op_id = program.get_global_id(); auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); tt::llrt::write_launch_msg_to_core(device->id(), physical_core, msg); diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index a29bb23dca0..82a057ee433 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -293,6 +293,8 @@ typename device_operation_t::tensor_return_value_t run( auto& program = create_or_get_program_from_cache( program_cache, program_cache_hit, program_hash, operation_attributes, tensor_args, tensor_return_value); + program.set_global_id(operation_id); + if (USE_FAST_DISPATCH) { ZoneScopedN("EnqueueProgram"); auto& queue = device->command_queue(cq_id);