From c4366b9b81c9d2696854a061f312f30de1484a1c Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 26 Jun 2024 21:29:06 +0000 Subject: [PATCH] #3764: Device side opID support - Device side opID is verified against the expected ID extracted by host - Tracy device zones include opID --- tt_eager/tt_dnn/op_library/run_operation.cpp | 9 ++++ tt_metal/hw/firmware/src/brisc.cc | 1 + tt_metal/hw/inc/dev_msgs.h | 2 + tt_metal/impl/dispatch/command_queue.cpp | 7 ++- tt_metal/impl/program/program.cpp | 2 + tt_metal/impl/program/program.hpp | 3 ++ tt_metal/third_party/tracy | 2 +- tt_metal/tools/profiler/kernel_profiler.hpp | 48 +++++++++++-------- tt_metal/tools/profiler/process_device_log.py | 10 +++- tt_metal/tools/profiler/process_ops_logs.py | 4 ++ tt_metal/tools/profiler/profiler.cpp | 14 ++++-- tt_metal/tools/profiler/profiler.hpp | 1 + tt_metal/tt_metal.cpp | 1 + ttnn/cpp/ttnn/device_operation.hpp | 2 + 14 files changed, 78 insertions(+), 28 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp index 2c6085773b3..cc6954969e9 100644 --- a/tt_eager/tt_dnn/op_library/run_operation.cpp +++ b/tt_eager/tt_dnn/op_library/run_operation.cpp @@ -224,6 +224,15 @@ OutputTensors run_device_operation( operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors); uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id(); + if (std::holds_alternative>(program)) + { + std::get>(program).get().set_global_id(op_id); + } + else + { + std::get>(program)->set_global_id(op_id); + } + // Enqueue or Launch Program std::visit( [&operation, &input_tensors, &optional_input_tensors, &output_tensors, queue](auto&& program) { diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index d88ea516cea..587fef5408c 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -365,6 +365,7 @@ int main() { { DeviceZoneScopedMainN("BRISC-FW"); + DeviceZoneSetCounter(mailboxes->launch.host_assigned_op_id); // Copies from L1 to IRAM on chips where NCRISC has IRAM l1_to_ncrisc_iram_copy(mailboxes->launch.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index c63ac707050..c9f042ea22c 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -65,6 +65,8 @@ struct launch_msg_t { // must be cacheline aligned volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX_PROC]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units + volatile uint16_t host_assigned_op_id; + // Ring buffer of kernel configuration data volatile uint32_t kernel_config_base; volatile uint16_t rta_offsets[DISPATCH_CLASS_MAX_PROC]; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index f18f2dbc375..1ae1b2401db 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -598,8 +598,10 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { } void EnqueueProgramCommand::assemble_device_commands() { + ZoneScoped; auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id]; if (!program.loaded_onto_device) { + ZoneScopedN("program_not_loaded_on_device"); // Calculate size of command and fill program indices of data to update // TODO: Would be nice if we could pull this out of program uint32_t cmd_sequence_sizeB = 0; @@ -842,6 +844,7 @@ void EnqueueProgramCommand::assemble_device_commands() { kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { CoreCoord physical_start = @@ -863,11 +866,11 @@ void EnqueueProgramCommand::assemble_device_commands() { max_prefetch_command_size, multicast_go_signals_payload); } - for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) { kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { @@ -1055,6 +1058,7 @@ void EnqueueProgramCommand::assemble_device_commands() { } } else { uint32_t i = 0; + ZoneScopedN("program_loaded_on_device"); for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; for (const shared_ptr& cb : cbs_on_core_range) { @@ -1076,6 +1080,7 @@ void EnqueueProgramCommand::assemble_device_commands() { for (auto& go_signal : cached_program_command_sequence.go_signals) { go_signal->dispatch_core_x = this->dispatch_core.x; go_signal->dispatch_core_y = this->dispatch_core.y; + go_signal->host_assigned_op_id = program.get_global_id(); } } } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 9b054d4abec..a2bc1064a78 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -847,5 +847,7 @@ void Program::compile(Device *device) { this->loaded_onto_device = false; } +void Program::set_global_id(uint64_t id) { this->global_id = id; } + Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 2b4144acddc..6255f4c3fdf 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -77,11 +77,13 @@ class Program { Program(Program &&other) = default; Program& operator=(Program &&other) = default; + void set_global_id(uint64_t id); ~Program(); void construct_core_range_set_for_worker_cores(); const uint64_t get_id() const { return this->id; } + const uint64_t get_global_id() const { return this->global_id; } size_t num_kernels() const { size_t count = 0; @@ -170,6 +172,7 @@ class Program { }; uint64_t id; // Need to make non-const due to move constructor + uint64_t global_id; // Need to make non-const due to move constructor static std::atomic program_counter; std::unordered_map >> kernels_; std::unordered_map grid_extent_; diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 2591e70eaca..71d4c8d378b 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 2591e70eaca0a12705ea23cbe4059e086c9a2a9f +Subproject commit 71d4c8d378b52af7da7012b9b595a61e9304f0bb diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 343dcbbd3c5..3ff3d9b5a83 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -111,7 +111,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { eriscBuffer[i] = 0; @@ -124,7 +123,6 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { //TODO(MO): Clean up magic numbers @@ -146,7 +144,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { briscBuffer[i] = 0; @@ -167,10 +164,9 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { - //TODO(MO): Clean up magic numbers + //TODO(MO): Clean up magic numbers briscBuffer[i] = 0x80000000; ncriscBuffer[i] = 0x80000000; trisc0Buffer[i] = 0x80000000; @@ -178,12 +174,11 @@ namespace kernel_profiler{ trisc2Buffer[i] = 0x80000000; } - //TODO(MO): Clean up magic numbers - briscBuffer [ID_LL] = runCounter; - ncriscBuffer[ID_LL] = runCounter; - trisc0Buffer[ID_LL] = runCounter; - trisc1Buffer[ID_LL] = runCounter; - trisc2Buffer[ID_LL] = runCounter; + briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000); + ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000); + trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000); + trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000); + trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000); #endif //BRISC_INIT @@ -208,14 +203,6 @@ namespace kernel_profiler{ buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; } - inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id) - { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); - volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); - buffer[index+3] = p_reg[WALL_CLOCK_LOW_INDEX]; - } - inline __attribute__((always_inline)) void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) @@ -233,6 +220,20 @@ namespace kernel_profiler{ profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; } + inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue) + { + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); + volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); + volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); + volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T1); + volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T2); + + briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF); + ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF); + trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF); + trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF); + trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF); + } inline __attribute__((always_inline)) void risc_finished_profiling() { @@ -374,6 +375,7 @@ namespace kernel_profiler{ #if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); mark_time_at_index_inlined(wIndex, hash); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; uint32_t dram_offset = @@ -389,8 +391,8 @@ namespace kernel_profiler{ uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash)); - wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE; + mark_time_at_index_inlined(wIndex, get_end_timer_id(hash)); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; @@ -523,6 +525,8 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate zone = kernel_profiler::profileScopeAccumulate(); +#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter); + #else #define DeviceZoneScopedMainN( name ) @@ -537,4 +541,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedND( name , nocBuffer, nocIndex ) +#define DeviceZoneSetCounter( counter ) + #endif diff --git a/tt_metal/tools/profiler/process_device_log.py b/tt_metal/tools/profiler/process_device_log.py index 65a47046be0..8495150c608 100755 --- a/tt_metal/tools/profiler/process_device_log.py +++ b/tt_metal/tools/profiler/process_device_log.py @@ -209,7 +209,15 @@ def import_device_profile_log(logPath): timerID = {"id": int(row[4].strip()), "zone_name": "", "zone_phase": "", "src_line": "", "src_file": ""} timeData = int(row[5].strip()) statData = 0 - if len(row) > 6: + if len(row) == 13: + statData = int(row[6].strip()) + timerID["run_id"] = int(row[7].strip()) + timerID["op_id"] = int(row[8].strip()) + timerID["zone_name"] = row[9].strip() + timerID["zone_phase"] = row[10].strip() + timerID["src_line"] = int(row[11].strip()) + timerID["src_file"] = row[12].strip() + elif len(row) == 12: statData = int(row[6].strip()) timerID["run_id"] = int(row[7].strip()) timerID["zone_name"] = row[8].strip() diff --git a/tt_metal/tools/profiler/process_ops_logs.py b/tt_metal/tools/profiler/process_ops_logs.py index 5a0259a5f9b..6b531be4ced 100755 --- a/tt_metal/tools/profiler/process_ops_logs.py +++ b/tt_metal/tools/profiler/process_ops_logs.py @@ -197,6 +197,10 @@ def append_device_data(ops, deviceLogFolder): cores = set() for timeID, ts, statData, risc, core in deviceOpTime["timeseries"]: if "zone_name" in timeID.keys() and "FW" in timeID["zone_name"]: + if "op_id" in timeID.keys(): + assert ( + timeID["op_id"] == deviceOp["global_call_count"] + ), f"op id {timeID['op_id']} reproted by device is not matching assigned op id {deviceOp['global_call_count']}" if core not in cores: cores.add(core) deviceOp["core_usage"] = {"count": len(cores), "cores": [str(core) for core in cores]} diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index ed8000cf91f..b5e64d7143b 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -88,6 +88,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t riscNumRead = 0; uint32_t coreFlatIDRead = 0; uint32_t runCounterRead = 0; + uint32_t runHostCounterRead = 0; bool newRunStart = false; @@ -108,7 +109,8 @@ void DeviceProfiler::readRiscProfilerResults( //TODO(MO): Cleanup magic numbers riscNumRead = profile_buffer[index] & 0x7; coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF; - runCounterRead = profile_buffer[index + 1]; + runCounterRead = profile_buffer[index + 1] & 0xFFFF; + runHostCounterRead = (profile_buffer[index + 1] >> 16 ) & 0xFFFF; } else @@ -149,6 +151,7 @@ void DeviceProfiler::readRiscProfilerResults( dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -167,6 +170,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_L = opTime_L; dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -201,6 +205,7 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp) void DeviceProfiler::dumpResultToFile( uint32_t run_id, + uint32_t run_host_id, int device_id, CoreCoord core, int core_flat, @@ -237,7 +242,7 @@ void DeviceProfiler::dumpResultToFile( source_line = stoi(source_line_str); } - tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); + tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_host_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); auto ret = device_events.insert(event); @@ -249,7 +254,7 @@ void DeviceProfiler::dumpResultToFile( { log_file.open(log_path); log_file << "ARCH: " << get_string_lowercase(device_architecture) << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl; - log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, Run ID, zone name, zone phase, source line, source file" << std::endl; + log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, run ID, run host ID, zone name, zone phase, source line, source file" << std::endl; new_log = false; } else @@ -258,7 +263,7 @@ void DeviceProfiler::dumpResultToFile( } //log_file << fmt::format("{:4},{:3},{:3},{:>7},{:7},{:15},{:15},{:5},{:>25},{:>6},{:6},{}", - log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{}", + log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{},{}", device_id, core.x, core.y, @@ -267,6 +272,7 @@ void DeviceProfiler::dumpResultToFile( timestamp, stat_value, run_id, + run_host_id, zone_name, magic_enum::enum_name(zone_phase), source_line, diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 79bbf2a3777..8370e3ec97c 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -72,6 +72,7 @@ class DeviceProfiler { // Dumping profile result to file void dumpResultToFile( uint32_t runID, + uint32_t runHostID, int device_id, CoreCoord core, int core_flat, diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index dc529c1256e..d9f148b534f 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -523,6 +523,7 @@ void LaunchProgram(Device *device, Program &program, bool wait_until_cores_done) for (const auto &[core_type, logical_cores] : logical_cores_used_in_program) { for (const auto &logical_core : logical_cores) { launch_msg_t *msg = &program.kernels_on_core(logical_core, core_type)->launch_msg; + msg->host_assigned_op_id = program.get_global_id(); auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); tt::llrt::write_launch_msg_to_core(device->id(), physical_core, msg); diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index d7d37f48e85..3305f82309c 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -213,6 +213,8 @@ typename device_operation_t::tensor_return_value_t run( auto& program = create_or_get_program_from_cache( program_cache, cache_hit, program_hash, operation_attributes, tensor_args, tensor_return_value); + program.set_global_id(operation_id); + if (USE_FAST_DISPATCH) { ZoneScopedN("EnqueueProgram"); auto& queue = device->command_queue(cq_id);