Skip to content

Commit

Permalink
#3764: Device side opID support
Browse files Browse the repository at this point in the history
- Device side opID is verified against the expected ID extracted by host
- Tracy device zones include opID
  • Loading branch information
tt-aho authored and mo-tenstorrent committed Jul 10, 2024
1 parent b2ab884 commit e642070
Show file tree
Hide file tree
Showing 14 changed files with 77 additions and 29 deletions.
9 changes: 9 additions & 0 deletions tt_eager/tt_dnn/op_library/run_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ OutputTensors run_device_operation(
operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors);
uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id();

if (std::holds_alternative<std::reference_wrapper<Program>>(program))
{
std::get<std::reference_wrapper<Program>>(program).get().set_global_id(op_id);
}
else
{
std::get<std::shared_ptr<Program>>(program)->set_global_id(op_id);
}

// Enqueue or Launch Program
std::visit(
[&operation, &input_tensors, &optional_input_tensors, &output_tensors, queue](auto&& program) {
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ int main() {

{
DeviceZoneScopedMainN("BRISC-FW");
DeviceZoneSetCounter(mailboxes->launch.host_assigned_op_id);

// Copies from L1 to IRAM on chips where NCRISC has IRAM
l1_to_ncrisc_iram_copy(mailboxes->launch.ncrisc_kernel_size16, ncrisc_kernel_start_offset16);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ struct launch_msg_t { // must be cacheline aligned
volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX];
volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units

volatile uint16_t host_assigned_op_id;

// Ring buffer of kernel configuration data
volatile uint32_t kernel_config_base;
dyn_mem_map_t mem_map[DISPATCH_CLASS_MAX];
Expand Down
6 changes: 5 additions & 1 deletion tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
}

void EnqueueProgramCommand::assemble_device_commands() {
ZoneScoped;
auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id];
if (!program.is_finalized()) {
// Calculate size of command and fill program indices of data to update
Expand Down Expand Up @@ -895,6 +896,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
kernel_group.launch_msg.mode = DISPATCH_MODE_DEV;
kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x;
kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y;
kernel_group.launch_msg.host_assigned_op_id = program.get_global_id();
const void* launch_message_data = (const void*)(&kernel_group.launch_msg);
for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
CoreCoord physical_start =
Expand All @@ -917,11 +919,11 @@ void EnqueueProgramCommand::assemble_device_commands() {
this->packed_write_max_unicast_sub_cmds,
multicast_go_signals_payload);
}

for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) {
kernel_group.launch_msg.mode = DISPATCH_MODE_DEV;
kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x;
kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y;
kernel_group.launch_msg.host_assigned_op_id = program.get_global_id();
const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg);
for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
for (auto x = core_range.start.x; x <= core_range.end.x; x++) {
Expand Down Expand Up @@ -1113,6 +1115,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
}
} else {
uint32_t i = 0;
ZoneScopedN("program_loaded_on_device");
for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) {
uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i];
for (const shared_ptr<CircularBuffer>& cb : cbs_on_core_range) {
Expand All @@ -1134,6 +1137,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
for (auto& go_signal : cached_program_command_sequence.go_signals) {
go_signal->dispatch_core_x = this->dispatch_core.x;
go_signal->dispatch_core_y = this->dispatch_core.y;
go_signal->host_assigned_op_id = program.get_global_id();
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -934,5 +934,7 @@ void Program::compile(Device *device) {
this->loaded_onto_device = false;
}

void Program::set_global_id(uint64_t id) { this->global_id = id; }

Program::~Program() {}
} // namespace tt::tt_metal
3 changes: 3 additions & 0 deletions tt_metal/impl/program/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,13 @@ class Program {
Program(Program &&other) = default;
Program& operator=(Program &&other) = default;

void set_global_id(uint64_t id);
~Program();

void construct_core_range_set_for_worker_cores();

const uint64_t get_id() const { return this->id; }
const uint64_t get_global_id() const { return this->global_id; }

size_t num_kernels() const {
size_t count = 0;
Expand Down Expand Up @@ -177,6 +179,7 @@ class Program {
};

uint64_t id; // Need to make non-const due to move constructor
uint64_t global_id; // Need to make non-const due to move constructor
static std::atomic<uint64_t> program_counter;
std::unordered_map<CoreType, std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
std::unordered_map<CoreType, CoreCoord> grid_extent_;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/third_party/tracy
48 changes: 27 additions & 21 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ namespace kernel_profiler{
{
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

#pragma GCC unroll 65534
for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++)
{
eriscBuffer[i] = 0;
Expand All @@ -124,7 +123,6 @@ namespace kernel_profiler{
profiler_control_buffer[FLAT_ID] = core_flat_id;
}

#pragma GCC unroll 65534
for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++)
{
//TODO(MO): Clean up magic numbers
Expand All @@ -146,7 +144,6 @@ namespace kernel_profiler{
{
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

#pragma GCC unroll 65534
for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++)
{
briscBuffer[i] = 0;
Expand All @@ -167,23 +164,21 @@ namespace kernel_profiler{
profiler_control_buffer[FLAT_ID] = core_flat_id;
}

#pragma GCC unroll 65534
for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++)
{
//TODO(MO): Clean up magic numbers
//TODO(MO): Clean up magic numbers
briscBuffer[i] = 0x80000000;
ncriscBuffer[i] = 0x80000000;
trisc0Buffer[i] = 0x80000000;
trisc1Buffer[i] = 0x80000000;
trisc2Buffer[i] = 0x80000000;
}

//TODO(MO): Clean up magic numbers
briscBuffer [ID_LL] = runCounter;
ncriscBuffer[ID_LL] = runCounter;
trisc0Buffer[ID_LL] = runCounter;
trisc1Buffer[ID_LL] = runCounter;
trisc2Buffer[ID_LL] = runCounter;
briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000);
ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000);
trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000);
trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000);
trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000);


#endif //BRISC_INIT
Expand All @@ -208,14 +203,6 @@ namespace kernel_profiler{
buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX];
}

inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id)
{
volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(kernel_profiler::profilerBuffer);
volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast<volatile tt_reg_ptr uint32_t *> (RISCV_DEBUG_REG_WALL_CLOCK_L);
buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF);
buffer[index+3] = p_reg[WALL_CLOCK_LOW_INDEX];
}

inline __attribute__((always_inline)) void mark_padding()
{
if (wIndex < PROFILER_L1_VECTOR_SIZE)
Expand All @@ -233,6 +220,20 @@ namespace kernel_profiler{
profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr;
}

inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue)
{
volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_BR);
volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_NC);
volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T0);
volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T1);
volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T2);

briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF);
ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF);
trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF);
trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF);
trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF);
}

inline __attribute__((always_inline)) void risc_finished_profiling()
{
Expand Down Expand Up @@ -374,6 +375,7 @@ namespace kernel_profiler{
#if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)
SrcLocNameToHash("PROFILER-NOC-QUICK-SEND");
mark_time_at_index_inlined(wIndex, hash);
wIndex += PROFILER_L1_MARKER_UINT32_SIZE;
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

uint32_t dram_offset =
Expand All @@ -389,8 +391,8 @@ namespace kernel_profiler{

uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset);

mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash));
wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE;
mark_time_at_index_inlined(wIndex, get_end_timer_id(hash));
wIndex += PROFILER_L1_MARKER_UINT32_SIZE;

uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex;

Expand Down Expand Up @@ -523,6 +525,8 @@ namespace kernel_profiler{

#define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate<hash, 1> zone = kernel_profiler::profileScopeAccumulate<hash, 1>();

#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter);

#else

#define DeviceZoneScopedMainN( name )
Expand All @@ -537,4 +541,6 @@ namespace kernel_profiler{

#define DeviceZoneScopedND( name , nocBuffer, nocIndex )

#define DeviceZoneSetCounter( counter )

#endif
10 changes: 9 additions & 1 deletion tt_metal/tools/profiler/process_device_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,15 @@ def import_device_profile_log(logPath):
timerID = {"id": int(row[4].strip()), "zone_name": "", "zone_phase": "", "src_line": "", "src_file": ""}
timeData = int(row[5].strip())
statData = 0
if len(row) > 6:
if len(row) == 13:
statData = int(row[6].strip())
timerID["run_id"] = int(row[7].strip())
timerID["op_id"] = int(row[8].strip())
timerID["zone_name"] = row[9].strip()
timerID["zone_phase"] = row[10].strip()
timerID["src_line"] = int(row[11].strip())
timerID["src_file"] = row[12].strip()
elif len(row) == 12:
statData = int(row[6].strip())
timerID["run_id"] = int(row[7].strip())
timerID["zone_name"] = row[8].strip()
Expand Down
4 changes: 4 additions & 0 deletions tt_metal/tools/profiler/process_ops_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ def append_device_data(ops, deviceLogFolder):
cores = set()
for timeID, ts, statData, risc, core in deviceOpTime["timeseries"]:
if "zone_name" in timeID.keys() and "FW" in timeID["zone_name"]:
if "op_id" in timeID.keys():
assert (
timeID["op_id"] == deviceOp["global_call_count"]
), f"op id {timeID['op_id']} reproted by device is not matching assigned op id {deviceOp['global_call_count']}"
if core not in cores:
cores.add(core)
deviceOp["core_usage"] = {"count": len(cores), "cores": [str(core) for core in cores]}
Expand Down
14 changes: 10 additions & 4 deletions tt_metal/tools/profiler/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ void DeviceProfiler::readRiscProfilerResults(
uint32_t riscNumRead = 0;
uint32_t coreFlatIDRead = 0;
uint32_t runCounterRead = 0;
uint32_t runHostCounterRead = 0;

bool newRunStart = false;

Expand All @@ -108,7 +109,8 @@ void DeviceProfiler::readRiscProfilerResults(
//TODO(MO): Cleanup magic numbers
riscNumRead = profile_buffer[index] & 0x7;
coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF;
runCounterRead = profile_buffer[index + 1];
runCounterRead = profile_buffer[index + 1] & 0xFFFF;
runHostCounterRead = (profile_buffer[index + 1] >> 16 ) & 0xFFFF;

}
else
Expand Down Expand Up @@ -149,6 +151,7 @@ void DeviceProfiler::readRiscProfilerResults(

dumpResultToFile(
runCounterRead,
runHostCounterRead,
device_id,
worker_core,
coreFlatID,
Expand All @@ -167,6 +170,7 @@ void DeviceProfiler::readRiscProfilerResults(
uint32_t time_L = opTime_L;
dumpResultToFile(
runCounterRead,
runHostCounterRead,
device_id,
worker_core,
coreFlatID,
Expand Down Expand Up @@ -201,6 +205,7 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp)

void DeviceProfiler::dumpResultToFile(
uint32_t run_id,
uint32_t run_host_id,
int device_id,
CoreCoord core,
int core_flat,
Expand Down Expand Up @@ -237,7 +242,7 @@ void DeviceProfiler::dumpResultToFile(
source_line = stoi(source_line_str);
}

tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase);
tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_host_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase);

auto ret = device_events.insert(event);

Expand All @@ -249,7 +254,7 @@ void DeviceProfiler::dumpResultToFile(
{
log_file.open(log_path);
log_file << "ARCH: " << get_string_lowercase(device_architecture) << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl;
log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, Run ID, zone name, zone phase, source line, source file" << std::endl;
log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, run ID, run host ID, zone name, zone phase, source line, source file" << std::endl;
new_log = false;
}
else
Expand All @@ -258,7 +263,7 @@ void DeviceProfiler::dumpResultToFile(
}

//log_file << fmt::format("{:4},{:3},{:3},{:>7},{:7},{:15},{:15},{:5},{:>25},{:>6},{:6},{}",
log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{}",
log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{},{}",
device_id,
core.x,
core.y,
Expand All @@ -267,6 +272,7 @@ void DeviceProfiler::dumpResultToFile(
timestamp,
stat_value,
run_id,
run_host_id,
zone_name,
magic_enum::enum_name(zone_phase),
source_line,
Expand Down
1 change: 1 addition & 0 deletions tt_metal/tools/profiler/profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class DeviceProfiler {
// Dumping profile result to file
void dumpResultToFile(
uint32_t runID,
uint32_t runHostID,
int device_id,
CoreCoord core,
int core_flat,
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ void LaunchProgram(Device *device, Program &program, bool wait_until_cores_done)
for (const auto &[core_type, logical_cores] : logical_cores_used_in_program) {
for (const auto &logical_core : logical_cores) {
launch_msg_t *msg = &program.kernels_on_core(logical_core, core_type)->launch_msg;

msg->host_assigned_op_id = program.get_global_id();
auto physical_core = device->physical_core_from_logical_core(logical_core, core_type);
not_done_cores.insert(physical_core);
tt::llrt::write_launch_msg_to_core(device->id(), physical_core, msg);
Expand Down
2 changes: 2 additions & 0 deletions ttnn/cpp/ttnn/device_operation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ typename device_operation_t::tensor_return_value_t run(
auto& program = create_or_get_program_from_cache<device_operation_t>(
program_cache, program_cache_hit, program_hash, operation_attributes, tensor_args, tensor_return_value);

program.set_global_id(operation_id);

if (USE_FAST_DISPATCH) {
ZoneScopedN("EnqueueProgram");
auto& queue = device->command_queue(cq_id);
Expand Down

0 comments on commit e642070

Please sign in to comment.