Skip to content

Commit

Permalink
#8223: No hangs on dispatch cores when profiling them
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed May 15, 2024
1 parent b759289 commit 59ffa8d
Show file tree
Hide file tree
Showing 16 changed files with 177 additions and 86 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ void configure_kernel_variant(
CoreCoord phys_downstream_core) {

std::map<string, string> defines = {
{"DISPATCH_KERNEL", "1"},
{"MY_NOC_X", std::to_string(phys_my_core.x)},
{"MY_NOC_Y", std::to_string(phys_my_core.y)},
{"UPSTREAM_NOC_X", std::to_string(phys_upstream_core.x)},
Expand Down
9 changes: 9 additions & 0 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

void enable_power_management() {
Expand Down Expand Up @@ -342,7 +344,14 @@ int main() {
}
DEBUG_STATUS("D");

int i = my_x[0];
int j = my_y[0];
DPRINT << i << "NC WAIT" << j << ENDL();
wait_ncrisc_trisc();
DPRINT << i << "NC DONE" << j << ENDL();
DPRINT << i << "NC DONE" << j << ENDL();
DPRINT << i << "NC DONE" << j << ENDL();
DPRINT << i << "NC DONE" << j << ENDL();

mailboxes->launch.run = RUN_MSG_DONE;

Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

uint8_t noc_index = 0; // TODO: remove hardcoding
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

//inline void RISC_POST_STATUS(uint32_t status) {
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

extern "C" void ncrisc_resume(void);
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/inc/debug/dprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ void debug_print(DebugPrinter &dp, DebugPrintData data) {
template<typename T>
__attribute__((__noinline__))
DebugPrinter operator <<(DebugPrinter dp, T val) {
#if defined(DEBUG_PRINT_ENABLED) && !defined(PROFILE_KERNEL)
#if defined(DEBUG_PRINT_ENABLED)
DebugPrintData data{
.sz = DebugPrintTypeToSize<T>(val), // includes terminating 0 for char*
.data_ptr = DebugPrintTypeAddr<T>(&val),
Expand Down
10 changes: 5 additions & 5 deletions tt_metal/impl/debug/dprint_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -955,10 +955,10 @@ void DprintServerAttach(Device* device) {
// Skip if DPRINT not enabled, and make sure profiler is not running.
if (!tt::llrt::OptionsG.get_dprint_enabled())
return;
TT_FATAL(
DebugPrintServerContext::ProfilerIsRunning == false,
"Device side profiler is running, cannot start print server"
);
//TT_FATAL(
//DebugPrintServerContext::ProfilerIsRunning == false,
//"Device side profiler is running, cannot start print server"
//);

// If no server is running, create one
if (!DprintServerIsRunning())
Expand All @@ -981,7 +981,7 @@ void DprintServerDetach(Device* device) {
}

void DprintServerSetProfilerState(bool profile_device) {
DebugPrintServerContext::ProfilerIsRunning = profile_device;
//DebugPrintServerContext::ProfilerIsRunning = profile_device;
}

bool DprintServerIsRunning() {
Expand Down
8 changes: 8 additions & 0 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(0)},
{"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)},
{"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)},
{"PROFILE_KERNEL", "1"},
};

std::vector<uint32_t> prefetch_compile_args = {
Expand Down Expand Up @@ -456,6 +457,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(prefetch_physical_core.y)},
{"DOWNSTREAM_NOC_X", std::to_string(0)},
{"DOWNSTREAM_NOC_Y", std::to_string(0)},
{"PROFILE_KERNEL", "1"},
};
std::vector<uint32_t> dispatch_compile_args = {
dispatch_constants::DISPATCH_BUFFER_BASE,
Expand Down Expand Up @@ -583,6 +585,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(0)},
{"DOWNSTREAM_NOC_X", std::to_string(mux_physical_core.x)},
{"DOWNSTREAM_NOC_Y", std::to_string(mux_physical_core.y)},
{"PROFILE_KERNEL", "1"},
};

std::vector<uint32_t> prefetch_compile_args = {
Expand Down Expand Up @@ -865,6 +868,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(demux_physical_core.y)},
{"DOWNSTREAM_NOC_X", std::to_string(0xffffffff)},
{"DOWNSTREAM_NOC_Y", std::to_string(0xffffffff)},
{"PROFILE_KERNEL", "1"},
};

log_debug(LogDevice, "run dispatch_h at {}", dispatch_location.str());
Expand Down Expand Up @@ -1106,6 +1110,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(demux_d_physical_core.y)},
{"DOWNSTREAM_NOC_X", std::to_string(dispatch_physical_core.x)},
{"DOWNSTREAM_NOC_Y", std::to_string(dispatch_physical_core.y)},
{"PROFILE_KERNEL", "1"},
};

std::vector<uint32_t> prefetch_d_compile_args = {
Expand Down Expand Up @@ -1169,6 +1174,7 @@ void Device::compile_command_queue_programs() {
{"UPSTREAM_NOC_Y", std::to_string(prefetch_d_physical_core.y)},
{"DOWNSTREAM_NOC_X", std::to_string(mux_d_physical_core.x)},
{"DOWNSTREAM_NOC_Y", std::to_string(mux_d_physical_core.y)},
{"PROFILE_KERNEL", "1"},
};
std::vector<uint32_t> dispatch_d_compile_args = {
dispatch_constants::DISPATCH_BUFFER_BASE,
Expand Down Expand Up @@ -1431,6 +1437,8 @@ bool Device::initialize(size_t l1_small_size, const std::vector<uint32_t> &l1_ba
log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache.is_enabled() ? "": "NOT ");
this->initialize_cluster();
this->initialize_allocator(l1_small_size, l1_bank_remap);
if (minimal) return true;
log_info(tt::LogMetal, "Initializing device {}. Program cache is NOT enabled", this->id_);
this->initialize_build();
auto num_devices = tt::tt_metal::GetNumAvailableDevices();
tt::tt_metal::device_pool::devices.resize(num_devices, nullptr);
Expand Down
56 changes: 32 additions & 24 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ static uint32_t cmd_ptr; // walks through pages in cb cmd by cmd

static uint32_t downstream_cb_data_ptr = downstream_cb_base;

namespace kernel_profiler {
uint32_t nocWriteSize __attribute__((used));
}

FORCE_INLINE volatile uint32_t* get_cq_completion_read_ptr() {
return reinterpret_cast<volatile uint32_t*>(CQ_COMPLETION_READ_PTR);
}
Expand Down Expand Up @@ -849,33 +853,37 @@ void kernel_main() {
}
bool done = false;
while (!done) {
if (cmd_ptr == cb_fence) {
get_cb_page<
dispatch_cb_base,
dispatch_cb_blocks,
dispatch_cb_log_page_size,
my_noc_xy,
my_dispatch_cb_sem_id>(cmd_ptr,
cb_fence,
block_noc_writes_to_clear,
block_next_start_addr,
rd_block_idx);
}
{
DeviceZoneScopedMainN("CQ-DISPATCH");
if (cmd_ptr == cb_fence) {
get_cb_page<
dispatch_cb_base,
dispatch_cb_blocks,
dispatch_cb_log_page_size,
my_noc_xy,
my_dispatch_cb_sem_id>(cmd_ptr,
cb_fence,
block_noc_writes_to_clear,
block_next_start_addr,
rd_block_idx);
}

done = is_d_variant ?
process_cmd_d(cmd_ptr) :
process_cmd_h(cmd_ptr);
done = is_d_variant ?
process_cmd_d(cmd_ptr) :
process_cmd_h(cmd_ptr);

// Move to next page
cmd_ptr = round_up_pow2(cmd_ptr, dispatch_cb_page_size);
// Move to next page
cmd_ptr = round_up_pow2(cmd_ptr, dispatch_cb_page_size);

// XXXXX move this inside while loop waiting for get_dispatch_cb_page above
// XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping
cb_block_release_pages<upstream_noc_xy,
upstream_dispatch_cb_sem_id,
dispatch_cb_blocks,
dispatch_cb_pages_per_block>(block_noc_writes_to_clear,
wr_block_idx);
// XXXXX move this inside while loop waiting for get_dispatch_cb_page above
// XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping
cb_block_release_pages<upstream_noc_xy,
upstream_dispatch_cb_sem_id,
dispatch_cb_blocks,
dispatch_cb_pages_per_block>(block_noc_writes_to_clear,
wr_block_idx);
}
block_noc_writes_to_clear[rd_block_idx] += DeviceProfilerNOCWriteBlockCountAndReset();
}

noc_async_write_barrier();
Expand Down
23 changes: 15 additions & 8 deletions tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ static struct PrefetchExecBufState {
uint32_t length;
} exec_buf_state;

namespace kernel_profiler {
uint32_t nocWriteSize __attribute__((used));
}

static_assert((downstream_cb_base & (downstream_cb_page_size - 1)) == 0);

template<bool cmddat_wrap_enable,
Expand Down Expand Up @@ -133,7 +137,7 @@ void read_from_pcie(volatile tt_l1_ptr uint32_t *& prefetch_q_rd_ptr,
}

uint64_t host_src_addr = get_noc_addr_helper(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y), pcie_read_ptr);
DPRINT << "read_from_pcie: " << fence + preamble_size << " " << pcie_read_ptr << ENDL();
//DPRINT << "read_from_pcie: " << fence + preamble_size << " " << pcie_read_ptr << ENDL();
noc_async_read(host_src_addr, fence + preamble_size, size);
pending_read_size = size + preamble_size;
pcie_read_ptr += size;
Expand Down Expand Up @@ -177,7 +181,7 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
static uint32_t pending_read_size = 0;
static volatile tt_l1_ptr uint32_t* prefetch_q_rd_ptr = (volatile tt_l1_ptr uint32_t*)prefetch_q_base;

DPRINT << "fetch_q_get_cmds: " << cmd_ptr << " " << fence << ENDL();
//DPRINT << "fetch_q_get_cmds: " << cmd_ptr << " " << fence << ENDL();
if (fence < cmd_ptr) {
DPRINT << "fetch_q_get_cmds wrap cmd" << ENDL();
cmd_ptr = fence;
Expand All @@ -192,7 +196,7 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
}
if (!cmd_ready) {
if (pending_read_size != 0) {
DPRINT << "fetch_q_get_cmds barrier" << ENDL();
//DPRINT << "fetch_q_get_cmds barrier" << ENDL();
noc_async_read_barrier();

// wrap the cmddat_q
Expand All @@ -217,7 +221,7 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
// By here, prefetch_q_ready must be false
// Nothing to fetch, nothing pending, nothing available, stall on host
DEBUG_STATUS("HQW");
DPRINT << "fetch_q_get_cmds stall" << ENDL();
//DPRINT << "fetch_q_get_cmds stall" << ENDL();
while ((fetch_size = *prefetch_q_rd_ptr) == 0);
fetch_q_get_cmds<preamble_size>(fence, cmd_ptr, pcie_read_ptr);
DEBUG_STATUS("HQD");
Expand Down Expand Up @@ -743,6 +747,7 @@ uint32_t process_exec_buf_cmd(uint32_t cmd_ptr_outer,

bool done = false;
while (!done) {
//DeviceZoneScopedMainN("PROC-MAIN");
uint32_t cmd_ptr = cmddat_q_base;

paged_read_into_cmddat_q(cmd_ptr);
Expand Down Expand Up @@ -801,7 +806,7 @@ bool process_cmd(uint32_t& cmd_ptr,
break;

case CQ_PREFETCH_CMD_RELAY_INLINE:
DPRINT << "relay inline" << ENDL();
//DPRINT << "relay inline" << ENDL();
if (exec_buf) {
stride = process_relay_inline_exec_buf_cmd(cmd_ptr, downstream_data_ptr);
} else {
Expand All @@ -827,7 +832,7 @@ bool process_cmd(uint32_t& cmd_ptr,
break;

case CQ_PREFETCH_CMD_STALL:
DPRINT << "stall" << ENDL();
//DPRINT << "stall" << ENDL();
stride = process_stall(cmd_ptr);
break;

Expand Down Expand Up @@ -973,6 +978,7 @@ void kernel_main_h() {

bool done = false;
while (!done) {
//DeviceZoneScopedMainN("KERNEL-MAIN-H");
fetch_q_get_cmds<sizeof(CQPrefetchHToPrefetchDHeader)>(fence, cmd_ptr, pcie_read_ptr);

volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader));
Expand All @@ -984,7 +990,7 @@ void kernel_main_h() {
DPRINT << "exec buf\n";
process_exec_buf_cmd_h();
} else if (cmd_id == CQ_PREFETCH_CMD_TERMINATE) {
DPRINT << "prefetch terminating_" << is_h_variant << is_d_variant << ENDL();;
DPRINT << "prefetch 2 terminating_" << is_h_variant << is_d_variant << ENDL();;
done = true;
}
#if defined(COMPILE_FOR_IDLE_ERISC)
Expand All @@ -1010,6 +1016,7 @@ void kernel_main_d() {

bool done = false;
while (!done) {
//DeviceZoneScopedMainN("KERNEL-MAIN-D");
// cmds come in packed batches based on HostQ reads in prefetch_h
// once a packed batch ends, we need to jump to the next page
uint32_t length = relay_cb_get_cmds(fence, cmd_ptr);
Expand Down Expand Up @@ -1059,9 +1066,9 @@ void kernel_main_hd() {

uint32_t cmd_ptr = cmddat_q_base;
uint32_t fence = cmddat_q_base;

bool done = false;
while (!done) {
DeviceZoneScopedMainN("KERNEL-MAIN-HD");
constexpr uint32_t preamble_size = 0;
fetch_q_get_cmds<preamble_size>(fence, cmd_ptr, pcie_read_ptr);

Expand Down
10 changes: 7 additions & 3 deletions tt_metal/jit_build/genfiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,12 @@ std::string generate_bank_to_noc_coord_descriptor_string(
ss << endl;
ss << "extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS];" << endl;
ss << "extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS];" << endl;
ss << "extern int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl;
ss << "extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS];" << endl;
ss << "extern int32_t bank_to_l1_offset[NUM_L1_BANKS];" << endl;
ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl;
ss << "extern uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl;
ss << "extern uint16_t profiler_core_count_per_dram;" << endl;
ss << "#endif" << endl;

ss << endl;
ss << "#else // !KERNEL_BUILD (FW_BUILD)" << endl;
Expand Down Expand Up @@ -454,17 +456,18 @@ std::string generate_bank_to_noc_coord_descriptor_string(
* For DRAM banks in particular, integer division of flat_id/core_count_per_dram gives the dram bank id and the modulo
* is the offset.
* */
ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl;
ss << "uint16_t profiler_core_count_per_dram __attribute__((used)) = ";
ss << core_count_per_dram << ";" << endl;
ss << endl;

ss << "int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl;
ss << "uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl;
for (unsigned int x = 0; x < grid_size.x; x++) {
ss << " {" << endl;
for (unsigned int y = 0; y < grid_size.y; y++) {
CoreCoord core = {x,y};
if (profiler_flat_id_map.find(core) == profiler_flat_id_map.end()){
ss << " " << -1 << "," << endl;
ss << " " << 255 << "," << endl;
}
else{
ss << " " << profiler_flat_id_map.at(core) << "," << endl;
Expand All @@ -474,6 +477,7 @@ std::string generate_bank_to_noc_coord_descriptor_string(
}
ss << "};" << endl;
ss << endl;
ss << "#endif" << endl;

#endif

Expand Down
3 changes: 3 additions & 0 deletions tt_metal/llrt/llrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC
bool is_active_eth_core = false;
bool is_inactive_eth_core = false;

//if (core.x == 7 && core.y ==11 ) return true;
//log_info("checking: chip {}, device {} {}", chip_id, core.x, core.y);

// Determine whether an ethernet core is active or idle. Their host handshake interfaces are different.
if (is_eth_core) {
auto active_eth_cores = tt::Cluster::instance().get_active_ethernet_cores(chip_id);
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/llrt/rtoptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ RunTimeOptions::RunTimeOptions() {
profiler_enabled = true;
}
#endif
TT_FATAL(!(get_dprint_enabled() && get_profiler_enabled()), "Cannot enable both debug printing and profiling");
//TT_FATAL(!(get_dprint_enabled() && get_profiler_enabled()), "Cannot enable both debug printing and profiling");

null_kernels = (std::getenv("TT_METAL_NULL_KERNELS") != nullptr);

Expand Down
Loading

0 comments on commit 59ffa8d

Please sign in to comment.