diff --git a/tt_metal/hostdevcommon/dprint_common.h b/tt_metal/hostdevcommon/dprint_common.h index a1d9a7150cc..580b43bbb55 100644 --- a/tt_metal/hostdevcommon/dprint_common.h +++ b/tt_metal/hostdevcommon/dprint_common.h @@ -30,9 +30,10 @@ DPRINT_PREFIX(WAIT) \ DPRINT_PREFIX(BFLOAT16) \ DPRINT_PREFIX(SETPRECISION) \ - DPRINT_PREFIX(FIXED) \ + DPRINT_PREFIX(NOC_LOG_XFER) \ + DPRINT_PREFIX(FIXED) \ DPRINT_PREFIX(DEFAULTFLOAT) \ - DPRINT_PREFIX(HEX) \ + DPRINT_PREFIX(HEX) \ DPRINT_PREFIX(OCT) \ DPRINT_PREFIX(DEC) \ DPRINT_PREFIX(TILESLICE) \ diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index dd347df2483..ec3fa3a3184 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -558,7 +558,6 @@ void noc_async_read_one_packet_set_state(std::uint64_t src_noc_addr, std::uint32 DEBUG_STATUS("RPD"); DEBUG_STATUS("NARW"); - DEBUG_SANITIZE_NOC_ADDR(src_noc_addr, size); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_MID, src_noc_addr >> 32); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, size); @@ -582,8 +581,8 @@ void noc_async_read_one_packet_with_state(std::uint32_t src_noc_addr, std::uint3 DEBUG_STATUS("NARW"); - // TODO: need a way sanitize size + addr w/o directly providing x/y here (grab x/y form state?) - // DEBUG_SANITIZE_READ_NOC_TRANSACTION(src_noc_addr, dst_local_l1_addr, size); + // In order to sanitize, need to grab full noc addr + xfer size from state. + DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_index, src_noc_addr, dst_local_l1_addr); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dst_local_l1_addr); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_noc_addr); @@ -609,9 +608,6 @@ void noc_async_read_set_state(std::uint64_t src_noc_addr) { while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF)); DEBUG_STATUS("RPD"); - // TODO: need to sanitize in noc_async_read_with_state - // DEBUG_SANITIZE_NOC_ADDR(src_noc_addr, size); - NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_MID, src_noc_addr >> 32); DEBUG_STATUS("NARD"); @@ -627,9 +623,8 @@ void noc_async_read_with_state(std::uint32_t src_noc_addr, std::uint32_t dst_loc */ DEBUG_STATUS("NARW"); - // TODO: need a way sanitize size + addr w/o directly providing x/y here (grab x/y form state?) - // DEBUG_SANITIZE_NOC_ADDR(src_noc_addr, size); - DEBUG_SANITIZE_WORKER_ADDR(dst_local_l1_addr, size); + // In order to sanitize, need to grab full noc addr + xfer size from state. + DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_STATE(noc_index, src_noc_addr, dst_local_l1_addr, size); while (size > NOC_MAX_BURST_SIZE) { DEBUG_STATUS("RPW"); @@ -701,7 +696,6 @@ FORCE_INLINE void noc_async_write_one_packet_set_state(std::uint64_t dst_noc_addr, std::uint32_t size) { DEBUG_STATUS("NWPW"); - DEBUG_SANITIZE_NOC_ADDR(dst_noc_addr, size); while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF)); DEBUG_STATUS("NWPD"); @@ -722,11 +716,12 @@ FORCE_INLINE void noc_async_write_one_packet_with_state(std::uint32_t src_local_l1_addr, std::uint32_t dst_noc_addr) { DEBUG_STATUS("NWPW"); - // TODO: need a way sanitize size + addr w/o directly providing x/y here (grab x/y form state?) - // DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(dst_noc_addr, src_local_l1_addr, size); while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF)); DEBUG_STATUS("NWPD"); + // In order to sanitize, need to grab full noc addr + xfer size from state. + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_index, dst_noc_addr, src_local_l1_addr); + NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_LO, src_local_l1_addr); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_LO, dst_noc_addr); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); @@ -1004,7 +999,7 @@ struct InterleavedPow2AddrGenFast { } DEBUG_STATUS("NRPW"); - DEBUG_SANITIZE_NOC_READ_TRANSACTION(get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, log_base_2_of_page_size); + DEBUG_SANITIZE_NOC_READ_TRANSACTION(get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, 1 << log_base_2_of_page_size); while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF)); DEBUG_STATUS("NRPD"); @@ -1047,6 +1042,7 @@ struct InterleavedPow2AddrGenFast { DEBUG_STATUS("RPW"); while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF)); DEBUG_STATUS("RPD"); + DEBUG_SANITIZE_NOC_READ_TRANSACTION(get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, size); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr diff --git a/tt_metal/hw/inc/dataflow_internal.h b/tt_metal/hw/inc/dataflow_internal.h index 1c8050895d7..7286724ebfd 100644 --- a/tt_metal/hw/inc/dataflow_internal.h +++ b/tt_metal/hw/inc/dataflow_internal.h @@ -17,7 +17,6 @@ void noc_fast_read_wait_ready() { FORCE_INLINE void noc_fast_read_set_src_xy(uint64_t src_addr) { - DEBUG_SANITIZE_NOC_ADDR(src_addr, NOC_CMD_BUF_READ_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE)); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_MID, src_addr >> 32); } @@ -64,7 +63,6 @@ void noc_fast_write_set_cmd_field(uint32_t vc, bool mcast, bool linked) { FORCE_INLINE void noc_fast_write_set_dst_xy(uint64_t dest_addr) { - DEBUG_SANITIZE_NOC_ADDR(dest_addr, NOC_CMD_BUF_READ_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE)); NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_MID, dest_addr >> 32); } diff --git a/tt_metal/hw/inc/debug/dprint.h b/tt_metal/hw/inc/debug/dprint.h index 597dd3efb39..b30043de6e2 100644 --- a/tt_metal/hw/inc/debug/dprint.h +++ b/tt_metal/hw/inc/debug/dprint.h @@ -34,10 +34,6 @@ #include "hostdevcommon/common_runtime_address_map.h" #include "dprint_buffer.h" -#if defined(COMPILE_FOR_ERISC) -#include "ethernet/tunneling.h" -#endif - #include "status.h" #define DPRINT DebugPrinter() @@ -83,6 +79,7 @@ struct HEX { char tmp; } ATTR_PACK; // Analog of cout << std::hex struct OCT { char tmp; } ATTR_PACK; // Analog of cout << std::oct struct DEC { char tmp; } ATTR_PACK; // Analog of cout << std::dec struct SETW { char w; SETW(char w) : w(w) {} } ATTR_PACK; // Analog of cout << std::setw() +struct NOC_LOG_XFER { uint32_t size; NOC_LOG_XFER(uint32_t sz) : size(sz) {} } ATTR_PACK; // For tracking noc transactions. struct U32_ARRAY { uint32_t* ptr; uint32_t len; U32_ARRAY(uint32_t* ptr, uint32_t len) : ptr(ptr), len(len) {} @@ -142,6 +139,7 @@ template<> uint8_t DebugPrintTypeToId() { return DPrintRAISE; } template<> uint8_t DebugPrintTypeToId() { return DPrintWAIT; } template<> uint8_t DebugPrintTypeToId() { return DPrintBFLOAT16; } template<> uint8_t DebugPrintTypeToId() { return DPrintSETPRECISION; } +template<> uint8_t DebugPrintTypeToId() { return DPrintNOC_LOG_XFER; } template<> uint8_t DebugPrintTypeToId() { return DPrintFIXED; } template<> uint8_t DebugPrintTypeToId() { return DPrintDEFAULTFLOAT; } template<> uint8_t DebugPrintTypeToId() { return DPrintHEX; } @@ -213,6 +211,9 @@ void debug_print(DebugPrinter &dp, DebugPrintData data) { #if defined(COMPILE_FOR_ERISC) internal_::risc_context_switch(); #endif + // If we've closed the device, we've now disabled printing on it, don't hang. + if (*dp.wpos() == DEBUG_PRINT_SERVER_DISABLED_MAGIC) + return; ; // wait for host to catch up to wpos with it's rpos } DEBUG_STATUS("DPD"); @@ -293,6 +294,7 @@ template DebugPrinter operator<< (DebugPrinter, HEX val); template DebugPrinter operator<< (DebugPrinter, OCT val); template DebugPrinter operator<< (DebugPrinter, DEC val); template DebugPrinter operator<< (DebugPrinter, SETPRECISION val); +template DebugPrinter operator<< (DebugPrinter, NOC_LOG_XFER val); template DebugPrinter operator<< (DebugPrinter, BF16 val); template DebugPrinter operator<< (DebugPrinter, F32 val); template DebugPrinter operator<< (DebugPrinter, U32 val); diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index a415fb3df55..f07c3a9ea56 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -14,6 +14,20 @@ // #pragma once +#include "dprint.h" + +// Add the ability to skip NOC logging, we can't have the tunneling cores stalling waiting for the +// print server. +#if !defined(SKIP_NOC_LOGGING) +#define LOG_LEN(l) DPRINT << NOC_LOG_XFER(l) +#define LOG_READ_LEN_FROM_STATE() LOG_LEN(NOC_CMD_BUF_READ_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE)) +#define LOG_WRITE_LEN_FROM_STATE() LOG_LEN(NOC_CMD_BUF_READ_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE)) +#else +#define LOG_LEN(l) +#define LOG_READ_LEN_FROM_STATE() +#define LOG_WRITE_LEN_FROM_STATE() +#endif + #if (defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_IDLE_ERISC)) && \ (defined(WATCHER_ENABLED)) && \ (!defined(WATCHER_DISABLE_NOC_SANITIZE)) @@ -27,6 +41,7 @@ extern uint8_t noc_index; #include "noc_parameters.h" #include "noc_overlay_parameters.h" + // A couple defines for specifying read/write and multi/unicast #define DEBUG_SANITIZE_NOC_READ true #define DEBUG_SANITIZE_NOC_WRITE false @@ -199,33 +214,44 @@ void debug_sanitize_noc_and_worker_addr( } } -// TODO: should be able clean up uses of the first three macros and remove them. -#define DEBUG_SANITIZE_WORKER_ADDR(a, l) \ - debug_sanitize_worker_addr(a, l) +// TODO: Clean these up with #7453 #define DEBUG_SANITIZE_NOC_ADDR(a, l) \ - debug_sanitize_noc_addr(a, 0, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_READ) -#define DEBUG_SANITIZE_NOC_MULTI_ADDR(a, l) \ - debug_sanitize_noc_addr(a, 0, l, DEBUG_SANITIZE_NOC_MULTICAST, DEBUG_SANITIZE_NOC_READ) + debug_sanitize_noc_addr(a, 0, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_READ); LOG_LEN(l) #define DEBUG_SANITIZE_NOC_TRANSACTION(noc_a, worker_a, l, multicast, dir) \ - debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, multicast, dir) + debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, multicast, dir); LOG_LEN(l) #define DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_a, worker_a, l) \ - debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_READ) + debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_READ); LOG_LEN(l) #define DEBUG_SANITIZE_NOC_MULTI_READ_TRANSACTION(noc_a, worker_a, l) \ - debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_MULTICAST, DEBUG_SANITIZE_NOC_READ) + debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_MULTICAST, DEBUG_SANITIZE_NOC_READ); LOG_LEN(l) #define DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc_a, worker_a, l) \ - debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_WRITE) + debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_UNICAST, DEBUG_SANITIZE_NOC_WRITE); LOG_LEN(l) #define DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc_a, worker_a, l) \ - debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_MULTICAST, DEBUG_SANITIZE_NOC_WRITE) + debug_sanitize_noc_and_worker_addr(noc_a, worker_a, l, DEBUG_SANITIZE_NOC_MULTICAST, DEBUG_SANITIZE_NOC_WRITE); LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_id, noc_a_lower, worker_a) \ + DEBUG_SANITIZE_NOC_READ_TRANSACTION( \ + ((uint64_t) NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_MID) << 32) | noc_a_lower, \ + worker_a, \ + NOC_CMD_BUF_READ_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE)); +#define DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_STATE(noc_id, noc_a_lower, worker_a, l) \ + DEBUG_SANITIZE_NOC_READ_TRANSACTION( \ + ((uint64_t) NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_MID) << 32) | noc_a_lower, \ + worker_a, l); +#define DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_id, noc_a_lower, worker_a) \ + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION( \ + ((uint64_t) NOC_CMD_BUF_READ_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_MID) << 32) | noc_a_lower, \ + worker_a, \ + NOC_CMD_BUF_READ_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE)); #else // !WATCHER_ENABLED -#define DEBUG_SANITIZE_WORKER_ADDR(a, l) -#define DEBUG_SANITIZE_NOC_ADDR(a, l) -#define DEBUG_SANITIZE_NOC_MULTI_ADDR(a, l) -#define DEBUG_SANITIZE_NOC_TRANSACTION(noc_a, worker_a, l, multicast, dir) -#define DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_a, worker_a, l) -#define DEBUG_SANITIZE_NOC_MULTI_READ_TRANSACTION(noc_a, worker_a, l) -#define DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc_a, worker_a, l) -#define DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc_a, worker_a, l) +#define DEBUG_SANITIZE_NOC_ADDR(a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_TRANSACTION(noc_a, worker_a, l, multicast, dir) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_a, worker_a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_MULTI_READ_TRANSACTION(noc_a, worker_a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc_a, worker_a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc_a, worker_a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_id, noc_a_lower, worker_a) LOG_READ_LEN_FROM_STATE() +#define DEBUG_SANITIZE_NOC_READ_TRANSACTION_WITH_ADDR_STATE(noc_id, noc_a_lower, worker_a, l) LOG_LEN(l) +#define DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc_id, noc_a_lower, worker_a) LOG_WRITE_LEN_FROM_STATE() #endif // WATCHER_ENABLED && not TRISC diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 7a97fe4a962..dea20343f66 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -68,6 +68,30 @@ static inline int GetNumRiscs(int chip_id, const CoreCoord &core) { return (tt::llrt::is_ethernet_core(core, chip_id))? DPRINT_NRISCVS_ETH : DPRINT_NRISCVS; } +// Helper function to get all physical printable cores on a device +static map> get_all_physical_printable_cores(Device *device) { + map> all_physical_printable_cores; + // The set of all printable cores is Tensix + Eth cores + CoreCoord logical_grid_size = device->logical_grid_size(); + for (uint32_t x = 0; x < logical_grid_size.x; x++) { + for (uint32_t y = 0; y < logical_grid_size.y; y++) { + CoreCoord logical_coord(x, y); + CoreCoord worker_core = device->worker_core_from_logical_core(logical_coord); + all_physical_printable_cores[CoreType::WORKER].insert(worker_core); + } + } + for (const auto& eth_core : device->get_active_ethernet_cores()) { + CoreCoord physical_core = device->ethernet_core_from_logical_core(eth_core); + all_physical_printable_cores[CoreType::ETH].insert(physical_core); + } + for (const auto& eth_core : device->get_inactive_ethernet_cores()) { + CoreCoord physical_core = device->ethernet_core_from_logical_core(eth_core); + all_physical_printable_cores[CoreType::ETH].insert(physical_core); + } + + return all_physical_printable_cores; +} + // A null stream for when the print server is muted. class NullBuffer : public std::streambuf { public: @@ -131,6 +155,8 @@ struct DebugPrintServerContext { std::ofstream* outfile_ = nullptr; // non-cout std::ostream* stream_ = nullptr; // either == outfile_ or is &cout + std::ofstream* noc_log_ = nullptr; + std::map noc_xfer_counts; // A map to from {device id, core coord x, y, hart index} to the signal code it's waiting for. std::map, uint32_t> hart_waiting_on_signal_; @@ -297,7 +323,8 @@ void WriteInitMagic(Device *device, const CoreCoord& core, int hart_id, bool ena // TODO(AP): this could use a cleanup - need a different mechanism to know if a kernel is running on device. // Force wait for first kernel launch by first writing a non-zero and waiting for a zero. - vector initbuf = { uint32_t(enabled ? DEBUG_PRINT_SERVER_STARTING_MAGIC : DEBUG_PRINT_SERVER_DISABLED_MAGIC) }; + vector initbuf = vector(PRINT_BUFFER_SIZE / sizeof(uint32_t), 0); + initbuf[0] = uint32_t(enabled ? DEBUG_PRINT_SERVER_STARTING_MAGIC : DEBUG_PRINT_SERVER_DISABLED_MAGIC); tt::llrt::write_hex_vec_to_core(device->id(), core, initbuf, base_addr); } // WriteInitMagic @@ -327,6 +354,7 @@ DebugPrintServerContext::DebugPrintServerContext() { outfile_ = new std::ofstream(file_name); } stream_ = outfile_ ? outfile_ : &cout; + noc_log_ = new std::ofstream("noc_log.csv"); stop_print_server_ = false; mute_print_server_ = false; @@ -355,6 +383,10 @@ DebugPrintServerContext::~DebugPrintServerContext() { outfile_->close(); delete outfile_; } + for (auto &size_and_count : noc_xfer_counts) + *noc_log_ << size_and_count.first << "," << size_and_count.second << "\n"; + noc_log_->close(); + delete noc_log_; inst = nullptr; } // ~DebugPrintServerContext @@ -384,24 +416,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // A set of all valid printable cores, used for checking the user input. Note that the coords // here are physical. - map> all_physical_printable_cores; - // The set of all printable cores is Tensix + Eth cores - CoreCoord logical_grid_size = device->logical_grid_size(); - for (uint32_t x = 0; x < logical_grid_size.x; x++) { - for (uint32_t y = 0; y < logical_grid_size.y; y++) { - CoreCoord logical_coord(x, y); - CoreCoord worker_core = device->worker_core_from_logical_core(logical_coord); - all_physical_printable_cores[CoreType::WORKER].insert(worker_core); - } - } - for (const auto& eth_core : device->get_active_ethernet_cores()) { - CoreCoord physical_core = device->ethernet_core_from_logical_core(eth_core); - all_physical_printable_cores[CoreType::ETH].insert(physical_core); - } - for (const auto& eth_core : device->get_inactive_ethernet_cores()) { - CoreCoord physical_core = device->ethernet_core_from_logical_core(eth_core); - all_physical_printable_cores[CoreType::ETH].insert(physical_core); - } + map> all_physical_printable_cores = get_all_physical_printable_cores(device); // Initialize all print buffers on all cores on the device to have print disabled magic. We // will then write print enabled magic for only the cores the user has specified to monitor. @@ -554,6 +569,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { uint32_t wpos = from_dev[0], rpos = from_dev[1]; if (rpos < wpos) { outstanding_prints = true; + log_warning("Phys core {} is still waiting!", core.str()); break; } } @@ -570,6 +586,17 @@ void DebugPrintServerContext::DetachDevice(Device* device) { device_to_core_range_.erase(device); device_to_core_range_lock_.unlock(); log_info(tt::LogMetal, "DPRINT Server dettached device {}", device->id()); + + // When detaching a device, disable prints on it. + map> all_physical_printable_cores = get_all_physical_printable_cores(device); + for (auto &type_and_cores : all_physical_printable_cores) { + for (auto &core : type_and_cores.second) { + int hart_count = GetNumRiscs(device->id(), core); + for (int hart_index = 0; hart_index < hart_count; hart_index++) { + WriteInitMagic(device, core, hart_index, false); + } + } + } } // DetachDevice void DebugPrintServerContext::ClearLogFile() { @@ -709,6 +736,11 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking( stream << std::setprecision(*ptr); TT_ASSERT(sz == 1); break; + case DPrintNOC_LOG_XFER: + if (tt::llrt::OptionsG.get_dprint_noc_transfers()) + noc_xfer_counts[*reinterpret_cast(ptr)]++; + TT_ASSERT(sz == 4); + break; case DPrintFIXED: stream << std::fixed; TT_ASSERT(sz == 1); diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 459e3c4a23d..9788155b973 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -692,7 +692,7 @@ void Device::compile_command_queue_programs() { .eth_mode = Eth::IDLE, .noc = NOC::NOC_0, .compile_args = mux_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } else { @@ -704,7 +704,7 @@ void Device::compile_command_queue_programs() { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = mux_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } @@ -746,7 +746,10 @@ void Device::compile_command_queue_programs() { tunneler_logical_core, tt_metal::EthernetConfig{ .noc = tt_metal::NOC::NOC_0, - .compile_args = tunneler_l_compile_args + .compile_args = tunneler_l_compile_args, + // Skip noc logging for tunneling cores, since stopping the print server can hang + // the chip in this case. + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); log_debug(LogDevice, "run tunneler at {}", tunneler_location.str()); @@ -817,7 +820,7 @@ void Device::compile_command_queue_programs() { .eth_mode = Eth::IDLE, .noc = NOC::NOC_0, .compile_args = demux_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } else { @@ -829,7 +832,7 @@ void Device::compile_command_queue_programs() { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } @@ -999,7 +1002,10 @@ void Device::compile_command_queue_programs() { r_tunneler_logical_core, tt_metal::EthernetConfig{ .noc = tt_metal::NOC::NOC_0, - .compile_args = tunneler_r_compile_args + .compile_args = tunneler_r_compile_args, + // Skip noc logging for tunneling cores, since stopping the print server can hang + // the chip in this case. + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); log_debug(LogDevice, "run tunneler at device {} Core {}", this->id(), r_tunneler_logical_core.str()); @@ -1067,7 +1073,7 @@ void Device::compile_command_queue_programs() { .eth_mode = Eth::IDLE, .noc = NOC::NOC_0, .compile_args = demux_d_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } else { @@ -1079,7 +1085,7 @@ void Device::compile_command_queue_programs() { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = demux_d_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } @@ -1266,7 +1272,7 @@ void Device::compile_command_queue_programs() { .eth_mode = Eth::IDLE, .noc = NOC::NOC_0, .compile_args = mux_d_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } else { @@ -1278,7 +1284,7 @@ void Device::compile_command_queue_programs() { .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = mux_d_compile_args, - .defines = {} + .defines = {{"SKIP_NOC_LOGGING", "1"}} } ); } diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 14ec64625b4..aaf6250b4a4 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -121,6 +121,10 @@ void RunTimeOptions::ParseDPrintEnv() { for (auto &core_type_and_cores : dprint_cores) if (core_type_and_cores.second.size() > 0) dprint_enabled = true; + + const char *print_noc_xfers = std::getenv("TT_METAL_DPRINT_NOC_TRANSFER_DATA"); + if (print_noc_xfers != nullptr) + dprint_noc_transfer_data = true; }; void RunTimeOptions::ParseDPrintCoreRange(const char* env_var, CoreType core_type) { diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 7d9449663f2..79e132550f8 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -40,6 +40,7 @@ class RunTimeOptions { bool dprint_all_chips = false; uint32_t dprint_riscv_mask = 0; std::string dprint_file_name; + bool dprint_noc_transfer_data = false; bool test_mode_enabled = false; @@ -127,6 +128,8 @@ class RunTimeOptions { inline void set_dprint_file_name(std::string file_name) { dprint_file_name = file_name; } + inline bool get_dprint_noc_transfers() { return dprint_noc_transfer_data; } + inline void set_dprint_noc_transfers(bool val) { dprint_noc_transfer_data = val; } // Used for both watcher and dprint servers, this dev option (no corresponding env var) sets // whether to catch exceptions (test mode = true) coming from debug servers or to throw them