Skip to content

Commit

Permalink
#8223: Profiling dispatch cores
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed May 31, 2024
1 parent 0598421 commit a877879
Show file tree
Hide file tree
Showing 25 changed files with 364 additions and 160 deletions.
12 changes: 6 additions & 6 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def device(device_l1_small_size):

device = ttl.device.GetDefaultDevice()
yield device
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)


Expand All @@ -299,7 +299,7 @@ def pcie_devices(request):
yield [devices[i] for i in range(num_devices)]

for device in devices.values():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttl.device.CloseDevices(devices)
Expand All @@ -317,7 +317,7 @@ def all_devices(request):
yield [devices[i] for i in range(num_devices)]

for device in devices.values():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttl.device.CloseDevices(devices)
Expand All @@ -344,7 +344,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand Down Expand Up @@ -374,7 +374,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand Down Expand Up @@ -404,7 +404,7 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand Down
2 changes: 2 additions & 0 deletions tt_eager/tracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ def main():
testCommand = f"python -m tracy {osCmd}"

envVars = dict(os.environ)
# No Dispatch cores for op_report
envVars["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "0"
if options.device:
envVars["TT_METAL_DEVICE_PROFILER"] = "1"
else:
Expand Down
4 changes: 2 additions & 2 deletions tt_eager/tt_lib/csrc/tt_lib_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,14 @@ void DeviceModule(py::module &m_device) {
the FinishCommand. Once set to false, all subsequent commands will immediately notify the device
that the write pointer has been updated.
)doc");
m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("free_buffers") = false, R"doc(
m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("last_dump") = false, R"doc(
Dump device side profiling data.
+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump profiling data of | tt_lib.device.Device | | Yes |
| free_buffers | Option to free buffer | bool | | No |
| last_dump | Last dump before process dies | bool | | No |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");
m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc(
Expand Down
12 changes: 6 additions & 6 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ namespace tt::tt_metal{
* |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------|
* | device | The device holding the program being profiled. | Device * | | True |
* | core_coords | The logical core coordinates being profiled. | const std::unordered_map<CoreType, std::vector<CoreCoord>> & | | True |
* | free_buffers | Free up the profiler buffer spaces for the device | bool | | False |
* | last_dump | Last dump before process dies | bool | | False |
* */
void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cores, bool free_buffers = false);
void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cores, bool last_dump = false);

/**
* Traverse all cores and read device side profiler data and dump results into device side CSV log
Expand All @@ -177,9 +177,9 @@ namespace tt::tt_metal{
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------|
* | device | The device holding the program being profiled. | Device * | | True |
* | free_buffers | Free up the profiler buffer spaces for the device | bool | | False |
* | last_dump | Last dump before process dies | bool | | False |
* */
void DumpDeviceProfileResults(Device *device, bool free_buffers = false);
void DumpDeviceProfileResults(Device *device, bool last_dump = false);

/**
* Set the directory for device-side CSV logs produced by the profiler instance in the tt-metal module
Expand Down Expand Up @@ -333,9 +333,9 @@ namespace tt::tt_metal{
DispatchStateCheck(true);
LAZY_COMMAND_QUEUE_MODE = lazy;
}
inline void DumpDeviceProfiler(Device * device, bool free_buffers)
inline void DumpDeviceProfiler(Device * device, bool last_dump)
{
tt::tt_metal::detail::DumpDeviceProfileResults(device, free_buffers);
tt::tt_metal::detail::DumpDeviceProfileResults(device, last_dump);
}

void AllocateBuffer(Buffer* buffer, bool bottom_up);
Expand Down
5 changes: 4 additions & 1 deletion tt_metal/hostdevcommon/profiler_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#pragma once

#define PROFILER_OPT_DO_DISPATCH_CORES 2

namespace kernel_profiler{

constexpr static uint32_t PADDING_MARKER = ((1<<16) - 1);
Expand Down Expand Up @@ -40,7 +42,8 @@ namespace kernel_profiler{
RUN_COUNTER,
NOC_X,
NOC_Y,
FLAT_ID
FLAT_ID,
DROPPED_ZONES,
};


Expand Down
12 changes: 7 additions & 5 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,13 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used));
#define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12)

namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

void enable_power_management() {
// Mask and Hyst taken from tb_tensix math_tests
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

uint8_t noc_index = 0; // TODO: remove hardcoding
Expand Down
4 changes: 3 additions & 1 deletion tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

//inline void RISC_POST_STATUS(uint32_t status) {
Expand Down Expand Up @@ -101,7 +103,7 @@ int main() {
DEBUG_STATUS("GD");

{
DeviceZoneScopedMainN("ERISC-FW");
DeviceZoneScopedMainN("ERISC-IDLE-FW");

noc_index = mailboxes->launch.brisc_noc_id;

Expand Down
15 changes: 10 additions & 5 deletions tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,16 @@ uint32_t atomic_ret_val __attribute__((section("l1_data"))) __attribute__((used)
CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used));

namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
uint32_t nocWriteSize __attribute__((used));
uint32_t *nocWriteBuffer __attribute__((used));
uint32_t *nocWriteIndex __attribute__((used));
}

extern "C" void ncrisc_resume(void);
extern "C" void notify_brisc_and_halt(uint32_t status);
Expand Down
10 changes: 5 additions & 5 deletions tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
// clang-format on

namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
}

namespace ckernel {

Expand Down
9 changes: 7 additions & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1289,18 +1289,23 @@ bool Device::close() {
if (not this->initialized_) {
TT_THROW("Cannot close device {} that has not been initialized!", this->id_);
}
this->deallocate_buffers();
watcher_detach(this);

for (const std::unique_ptr<HWCommandQueue> &hw_command_queue : hw_command_queues_) {
if (hw_command_queue->manager.get_bypass_mode()) {
hw_command_queue->record_end();
}
hw_command_queue->terminate();
}

tt_metal::detail::DumpDeviceProfileResults(this, true);

this->trace_buffer_pool_.clear();
detail::EnableAllocs(this);

this->deallocate_buffers();
watcher_detach(this);


std::unordered_set<CoreCoord> not_done_dispatch_cores;
std::unordered_set<CoreCoord> cores_to_skip;

Expand Down
18 changes: 10 additions & 8 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ constexpr uint32_t l1_cache_size = ((max_write_packed_cores + l1_to_local_cache_

static uint32_t l1_cache[l1_cache_size];


// NOTE: CAREFUL USING THIS FUNCTION
// It is call "careful_copy" because you need to be careful...
// It copies beyond count by up to 5 elements make sure src and dst addresses are safe
Expand Down Expand Up @@ -886,17 +887,18 @@ void kernel_main() {
}
bool done = false;
while (!done) {
DeviceZoneScopedND("CQ-DISPATCH", block_noc_writes_to_clear, rd_block_idx );
if (cmd_ptr == cb_fence) {
get_cb_page<
dispatch_cb_base,
dispatch_cb_blocks,
dispatch_cb_log_page_size,
my_noc_xy,
my_dispatch_cb_sem_id>(cmd_ptr,
cb_fence,
block_noc_writes_to_clear,
block_next_start_addr,
rd_block_idx);
cb_fence,
block_noc_writes_to_clear,
block_next_start_addr,
rd_block_idx);
}

done = is_d_variant ?
Expand All @@ -909,10 +911,10 @@ void kernel_main() {
// XXXXX move this inside while loop waiting for get_dispatch_cb_page above
// XXXXX can potentially clear a partial block when stalled w/ some more bookkeeping
cb_block_release_pages<upstream_noc_xy,
upstream_dispatch_cb_sem_id,
dispatch_cb_blocks,
dispatch_cb_pages_per_block>(block_noc_writes_to_clear,
wr_block_idx);
upstream_dispatch_cb_sem_id,
dispatch_cb_blocks,
dispatch_cb_pages_per_block>(block_noc_writes_to_clear,
wr_block_idx);
}

noc_async_write_barrier();
Expand Down
13 changes: 7 additions & 6 deletions tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ static struct PrefetchExecBufState {

// Feature to stall the prefetcher, mainly for ExecBuf impl which reuses CmdDataQ
static enum StallState { STALL_NEXT = 2, STALLED = 1, NOT_STALLED = 0} stall_state = NOT_STALLED;

static_assert((downstream_cb_base & (downstream_cb_page_size - 1)) == 0);

template<bool cmddat_wrap_enable,
Expand Down Expand Up @@ -217,7 +216,7 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
}
if (!cmd_ready) {
if (pending_read_size != 0) {
DPRINT << "fetch_q_get_cmds barrier" << ENDL();
DPRINT << "fetch_q_get_cmds barrier" << ENDL();
noc_async_read_barrier();

// wrap the cmddat_q
Expand Down Expand Up @@ -246,7 +245,7 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
// By here, prefetch_q_ready must be false
// Nothing to fetch, nothing pending, nothing available, stall on host
DEBUG_STATUS("HQW");
DPRINT << "fetch_q_get_cmds stall" << ENDL();
DPRINT << "fetch_q_get_cmds stall" << ENDL();
while ((fetch_size = *prefetch_q_rd_ptr) == 0);
fetch_q_get_cmds<preamble_size>(fence, cmd_ptr, pcie_read_ptr);
DEBUG_STATUS("HQD");
Expand Down Expand Up @@ -295,6 +294,7 @@ static uint32_t process_relay_inline_cmd(uint32_t cmd_ptr,

// Assume the downstream buffer is big relative to cmddat command size that we can
// grab what we need in one chunk

cb_acquire_pages<my_noc_xy, my_downstream_cb_sem_id>(npages);

uint32_t remaining = cmddat_q_end - data_ptr;
Expand Down Expand Up @@ -804,6 +804,7 @@ bool process_cmd(uint32_t& cmd_ptr,
uint32_t& downstream_data_ptr,
uint32_t& stride) {

DeviceZoneScopedND("PROCESS-CMD", block_noc_writes_to_clear, rd_block_idx );
volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr;
bool done = false;

Expand All @@ -830,7 +831,7 @@ bool process_cmd(uint32_t& cmd_ptr,
break;

case CQ_PREFETCH_CMD_RELAY_INLINE:
DPRINT << "relay inline" << ENDL();
DPRINT << "relay inline" << ENDL();
if (exec_buf) {
stride = process_relay_inline_exec_buf_cmd(cmd_ptr, downstream_data_ptr);
} else {
Expand Down Expand Up @@ -860,7 +861,7 @@ bool process_cmd(uint32_t& cmd_ptr,
break;

case CQ_PREFETCH_CMD_STALL:
DPRINT << "stall" << ENDL();
DPRINT << "stall" << ENDL();
stride = process_stall(cmd_ptr);
break;

Expand Down Expand Up @@ -1093,9 +1094,9 @@ void kernel_main_hd() {

uint32_t cmd_ptr = cmddat_q_base;
uint32_t fence = cmddat_q_base;

bool done = false;
while (!done) {
DeviceZoneScopedND("KERNEL-MAIN-HD", block_noc_writes_to_clear, rd_block_idx );
constexpr uint32_t preamble_size = 0;
fetch_q_get_cmds<preamble_size>(fence, cmd_ptr, pcie_read_ptr);

Expand Down
7 changes: 6 additions & 1 deletion tt_metal/jit_build/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,12 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) {
this->defines_ += "-DTENSIX_FIRMWARE -DLOCAL_MEM_EN=0 ";

if (tt::tt_metal::getDeviceProfilerState()) {
this->defines_ += "-DPROFILE_KERNEL=1 ";
if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) {
//TODO(MO): Standard bit mask for device side profiler options
this->defines_ += "-DPROFILE_KERNEL=2 ";
} else {
this->defines_ += "-DPROFILE_KERNEL=1 ";
}
}

if (tt::llrt::OptionsG.get_watcher_enabled()) {
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/llrt/llrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC
bool is_active_eth_core = false;
bool is_inactive_eth_core = false;

// Determine whether an ethernet core is active or idle. Their host handshake interfaces are different.
// Determine whether an ethernet core is active or idle. Their host handshake interfaces are different.
if (is_eth_core) {
auto active_eth_cores = tt::Cluster::instance().get_active_ethernet_cores(chip_id);
auto inactive_eth_cores = tt::Cluster::instance().get_inactive_ethernet_cores(chip_id);
Expand Down
Loading

0 comments on commit a877879

Please sign in to comment.