Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mo/8223 fd2 dispatch core profiler support #8609

Merged
merged 1 commit into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def device(request, device_params):

yield device

ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttl.device.Synchronize(device)
Expand All @@ -292,7 +292,7 @@ def pcie_devices(request, device_params):
yield [devices[i] for i in range(num_devices)]

for device in devices.values():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttl.device.CloseDevices(devices)
Expand All @@ -310,7 +310,7 @@ def all_devices(request, device_params):
yield [devices[i] for i in range(num_devices)]

for device in devices.values():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttl.device.CloseDevices(devices)
Expand All @@ -334,7 +334,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand All @@ -361,7 +361,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand All @@ -388,7 +388,7 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
import tt_lib as ttl

for device in device_mesh.get_devices():
ttl.device.DumpDeviceProfiler(device, True)
ttl.device.DumpDeviceProfiler(device)
ttl.device.DeallocateBuffers(device)

ttnn.close_device_mesh(device_mesh)
Expand Down
9 changes: 2 additions & 7 deletions tests/scripts/run_profiler_regressions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,7 @@ run_profiling_test(){

run_async_mode_T3000_test

TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_custom_cycle_count -vvv
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_full_buffer -vvv
#TODO(MO): Needed until #6560 is fixed.
if [ "$ARCH_NAME" != "grayskull" ]; then
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op -vvv
fi
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py

remove_default_log_locations

Expand All @@ -92,7 +87,7 @@ run_profiling_no_reset_test(){
source python_env/bin/activate
export PYTHONPATH=$TT_METAL_HOME

TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op -vvv
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler_gs_no_reset.py

remove_default_log_locations
}
Expand Down
42 changes: 42 additions & 0 deletions tests/tt_metal/tools/profiler/test_device_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
clear_profiler_runtime_artifacts,
)

from models.utility_functions import skip_for_grayskull

PROG_EXMP_DIR = "programming_examples/profiler"


Expand Down Expand Up @@ -53,6 +55,7 @@ def get_function_name():
return frame.f_code.co_name


@skip_for_grayskull()
def test_multi_op():
OP_COUNT = 1000
RUN_COUNT = 2
Expand Down Expand Up @@ -129,3 +132,42 @@ def test_full_buffer():
assert stats[statNameEth]["stats"]["Count"] % (OP_COUNT * ZONE_COUNT) == 0, "Wrong Eth Marker Repeat count"
else:
assert stats[statName]["stats"]["Count"] in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong Marker Repeat count"


def test_dispatch_cores():
OP_COUNT = 1
RISC_COUNT = 1
ZONE_COUNT = 37
REF_COUNT_DICT = {
"grayskull": {
"Tensix CQ Dispatch": 37,
"Tensix CQ Prefetch": 44,
},
"wormhole_b0": {
"Tensix CQ Dispatch": 37,
"Tensix CQ Prefetch": 44,
},
}

ENV_VAR_ARCH_NAME = os.getenv("ARCH_NAME")
assert ENV_VAR_ARCH_NAME in REF_COUNT_DICT.keys()

os.environ["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "1"

devicesData = run_device_profiler_test(setup=True)

stats = devicesData["data"]["devices"]["0"]["cores"]["DEVICE"]["analysis"]

verifiedStat = []
for stat in REF_COUNT_DICT[ENV_VAR_ARCH_NAME].keys():
if stat in stats.keys():
verifiedStat.append(stat)
assert stats[stat]["stats"]["Count"] == REF_COUNT_DICT[ENV_VAR_ARCH_NAME][stat], "Wrong Dispatch zone count"

statTypes = ["Dispatch", "Prefetch"]
statTypesSet = set(statTypes)
for statType in statTypes:
for stat in verifiedStat:
if statType in stat:
statTypesSet.remove(statType)
assert len(statTypesSet) == 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from tests.tt_metal.tools.profiler import test_device_profiler


def test_multi_op_gs_no_reset():
test_device_profiler.test_multi_op()
2 changes: 2 additions & 0 deletions tt_eager/tracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ def main():
testCommand = f"python -m tracy {osCmd}"

envVars = dict(os.environ)
# No Dispatch cores for op_report
envVars["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "0"
if options.device:
envVars["TT_METAL_DEVICE_PROFILER"] = "1"
else:
Expand Down
4 changes: 2 additions & 2 deletions tt_eager/tt_lib/csrc/tt_lib_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,14 @@ void DeviceModule(py::module &m_device) {
the FinishCommand. Once set to false, all subsequent commands will immediately notify the device
that the write pointer has been updated.
)doc");
m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("free_buffers") = false, R"doc(
m_device.def("DumpDeviceProfiler", &detail::DumpDeviceProfiler, py::arg("device"), py::arg("last_dump") = false, R"doc(
Dump device side profiling data.

+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump profiling data of | tt_lib.device.Device | | Yes |
| free_buffers | Option to free buffer | bool | | No |
| last_dump | Last dump before process dies | bool | | No |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");
m_device.def("DeallocateBuffers", &detail::DeallocateBuffers, R"doc(
Expand Down
12 changes: 6 additions & 6 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ namespace tt::tt_metal{
* |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------|
* | device | The device holding the program being profiled. | Device * | | True |
* | core_coords | The logical core coordinates being profiled. | const std::unordered_map<CoreType, std::vector<CoreCoord>> & | | True |
* | free_buffers | Free up the profiler buffer spaces for the device | bool | | False |
* | last_dump | Last dump before process dies | bool | | False |
* */
void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cores, bool free_buffers = false);
void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cores, bool last_dump = false);

/**
* Traverse all cores and read device side profiler data and dump results into device side CSV log
Expand All @@ -177,9 +177,9 @@ namespace tt::tt_metal{
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|--------------------------------------------------------------|---------------------------|----------|
* | device | The device holding the program being profiled. | Device * | | True |
* | free_buffers | Free up the profiler buffer spaces for the device | bool | | False |
* | last_dump | Last dump before process dies | bool | | False |
* */
void DumpDeviceProfileResults(Device *device, bool free_buffers = false);
void DumpDeviceProfileResults(Device *device, bool last_dump = false);

/**
* Set the directory for device-side CSV logs produced by the profiler instance in the tt-metal module
Expand Down Expand Up @@ -333,9 +333,9 @@ namespace tt::tt_metal{
DispatchStateCheck(true);
LAZY_COMMAND_QUEUE_MODE = lazy;
}
inline void DumpDeviceProfiler(Device * device, bool free_buffers)
inline void DumpDeviceProfiler(Device * device, bool last_dump)
{
tt::tt_metal::detail::DumpDeviceProfileResults(device, free_buffers);
tt::tt_metal::detail::DumpDeviceProfileResults(device, last_dump);
}

void AllocateBuffer(Buffer* buffer, bool bottom_up);
Expand Down
6 changes: 5 additions & 1 deletion tt_metal/hostdevcommon/profiler_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#pragma once

#define PROFILER_OPT_DO_DISPATCH_CORES 2

namespace kernel_profiler{

constexpr static uint32_t PADDING_MARKER = ((1<<16) - 1);
Expand Down Expand Up @@ -40,7 +42,9 @@ namespace kernel_profiler{
RUN_COUNTER,
NOC_X,
NOC_Y,
FLAT_ID
FLAT_ID,
DROPPED_ZONES,
PROFILER_DONE,
};


Expand Down
13 changes: 8 additions & 5 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,15 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used));

#define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12)

#if defined(PROFILE_KERNEL)
namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}
#endif

void enable_power_management() {
// Mask and Hyst taken from tb_tensix math_tests
Expand Down
3 changes: 3 additions & 0 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@ void ApplicationHandler(void) __attribute__((__section__(".init")));
}
#endif

#if defined(PROFILE_KERNEL)
namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}
#endif

uint8_t noc_index = 0; // TODO: remove hardcoding
uint8_t my_x[NUM_NOCS] __attribute__((used));
Expand Down
5 changes: 4 additions & 1 deletion tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,15 @@ constexpr uint32_t num_cbs_to_early_init = 4; // safe small number to overlap w

CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used));

#if defined(PROFILE_KERNEL)
namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}
#endif

//inline void RISC_POST_STATUS(uint32_t status) {
// volatile uint32_t* ptr = (volatile uint32_t*)(NOC_CFG(ROUTER_CFG_2));
Expand Down Expand Up @@ -101,7 +104,7 @@ int main() {
DEBUG_STATUS("GD");

{
DeviceZoneScopedMainN("ERISC-FW");
DeviceZoneScopedMainN("ERISC-IDLE-FW");

noc_index = mailboxes->launch.brisc_noc_id;

Expand Down
16 changes: 11 additions & 5 deletions tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@ uint32_t atomic_ret_val __attribute__((section("l1_data"))) __attribute__((used)

CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used));

#if defined(PROFILE_KERNEL)
namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
uint16_t core_flat_id __attribute__((used));
uint32_t nocWriteSize __attribute__((used));
uint32_t *nocWriteBuffer __attribute__((used));
uint32_t *nocWriteIndex __attribute__((used));
}
#endif

extern "C" void ncrisc_resume(void);
extern "C" void notify_brisc_and_halt(uint32_t status);
Expand Down
12 changes: 7 additions & 5 deletions tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
#include "circular_buffer.h"
// clang-format on

#if defined(PROFILE_KERNEL)
namespace kernel_profiler {
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
} // namespace kernel_profiler
uint32_t wIndex __attribute__((used));
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
}
#endif

namespace ckernel {

Expand Down
9 changes: 7 additions & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1307,18 +1307,23 @@ bool Device::close() {
if (not this->initialized_) {
TT_THROW("Cannot close device {} that has not been initialized!", this->id_);
}
this->deallocate_buffers();
watcher_detach(this);

for (const std::unique_ptr<HWCommandQueue> &hw_command_queue : hw_command_queues_) {
if (hw_command_queue->manager.get_bypass_mode()) {
hw_command_queue->record_end();
}
hw_command_queue->terminate();
}

tt_metal::detail::DumpDeviceProfileResults(this, true);
Copy link
Contributor Author

@mo-tenstorrent mo-tenstorrent Jun 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pgkeller one last profiler data dump is performed after terminate has been sent.

Ideally we should have a sync here, there is no guarantee that dispatch BRISCs have reached their FW end when we are reading here. However, the consequences are not too bad, you might miss FW and KERNEL markers on NCRISC and BRISC (Area encircled in red below). But shorter child calls will be caught.

Screenshot 2024-06-04 at 10 02 34 AM

Copy link
Contributor Author

@mo-tenstorrent mo-tenstorrent Jun 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we ok with a poll on a profiler L1 location on dispatch cores to flag profiler has finished on those cores?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

polling L1 from host isn't great, but after terminate is sent shouldn't be a big deal. maybe usleep a bit first in case a lot of work has been queued so you don't perturb performance you are measuring? or poll w/ delays between?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this is at the very end of the run and only in profiler builds. I will poll w/ delays, starting with a delay.


this->trace_buffer_pool_.clear();
detail::EnableAllocs(this);

this->deallocate_buffers();
watcher_detach(this);


std::unordered_set<CoreCoord> not_done_dispatch_cores;
std::unordered_set<CoreCoord> cores_to_skip;

Expand Down
1 change: 1 addition & 0 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,7 @@ void kernel_main() {
}
bool done = false;
while (!done) {
DeviceZoneScopedND("CQ-DISPATCH", block_noc_writes_to_clear, rd_block_idx );
if (cmd_ptr == cb_fence) {
get_cb_page<
dispatch_cb_base,
Expand Down
3 changes: 2 additions & 1 deletion tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,7 @@ bool process_cmd(uint32_t& cmd_ptr,
uint32_t& downstream_data_ptr,
uint32_t& stride) {

DeviceZoneScopedND("PROCESS-CMD", block_noc_writes_to_clear, rd_block_idx );
volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr;
bool done = false;

Expand Down Expand Up @@ -1101,9 +1102,9 @@ void kernel_main_hd() {

uint32_t cmd_ptr = cmddat_q_base;
uint32_t fence = cmddat_q_base;

bool done = false;
while (!done) {
DeviceZoneScopedND("KERNEL-MAIN-HD", block_noc_writes_to_clear, rd_block_idx );
constexpr uint32_t preamble_size = 0;
fetch_q_get_cmds<preamble_size>(fence, cmd_ptr, pcie_read_ptr);

Expand Down
Loading
Loading