Skip to content

Commit

Permalink
#3764: Device side opID support
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-aho authored and mo-tenstorrent committed Jul 11, 2024
1 parent 83d596e commit 374d29e
Show file tree
Hide file tree
Showing 17 changed files with 111 additions and 53 deletions.
43 changes: 21 additions & 22 deletions tests/scripts/run_profiler_regressions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,6 @@ source scripts/tools_setup_common.sh

set -eo pipefail

run_additional_T3000_test(){
remove_default_log_locations
mkdir -p $PROFILER_ARTIFACTS_DIR

./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log

cat $PROFILER_ARTIFACTS_DIR/test_out.log

if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED"
then
echo "No verification as test was skipped"
else
echo "Verifying test results"
runDate=$(ls $PROFILER_OUTPUT_DIR/)
LINE_COUNT=9 #1 header + 8 devices
res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT")
echo $res
fi
}

run_async_mode_T3000_test(){
#Some tests here do not skip grayskull
if [ "$ARCH_NAME" != "grayskull" ]; then
Expand All @@ -47,6 +27,27 @@ run_async_mode_T3000_test(){
fi
}

run_additional_T3000_test(){
remove_default_log_locations
mkdir -p $PROFILER_ARTIFACTS_DIR

./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log

cat $PROFILER_ARTIFACTS_DIR/test_out.log

if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED"
then
echo "No verification as test was skipped"
else
echo "Verifying test results"
runDate=$(ls $PROFILER_OUTPUT_DIR/)
LINE_COUNT=9 #1 header + 8 devices
res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT")
echo $res
run_async_mode_T3000_test
fi
}

run_profiling_test(){
if [[ -z "$ARCH_NAME" ]]; then
echo "Must provide ARCH_NAME in environment" 1>&2
Expand All @@ -60,8 +61,6 @@ run_profiling_test(){

run_additional_T3000_test

run_async_mode_T3000_test

TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py

remove_default_log_locations
Expand Down
9 changes: 9 additions & 0 deletions tt_eager/tt_dnn/op_library/run_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ OutputTensors run_device_operation(
operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors);
uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id();

if (std::holds_alternative<std::reference_wrapper<Program>>(program))
{
std::get<std::reference_wrapper<Program>>(program).get().set_global_id(op_id);
}
else
{
std::get<std::shared_ptr<Program>>(program)->set_global_id(op_id);
}

// Enqueue or Launch Program
std::visit(
[&operation, &input_tensors, &optional_input_tensors, &output_tensors, queue](auto&& program) {
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ int main() {

{
DeviceZoneScopedMainN("BRISC-FW");
DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id);

// Copies from L1 to IRAM on chips where NCRISC has IRAM
l1_to_ncrisc_iram_copy(mailboxes->launch.kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16);
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) {
// FD: assume that no more host -> remote writes are pending
if (mailboxes->launch.go.run == RUN_MSG_GO) {
DeviceZoneScopedMainN("ERISC-FW");
DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id);
DEBUG_STATUS("R");
uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base;
rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base +
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ int main() {

{
DeviceZoneScopedMainN("ERISC-IDLE-FW");
DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id);

noc_index = mailboxes->launch.kernel_config.brisc_noc_id;

Expand Down
3 changes: 3 additions & 0 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ struct kernel_config_msg_t {
volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX];
volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units

volatile uint16_t host_assigned_op_id;

// Ring buffer of kernel configuration data
volatile uint32_t kernel_config_base;
dyn_mem_map_t mem_map[DISPATCH_CLASS_MAX];
Expand All @@ -92,6 +94,7 @@ struct kernel_config_msg_t {
volatile uint8_t dispatch_core_y;
volatile uint8_t exit_erisc_kernel;
volatile uint8_t pad1;
volatile uint16_t pad2;
} __attribute__((packed));

struct go_msg_t {
Expand Down
6 changes: 5 additions & 1 deletion tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
}

void EnqueueProgramCommand::assemble_device_commands() {
ZoneScoped;
auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id];
if (!program.is_finalized()) {
// Calculate size of command and fill program indices of data to update
Expand Down Expand Up @@ -895,6 +896,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV;
kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x;
kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y;
kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id();
const void* launch_message_data = (const void*)(&kernel_group.launch_msg);
for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
CoreCoord physical_start =
Expand All @@ -917,11 +919,11 @@ void EnqueueProgramCommand::assemble_device_commands() {
this->packed_write_max_unicast_sub_cmds,
multicast_go_signals_payload);
}

for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) {
kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV;
kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x;
kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y;
kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id();
const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg);
for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) {
for (auto x = core_range.start.x; x <= core_range.end.x; x++) {
Expand Down Expand Up @@ -1113,6 +1115,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
}
} else {
uint32_t i = 0;
ZoneScopedN("program_loaded_on_device");
for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) {
uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i];
for (const shared_ptr<CircularBuffer>& cb : cbs_on_core_range) {
Expand All @@ -1134,6 +1137,7 @@ void EnqueueProgramCommand::assemble_device_commands() {
for (auto& go_signal : cached_program_command_sequence.go_signals) {
go_signal->kernel_config.dispatch_core_x = this->dispatch_core.x;
go_signal->kernel_config.dispatch_core_y = this->dispatch_core.y;
go_signal->kernel_config.host_assigned_op_id = program.get_global_id();
}
}
}
Expand Down
4 changes: 3 additions & 1 deletion tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; }
std::atomic<uint64_t> Program::program_counter = 0;

Program::Program() :
id(program_counter++), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) {
id(program_counter++), global_id(0), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) {
std::set<CoreType> supported_core_types = {CoreType::WORKER, CoreType::ETH};
for (const auto &core_type : supported_core_types) {
kernels_.insert({core_type, {}});
Expand Down Expand Up @@ -934,5 +934,7 @@ void Program::compile(Device *device) {
this->loaded_onto_device = false;
}

void Program::set_global_id(uint64_t id) { this->global_id = id; }

Program::~Program() {}
} // namespace tt::tt_metal
3 changes: 3 additions & 0 deletions tt_metal/impl/program/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,13 @@ class Program {
Program(Program &&other) = default;
Program& operator=(Program &&other) = default;

void set_global_id(uint64_t id);
~Program();

void construct_core_range_set_for_worker_cores();

const uint64_t get_id() const { return this->id; }
const uint64_t get_global_id() const { return this->global_id; }

size_t num_kernels() const {
size_t count = 0;
Expand Down Expand Up @@ -177,6 +179,7 @@ class Program {
};

uint64_t id; // Need to make non-const due to move constructor
uint64_t global_id; // Need to make non-const due to move constructor
static std::atomic<uint64_t> program_counter;
std::unordered_map<CoreType, std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
std::unordered_map<CoreType, CoreCoord> grid_extent_;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/third_party/tracy
58 changes: 36 additions & 22 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ namespace kernel_profiler{
{
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

#pragma GCC unroll 65534
for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++)
{
eriscBuffer[i] = 0;
Expand All @@ -124,14 +123,13 @@ namespace kernel_profiler{
profiler_control_buffer[FLAT_ID] = core_flat_id;
}

#pragma GCC unroll 65534
for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++)
{
//TODO(MO): Clean up magic numbers
eriscBuffer[i] = 0x80000000;
}

eriscBuffer [ID_LL] = runCounter;
eriscBuffer [ID_LL] = (runCounter & 0xFFFF) | (eriscBuffer [ID_LL] & 0xFFFF0000);

#endif //ERISC_INIT
#if defined(COMPILE_FOR_BRISC)
Expand All @@ -146,7 +144,6 @@ namespace kernel_profiler{
{
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

#pragma GCC unroll 65534
for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++)
{
briscBuffer[i] = 0;
Expand All @@ -167,23 +164,21 @@ namespace kernel_profiler{
profiler_control_buffer[FLAT_ID] = core_flat_id;
}

#pragma GCC unroll 65534
for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++)
{
//TODO(MO): Clean up magic numbers
//TODO(MO): Clean up magic numbers
briscBuffer[i] = 0x80000000;
ncriscBuffer[i] = 0x80000000;
trisc0Buffer[i] = 0x80000000;
trisc1Buffer[i] = 0x80000000;
trisc2Buffer[i] = 0x80000000;
}

//TODO(MO): Clean up magic numbers
briscBuffer [ID_LL] = runCounter;
ncriscBuffer[ID_LL] = runCounter;
trisc0Buffer[ID_LL] = runCounter;
trisc1Buffer[ID_LL] = runCounter;
trisc2Buffer[ID_LL] = runCounter;
briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000);
ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000);
trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000);
trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000);
trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000);


#endif //BRISC_INIT
Expand All @@ -208,14 +203,6 @@ namespace kernel_profiler{
buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX];
}

inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id)
{
volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(kernel_profiler::profilerBuffer);
volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast<volatile tt_reg_ptr uint32_t *> (RISCV_DEBUG_REG_WALL_CLOCK_L);
buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF);
buffer[index+3] = p_reg[WALL_CLOCK_LOW_INDEX];
}

inline __attribute__((always_inline)) void mark_padding()
{
if (wIndex < PROFILER_L1_VECTOR_SIZE)
Expand All @@ -233,6 +220,28 @@ namespace kernel_profiler{
profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr;
}

inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue)
{
#if defined(COMPILE_FOR_ERISC)
volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER);

eriscBuffer[ID_LL] = (counterValue << 16) | (eriscBuffer[ID_LL] & 0xFFFF);
#endif //ERISC_INIT

#if defined(COMPILE_FOR_BRISC)
volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_BR);
volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_NC);
volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T0);
volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T1);
volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_T2);

briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF);
ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF);
trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF);
trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF);
trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF);
#endif //ERISC_INIT
}

inline __attribute__((always_inline)) void risc_finished_profiling()
{
Expand Down Expand Up @@ -374,6 +383,7 @@ namespace kernel_profiler{
#if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES)
SrcLocNameToHash("PROFILER-NOC-QUICK-SEND");
mark_time_at_index_inlined(wIndex, hash);
wIndex += PROFILER_L1_MARKER_UINT32_SIZE;
core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]];

uint32_t dram_offset =
Expand All @@ -389,8 +399,8 @@ namespace kernel_profiler{

uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset);

mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash));
wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE;
mark_time_at_index_inlined(wIndex, get_end_timer_id(hash));
wIndex += PROFILER_L1_MARKER_UINT32_SIZE;

uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex;

Expand Down Expand Up @@ -523,6 +533,8 @@ namespace kernel_profiler{

#define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate<hash, 1> zone = kernel_profiler::profileScopeAccumulate<hash, 1>();

#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter);

#else

#define DeviceZoneScopedMainN( name )
Expand All @@ -537,4 +549,6 @@ namespace kernel_profiler{

#define DeviceZoneScopedND( name , nocBuffer, nocIndex )

#define DeviceZoneSetCounter( counter )

#endif
10 changes: 9 additions & 1 deletion tt_metal/tools/profiler/process_device_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,15 @@ def import_device_profile_log(logPath):
timerID = {"id": int(row[4].strip()), "zone_name": "", "zone_phase": "", "src_line": "", "src_file": ""}
timeData = int(row[5].strip())
statData = 0
if len(row) > 6:
if len(row) == 13:
statData = int(row[6].strip())
timerID["run_id"] = int(row[7].strip())
timerID["op_id"] = int(row[8].strip())
timerID["zone_name"] = row[9].strip()
timerID["zone_phase"] = row[10].strip()
timerID["src_line"] = int(row[11].strip())
timerID["src_file"] = row[12].strip()
elif len(row) == 12:
statData = int(row[6].strip())
timerID["run_id"] = int(row[7].strip())
timerID["zone_name"] = row[8].strip()
Expand Down
4 changes: 4 additions & 0 deletions tt_metal/tools/profiler/process_ops_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ def append_device_data(ops, deviceLogFolder):
cores = set()
for timeID, ts, statData, risc, core in deviceOpTime["timeseries"]:
if "zone_name" in timeID.keys() and "FW" in timeID["zone_name"]:
if "op_id" in timeID.keys():
assert (
timeID["op_id"] == deviceOp["global_call_count"]
), f"op id {timeID['op_id']} reproted by device is not matching assigned op id {deviceOp['global_call_count']}"
if core not in cores:
cores.add(core)
deviceOp["core_usage"] = {"count": len(cores), "cores": [str(core) for core in cores]}
Expand Down
Loading

0 comments on commit 374d29e

Please sign in to comment.