Skip to content

Commit

Permalink
#13655: Refactor dispatching of go signal to not send noc txn data af…
Browse files Browse the repository at this point in the history
…ter go signal command

Instead, we populate a static array on dispatcher when we change sub-device configurations with all noc txn data, and read from it using an offset passed in the go signal command
Remove dynamic allocation of sub-devices/expected workers pairs, and pass them as separate spans
Fix cmd in sweep_pgm_dispatch
  • Loading branch information
tt-aho committed Nov 14, 2024
1 parent 5d470b7 commit 14a6f6f
Show file tree
Hide file tree
Showing 20 changed files with 538 additions and 252 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
Finish
======

.. doxygenfunction:: tt::tt_metal::v0::Finish(CommandQueue& cq)
.. doxygenfunction:: tt::tt_metal::v0::Finish
4 changes: 2 additions & 2 deletions tests/tt_metal/tools/profiler/test_device_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,11 @@ def test_dispatch_cores():
REF_COUNT_DICT = {
"grayskull": {
"Tensix CQ Dispatch": 16,
"Tensix CQ Prefetch": 24,
"Tensix CQ Prefetch": 25,
},
"wormhole_b0": {
"Tensix CQ Dispatch": 16,
"Tensix CQ Prefetch": 24,
"Tensix CQ Prefetch": 25,
},
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 40
echo "###" all procesors all cores 32 rta
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 256 -x $max_x -y $max_y -a 32 $trace_option
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 512 -x $max_x -y $max_y -a 32 $trace_option
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 1024 -x $max_x -y $max_y -a 32 $trace_optionv
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 1024 -x $max_x -y $max_y -a 32 $trace_option
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 2048 -x $max_x -y $max_y -a 32 $trace_option
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 4096 -x $max_x -y $max_y -a 32 $trace_option
# build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 8192 -x $max_x -y $max_y -a 32 $trace_option
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ int main(int argc, char **argv) {
num_compute_cores, // max_write_packed_cores
0,
dispatch_constants::DISPATCH_MESSAGE_ENTRIES,
dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES,
0,
0,
0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1921,6 +1921,7 @@ void configure_for_single_chip(Device *device,
num_compute_cores, // max_write_packed_cores
0,
dispatch_constants::DISPATCH_MESSAGE_ENTRIES,
dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES,
0,
0,
0,
Expand All @@ -1941,6 +1942,7 @@ void configure_for_single_chip(Device *device,
dispatch_compile_args[13] = dispatch_h_cb_sem;
dispatch_compile_args[14] = dispatch_d_preamble_size;
dispatch_compile_args[21] = dispatch_constants::DISPATCH_MESSAGE_ENTRIES;
dispatch_compile_args[22] = dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES;
CoreCoord phys_dispatch_d_downstream_core =
packetized_path_en_g ? phys_dispatch_relay_mux_core : phys_dispatch_h_core;
configure_kernel_variant<true, false>(program,
Expand All @@ -1962,6 +1964,7 @@ void configure_for_single_chip(Device *device,
dispatch_compile_args[13] = dispatch_downstream_cb_sem;
dispatch_compile_args[14] = 0; // preamble size
dispatch_compile_args[21] = 1; // max_num_worker_sems is used for array sizing, set to 1 even if array isn't used
dispatch_compile_args[22] = 1; // max_num_go_signal_noc_data_entries is used for array sizing, set to 1 even if array isn't used
CoreCoord phys_dispatch_h_upstream_core =
packetized_path_en_g ? phys_dispatch_relay_demux_core : phys_dispatch_core;
configure_kernel_variant<false, true>(program,
Expand Down Expand Up @@ -2666,6 +2669,7 @@ void configure_for_multi_chip(Device *device,
num_compute_cores,
0,
dispatch_constants::DISPATCH_MESSAGE_ENTRIES,
dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES,
0,
0,
0,
Expand All @@ -2686,6 +2690,7 @@ void configure_for_multi_chip(Device *device,
dispatch_compile_args[13] = dispatch_h_cb_sem;
dispatch_compile_args[14] = dispatch_d_preamble_size;
dispatch_compile_args[21] = dispatch_constants::DISPATCH_MESSAGE_ENTRIES;
dispatch_compile_args[22] = dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES;
CoreCoord phys_dispatch_d_downstream_core =
packetized_path_en_g ? phys_dispatch_relay_mux_core : phys_dispatch_h_core;
configure_kernel_variant<true, false>(program_r,
Expand All @@ -2706,6 +2711,7 @@ void configure_for_multi_chip(Device *device,
dispatch_compile_args[13] = dispatch_downstream_cb_sem;
dispatch_compile_args[14] = 0; // preamble size
dispatch_compile_args[21] = 1; // max_num_worker_sems is used for array sizing, set to 1 even if array isn't used
dispatch_compile_args[22] = 1; // max_num_go_signal_noc_data_entries is used for array sizing, set to 1 even if array isn't used
CoreCoord phys_dispatch_h_upstream_core =
packetized_path_en_g ? phys_dispatch_relay_demux_core : phys_dispatch_core;
configure_kernel_variant<false, true>(program,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "tt_metal/impl/device/device.hpp"
#include "tt_metal/impl/event/event.hpp"
#include "tt_metal/impl/sub_device/sub_device.hpp"
#include "tests/tt_metal/test_utils/stimulus.hpp"
#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp"

using namespace tt::tt_metal;
Expand Down Expand Up @@ -104,7 +105,7 @@ std::tuple<Program, Program, Program, std::unique_ptr<GlobalSemaphore>> create_b
waiter_program,
"tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp",
waiter_core,
tt_metal::EthernetConfig{
EthernetConfig{
.noc = NOC::RISCV_0_default,
.processor = DataMovementProcessor::RISCV_0});
std::array<uint32_t, 7> waiter_rt_args = {global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y, tensix_waiter_core_physical.x, tensix_waiter_core_physical.y, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE};
Expand Down Expand Up @@ -197,7 +198,9 @@ TEST_F(CommandQueueSingleCardFixture, TestSubDeviceAllocations) {
}

auto buffer_2 = CreateBuffer(interleaved_config);

EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception);
EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception);
EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception);
DeallocateBuffer(*buffer_1);
device->clear_loaded_sub_device_manager();
device->load_sub_device_manager(sub_device_manager_2);
Expand Down Expand Up @@ -427,4 +430,156 @@ TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceBasicEthPrograms) {
}
}

TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceProgramsReconfigureSubDevices) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::array{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});
SubDevice sub_device_3(std::array{CoreRangeSet(std::array{CoreRange({2, 4}, {3, 4}), CoreRange({5, 1}, {6, 3})})});
uint32_t num_iters = 5;
for (Device *device : devices_) {
if (!does_device_have_active_eth_cores(device)) {
GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores";
}
auto eth_core = *device->get_active_ethernet_cores(true).begin();
SubDevice sub_device_4(std::array{CoreRangeSet(std::array{CoreRange({2, 1}, {2, 2}), CoreRange({1, 5}, {5, 5})}), CoreRangeSet(CoreRange(eth_core, eth_core))});

auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200);
auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_3, sub_device_4}, 3200);

device->load_sub_device_manager(sub_device_manager_1);

auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2);

// Compile the programs
EnqueueProgram(device->command_queue(), waiter_program_1, false);
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);
Synchronize(device);

// Capture the trace
auto tid_1 = BeginTraceCapture(device, device->command_queue().id());
EnqueueProgram(device->command_queue(), waiter_program_1, false);
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);
EndTraceCapture(device, device->command_queue().id(), tid_1);

auto tid_2 = BeginTraceCapture(device, device->command_queue().id());
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);
EndTraceCapture(device, device->command_queue().id(), tid_2);

device->load_sub_device_manager(sub_device_manager_2);

auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_eth_sync_program(device, sub_device_3, sub_device_4);

// Compile the programs
EnqueueProgram(device->command_queue(), waiter_program_2, false);
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);
Synchronize(device);

// Capture the trace
auto tid_3 = BeginTraceCapture(device, device->command_queue().id());
EnqueueProgram(device->command_queue(), waiter_program_2, false);
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);
EndTraceCapture(device, device->command_queue().id(), tid_3);

auto tid_4 = BeginTraceCapture(device, device->command_queue().id());
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);
EndTraceCapture(device, device->command_queue().id(), tid_4);

for (uint32_t i = 0; i < num_iters; i++) {
device->load_sub_device_manager(sub_device_manager_1);
// Regular program execution
EnqueueProgram(device->command_queue(), waiter_program_1, false);
// Test blocking on one sub-device
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);

// Full trace execution
ReplayTrace(device, device->command_queue().id(), tid_1, false);

// Partial trace execution
EnqueueProgram(device->command_queue(), waiter_program_1, false);
ReplayTrace(device, device->command_queue().id(), tid_2, false);

device->load_sub_device_manager(sub_device_manager_2);
// Regular program execution
EnqueueProgram(device->command_queue(), waiter_program_2, false);
// Test blocking on one sub-device
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);

// Full trace execution
ReplayTrace(device, device->command_queue().id(), tid_3, false);

// Partial trace execution
EnqueueProgram(device->command_queue(), waiter_program_2, false);
ReplayTrace(device, device->command_queue().id(), tid_4, false);
}
Synchronize(device);
}
}

TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceIllegalOperations) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});

// Assert no idle eth cores specified
EXPECT_THROW(SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3})), CoreRangeSet(CoreRange({4, 4}, {4, 4})), CoreRangeSet(CoreRange({5, 5}, {5, 5}))}), std::exception);
for (Device *device : devices_) {
auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200);
auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_2, sub_device_1}, 3200);
device->load_sub_device_manager(sub_device_manager_1);

auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2);

// Compile the programs
EnqueueProgram(device->command_queue(), waiter_program_1, false);
// Test blocking on one sub-device
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);
Synchronize(device);

// Capture the trace
auto tid_1 = BeginTraceCapture(device, device->command_queue().id());
// Can not load a sub-device manager while tracing
EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception);
EnqueueProgram(device->command_queue(), waiter_program_1, false);
EnqueueProgram(device->command_queue(), syncer_program_1, false);
EnqueueProgram(device->command_queue(), incrementer_program_1, false);
EndTraceCapture(device, device->command_queue().id(), tid_1);

device->load_sub_device_manager(sub_device_manager_2);
auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_sync_program(device, sub_device_2, sub_device_1);

EnqueueProgram(device->command_queue(), waiter_program_2, false);
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);
Synchronize(device);

auto tid_2 = BeginTraceCapture(device, device->command_queue().id());
EnqueueProgram(device->command_queue(), waiter_program_2, false);
EnqueueProgram(device->command_queue(), syncer_program_2, false);
EnqueueProgram(device->command_queue(), incrementer_program_2, false);
EndTraceCapture(device, device->command_queue().id(), tid_2);

// Regular program execution
// Can not run a program on a different sub-device manager
EXPECT_THROW(EnqueueProgram(device->command_queue(), waiter_program_1, false), std::exception);

// Full trace execution
ReplayTrace(device, device->command_queue().id(), tid_2, false);

// Can not replay a trace on a different sub-device manager
EXPECT_THROW(ReplayTrace(device, device->command_queue().id(), tid_1, false), std::exception);

Synchronize(device);

device->remove_sub_device_manager(sub_device_manager_1);
EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception);
}
}

} // namespace basic_tests
Loading

0 comments on commit 14a6f6f

Please sign in to comment.