diff --git a/docs/source/tt_metal/examples/dram_loopback.rst b/docs/source/tt_metal/examples/dram_loopback.rst
index 050aa91b89b..8b2e1de9d23 100644
--- a/docs/source/tt_metal/examples/dram_loopback.rst
+++ b/docs/source/tt_metal/examples/dram_loopback.rst
@@ -40,7 +40,7 @@ Program pre-compilation setup
 
 .. code-block:: cpp
 
-    CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
     Program program = CreateProgram();
 
 We first obtain the global ``CommandQueue`` in order to use the fast dispatch
diff --git a/docs/source/tt_metal/examples/matmul_single_core.rst b/docs/source/tt_metal/examples/matmul_single_core.rst
index 169904b0c49..f4a237ed76f 100644
--- a/docs/source/tt_metal/examples/matmul_single_core.rst
+++ b/docs/source/tt_metal/examples/matmul_single_core.rst
@@ -89,7 +89,7 @@ core at (0, 0).
 
 .. code-block:: cpp
 
-    CommandQueue& cq = *detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
     Program program{};
     CoreRange core = {.start={0, 0}, .end={0, 0}};
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
index af0b05bc969..5fbd88f2a10 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
@@ -1197,8 +1197,8 @@ int main(int argc, char** argv) {
       // took from run_operation.cpp
       auto start = std::chrono::high_resolution_clock::now();
-      EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-      Finish(*::detail::GLOBAL_CQ);
+      EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+      Finish(::detail::GetCommandQueue(device));
       auto end = std::chrono::high_resolution_clock::now();
       duration = end - start;
       tt_metal::DumpDeviceProfileResults(device, program);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
index 5a4e3c3b8f6..e260a89384e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
@@ -381,8 +381,8 @@ int main(int argc, char **argv) {
     std::chrono::duration duration;
     // took from run_operation.cpp
     auto start = std::chrono::high_resolution_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::high_resolution_clock::now();
     duration = end - start;
     tt_metal::DumpDeviceProfileResults(device, program);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
index 2a4490f1e6a..089cda732d3 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
@@ -272,8 +272,8 @@ int main(int argc, char **argv) {
   log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
   auto begin = std::chrono::steady_clock::now();
-  EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-  Finish(*::detail::GLOBAL_CQ);
+  EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+  Finish(::detail::GetCommandQueue(device));
   auto end = std::chrono::steady_clock::now();
   auto elapsed_us = duration_cast(end -
begin).count(); auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) / diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp index 8e65d913e35..e1863da4681 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp @@ -225,8 +225,8 @@ int main(int argc, char **argv) { log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c); auto begin = std::chrono::steady_clock::now(); - EnqueueProgram(*::detail::GLOBAL_CQ, program, false); - Finish(*::detail::GLOBAL_CQ); + EnqueueProgram(::detail::GetCommandQueue(device), program, false); + Finish(::detail::GetCommandQueue(device)); auto end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(end - begin).count(); auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) / diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp index 6140296dae4..d989c7a0dad 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp @@ -57,7 +57,7 @@ int main(int argc, char** argv) { // Device Setup int device_id = 0; tt_metal::Device* device = tt_metal::CreateDevice(device_id); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); // Application Setup uint32_t single_tile_size = 2 * 1024; diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 66823712e30..b3845862083 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -37,7 +37,7 @@ int main(int argc, char** argv) { tt_metal::Device* device = tt_metal::CreateDevice(device_id); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = detail::GetCommandQueue(device); Program programs[] = {tt_metal::CreateProgram(), tt_metal::CreateProgram(), tt_metal::CreateProgram()}; diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 984c24dfbbe..07bacfdef5f 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -397,7 +397,7 @@ int main(int argc, char **argv) { per_core_N); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); //////////////////////////////////////////////////////////////////////////// // Execute Application diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp index 291cf516486..5162ce67809 100644 --- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp @@ -87,7 +87,7 @@ void test_enqueue_program(std::function out_vec; { - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); // Enqueue program inputs Buffer buf(device, NUM_TILES * 2048, 2048, BufferType::DRAM); diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp 
b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp index 5458fe3a3ab..3d9c6cda996 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp @@ -340,3 +340,22 @@ TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) { } } } + +// For a given collection of MMIO device and remote devices, ensure that channels are unique +TEST_F(DeviceFixture, TestDeviceToHostMemChannelAssignment) { + std::unordered_map> mmio_device_to_device_group; + for (unsigned int dev_id = 0; dev_id < num_devices_; dev_id++) { + chip_id_t assoc_mmio_dev_id = tt::Cluster::instance().get_associated_mmio_device(dev_id); + std::set &device_ids = mmio_device_to_device_group[assoc_mmio_dev_id]; + device_ids.insert(dev_id); + } + + for (const auto& [mmio_dev_id, device_group] : mmio_device_to_device_group) { + EXPECT_EQ(tt::Cluster::instance().get_assigned_channel_for_device(mmio_dev_id), 0); + std::unordered_set channels; + for (const chip_id_t &device_id : device_group) { + channels.insert(tt::Cluster::instance().get_assigned_channel_for_device(device_id)); + } + EXPECT_EQ(channels.size(), device_group.size()); + } +} diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp index a323ecf78d2..c81bfe4b51e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp @@ -135,246 +135,7 @@ void create_CBs_for_fused_matmul( auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); } } -struct SingleCoreMatmulConfig { - bool activations_rm = false; - bool outputs_rm = false; - size_t M = 0; - size_t K = 0; - size_t N = 0; - size_t out_subblock_h = 0; - size_t out_subblock_w = 0; - size_t in0_block_w = 0; - size_t input0_dram_channel = 0; - size_t input1_dram_channel = 0; - size_t output_dram_channel = 0; - CoreCoord core = {}; -}; - -bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig& cfg) { - // Since running in slow dispatch mode - tt::tt_metal::detail::GLOBAL_CQ.reset(); - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::CreateProgram(); - - uint32_t single_tile_size = 2 * 1024; - TT_FATAL( - cfg.M * cfg.in0_block_w * single_tile_size * 2 <= 150 * 1024, "input0 block must fit within 150kB of L1"); - TT_FATAL( - cfg.N * cfg.in0_block_w * single_tile_size * 2 <= 100 * 1024, "input1 block must fit within 100kB of L1"); - TT_FATAL(cfg.M * cfg.N * single_tile_size <= 600 * 1024, "output block must fit within 600kB of L1"); - uint32_t dram_buffer_size_input0 = - single_tile_size * cfg.M * cfg.K; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - uint32_t dram_buffer_size_input1 = - single_tile_size * cfg.K * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - uint32_t dram_buffer_size_output = - single_tile_size * cfg.M * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - - auto input0_dram_buffer = CreateBuffer( - device, - dram_buffer_size_input0, - dram_buffer_size_input0, - tt_metal::BufferType::DRAM); - uint32_t input0_dram_byte_address = input0_dram_buffer.address(); - auto input1_dram_buffer = CreateBuffer( - device, - 
dram_buffer_size_input1, - dram_buffer_size_input1, - tt_metal::BufferType::DRAM); - uint32_t input1_dram_byte_address = input1_dram_buffer.address(); - auto output_dram_buffer = CreateBuffer( - device, - dram_buffer_size_output, - dram_buffer_size_output, - tt_metal::BufferType::DRAM); - uint32_t output_dram_byte_address = output_dram_buffer.address(); - - auto input0_dram_noc_xy = input0_dram_buffer.noc_coordinates(); - auto input1_dram_noc_xy = input1_dram_buffer.noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer.noc_coordinates(); - - std::vector reader_rt_args{ - (std::uint32_t)input0_dram_byte_address, - (std::uint32_t)input0_dram_noc_xy.x, - (std::uint32_t)input0_dram_noc_xy.y, - (std::uint32_t)input1_dram_byte_address, - (std::uint32_t)input1_dram_noc_xy.x, - (std::uint32_t)input1_dram_noc_xy.y, - (std::uint32_t)(cfg.K / cfg.in0_block_w), // num_blocks - (std::uint32_t)(cfg.M * cfg.in0_block_w), // input 0 block num tiles - (std::uint32_t)(cfg.N * cfg.in0_block_w), // input 1 block num tiles - (std::uint32_t)(cfg.M * cfg.in0_block_w * single_tile_size), // input 0 block bytes - (std::uint32_t)(cfg.N * cfg.in0_block_w * single_tile_size)}; // input 1 block bytes - std::vector writer_rt_args; - string writer_kernel_name; - if (cfg.outputs_rm) { - writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - (std::uint32_t)output_dram_byte_address, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - uint(cfg.M * cfg.N)}; - } else { - writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; - writer_rt_args = { - (std::uint32_t)output_dram_byte_address, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - (std::uint32_t)cfg.out_subblock_h, // num tiles per sub block m - (std::uint32_t)cfg.out_subblock_w, // num tiles per sub block n - (std::uint32_t)(cfg.M / cfg.out_subblock_h), // num sub blocks m - (std::uint32_t)(cfg.N / cfg.out_subblock_w), // num sub blocks n - (std::uint32_t)( - cfg.out_subblock_w * single_tile_size * - (cfg.N / cfg.out_subblock_w)), // bytes offset to next row within sub-block - (std::uint32_t)( - cfg.out_subblock_h * cfg.out_subblock_w * single_tile_size * - (cfg.N / cfg.out_subblock_w)), // bytes offset to next row of sub-blocks - (std::uint32_t)(cfg.out_subblock_w * single_tile_size)}; // bytes offset to next sub-block - } - auto writer_kernel = tt_metal::CreateKernel( - program, - writer_kernel_name, - cfg.core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); - - auto reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", - cfg.core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); - - int num_blocks = (cfg.K / cfg.in0_block_w); - int in0_num_subblocks = (cfg.M / cfg.out_subblock_h); - int in0_block_num_tiles = cfg.out_subblock_h * cfg.in0_block_w * in0_num_subblocks; - int in0_subblock_num_tiles = cfg.out_subblock_h * cfg.in0_block_w; - int in1_num_subblocks = (cfg.N / cfg.out_subblock_w); - int in1_block_num_tiles = cfg.out_subblock_w * cfg.in0_block_w * in1_num_subblocks; - int in1_per_core_w = cfg.out_subblock_w * in1_num_subblocks; - int out_subblock_num_tiles = cfg.out_subblock_h * cfg.out_subblock_w; - int in0_subblock_h = (in0_block_num_tiles / in0_num_subblocks) / cfg.in0_block_w; - - 
create_CBs_for_fused_matmul( - program, - device, - cfg.core, - cfg.activations_rm, - cfg.outputs_rm, - cfg.M, - cfg.N, - cfg.in0_block_w, - cfg.out_subblock_h); - - TT_FATAL( - in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles, - "in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles"); - TT_FATAL(cfg.in0_block_w == cfg.K, "Must match k tiles"); - - vector compute_kernel_args = { - uint(cfg.in0_block_w), - uint(in0_num_subblocks), - uint(in0_block_num_tiles), - uint(in0_subblock_num_tiles), - uint(in0_subblock_h), - - uint(in1_num_subblocks), - uint(in1_block_num_tiles), - uint(in1_per_core_w), - - uint(num_blocks), - - uint(cfg.out_subblock_h), - uint(cfg.out_subblock_w), - uint(out_subblock_num_tiles), - - uint(cfg.activations_rm), - uint(cfg.outputs_rm)}; - - auto matmul_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp", - cfg.core, - tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); - - //////////////////////////////////////////////////////////////////////////// - // Stimulus Generation - //////////////////////////////////////////////////////////////////////////// - std::vector packed_identity = {}; - std::vector packed_activation = {}; - auto activation = generate_uniform_random_vector( - 1.0f, - 1.0f, - dram_buffer_size_input0 / bfloat16::SIZEOF, - std::chrono::system_clock::now().time_since_epoch().count()); - if (cfg.activations_rm) { - packed_activation = pack_vector(activation); - } else { - auto activations_tilized = tilize(activation, cfg.M * 32, cfg.K * 32); - auto activations_tile_layout = convert_to_tile_layout(activations_tilized); - packed_activation = pack_vector(activations_tile_layout); - } - auto identity = - generate_strided_vector(0.0f, 1.0f, cfg.N * 32 + 1, 0, dram_buffer_size_input1 / bfloat16::SIZEOF); - auto identity_tilized = tilize(identity, cfg.K * 32, cfg.N * 32); - auto identity_tile_layout = convert_to_tile_layout(identity_tilized); - packed_identity = pack_vector(identity_tile_layout); - //////////////////////////////////////////////////////////////////////////// - // Golden Generation - //////////////////////////////////////////////////////////////////////////// - auto packed_golden = packed_activation; // - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt_metal::detail::WriteToBuffer(input0_dram_buffer, packed_activation); - tt_metal::detail::WriteToBuffer(input1_dram_buffer, packed_identity); - std::vector input0_dram_readback_packed; - tt_metal::detail::ReadFromBuffer(input0_dram_buffer, input0_dram_readback_packed); - EXPECT_TRUE(input0_dram_readback_packed == packed_activation); - print_vector_fixed_numel_per_row(unpack_vector(input0_dram_readback_packed), 32); - std::vector input1_dram_readback_packed; - tt_metal::detail::ReadFromBuffer(input1_dram_buffer, input1_dram_readback_packed); - EXPECT_TRUE(input1_dram_readback_packed == packed_identity); - print_vector_fixed_numel_per_row(unpack_vector(input1_dram_readback_packed), 32); - - - tt_metal::SetRuntimeArgs(program, reader_kernel, cfg.core, reader_rt_args); - tt_metal::SetRuntimeArgs(program, writer_kernel, cfg.core, writer_rt_args); - - tt_metal::detail::LaunchProgram(device, program); - - //////////////////////////////////////////////////////////////////////////// - // Comparison Checking - 
//////////////////////////////////////////////////////////////////////////// - std::vector input0_l1_readback_packed; - tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 120 * 1024, 2 * single_tile_size, input0_l1_readback_packed); - EXPECT_TRUE(input0_l1_readback_packed == packed_activation); - std::vector input1_l1_readback_packed; - tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 250 * 1024, 2 * single_tile_size, input1_l1_readback_packed); - EXPECT_TRUE(input1_l1_readback_packed == packed_identity); - - // std::vector input1_l1_readback_packed; - // std::vector output_l1_readback_packed; - std::vector dest_buffer_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - auto dest_buffer_data_unpacked = unpack_vector(dest_buffer_data); - if (not cfg.outputs_rm) { - dest_buffer_data_unpacked = convert_to_flat_layout(dest_buffer_data_unpacked); - dest_buffer_data_unpacked = untilize(dest_buffer_data_unpacked, cfg.M * 32, cfg.N * 32); - } - pass &= - is_close_vectors(activation, dest_buffer_data_unpacked, [&](const bfloat16& a, const bfloat16& b) { - return is_close(a, b, 0.15f); - }); - - return pass; -} bool single_tile_matmul(tt_metal::Device* device) { bool pass = true; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp new file mode 100644 index 00000000000..b816d972b2a --- /dev/null +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "command_queue_fixture.hpp" +#include "command_queue_test_utils.hpp" +#include "gtest/gtest.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt::tt_metal; + +namespace host_tests { + +TEST_F(MultiCommandQueueFixture, TestAccessCommandQueue) { + for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { + EXPECT_NO_THROW(detail::GetCommandQueue(devices_[device_id])); + } +} + +TEST_F(BasicFastDispatchFixture, TestCannotAccessCommandQueueForClosedDevice) { + const unsigned int device_id = 0; + Device* device = CreateDevice(device_id); + EXPECT_NO_THROW(detail::GetCommandQueue(device)); + CloseDevice(device); + EXPECT_ANY_THROW(detail::GetCommandQueue(device)); +} + +TEST_F(MultiCommandQueueFixture, TestDirectedLoopbackToUniqueHugepage) { + std::unordered_map> golden_data; + + const uint32_t byte_size = 2048 * 16; + const uint64_t address = 0; + + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + std::vector data = + tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); + + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().write_sysmem(data.data(), data.size() * sizeof(uint32_t), address, mmio_device_id, channel); + + golden_data[device_id] = data; + } + + std::vector readback_data; + readback_data.resize(byte_size / sizeof(uint32_t)); + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = 
tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().read_sysmem(&readback_data, byte_size, address, mmio_device_id, channel); + EXPECT_EQ(readback_data, golden_data.at(device_id)); + } +} + +} // namespace host_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index a597efb0353..24e27725e9d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -299,8 +299,8 @@ TEST_F(CommandQueueFixture, TestArbiterDoesNotHang) { auto dummy_reader_kernel = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - EnqueueProgram(*::detail::GLOBAL_CQ, program, false); - Finish(*::detail::GLOBAL_CQ); + EnqueueProgram(::detail::GetCommandQueue(device_), program, false); + Finish(::detail::GetCommandQueue(device_)); } } @@ -315,7 +315,7 @@ TEST_F(CommandQueueFixture, TestSingleCbConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config} }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { @@ -330,7 +330,7 @@ TEST_F(CommandQueueFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config_0, cb_config_1, cb_config_2, cb_config_3}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { @@ -345,7 +345,7 @@ TEST_F(CommandQueueFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config_0, cb_config_1, cb_config_2, cb_config_3}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { @@ -371,9 +371,9 @@ TEST_F(CommandQueueFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { auto cb = CreateCircularBuffer(program, cr_set, cb_config); local_test_functions::initialize_dummy_kernels(program, cr_set); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); vector cb_config_vector; uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); @@ -407,7 +407,7 @@ TEST_F(CommandQueueFixture, 
TestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestSingleSemaphoreConfigCorrectlySentSingleCore) { @@ -416,7 +416,7 @@ TEST_F(CommandQueueFixture, TestSingleSemaphoreConfigCorrectlySentSingleCore) { DummyProgramConfig config = {.cr_set = cr_set, .num_sems = 1}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { @@ -430,8 +430,8 @@ TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode program, "tt_metal/kernels/dataflow/blank.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); } TEST_F(CommandQueueFixture, ComputeRuntimeArgs) { @@ -450,8 +450,8 @@ TEST_F(CommandQueueFixture, ComputeRuntimeArgs) { std::vector initial_runtime_args = {101, 202}; SetRuntimeArgs(program, 0, cr_set, initial_runtime_args); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); std::vector increments = {87, 216}; std::vector written_args; @@ -469,7 +469,7 @@ TEST_F(CommandQueueFixture, TestRuntimeArgsCorrectlySentSingleCore) { CoreRangeSet cr_set({cr}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; - local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, dummy_program_config); + local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), dummy_program_config); } } // end namespace single_core_tests @@ -490,7 +490,7 @@ TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentMultiCore) { DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { @@ -507,7 +507,7 @@ TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + 
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } @@ -526,7 +526,7 @@ TEST_F(CommandQueueFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllSemConfigsCorrectlySentMultiCore) { @@ -537,7 +537,7 @@ TEST_F(CommandQueueFixture, TestAllSemConfigsCorrectlySentMultiCore) { DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { @@ -547,7 +547,7 @@ TEST_F(CommandQueueFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { CoreRangeSet cr_set({cr}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, dummy_program_config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), dummy_program_config)); } } // end namespace multicore_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 4e164fa602d..4dc3a4f4645 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -154,7 +154,7 @@ namespace dram_tests { TEST_F(CommandQueueFixture, WriteOneTileToDramBank0) { BufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllDramBanks) { @@ -163,7 +163,7 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllDramBanks) { .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { @@ -173,7 +173,7 @@ TEST_F(CommandQueueFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { .page_size = 2048, .buftype = BufferType::DRAM}; - 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, Sending131072Pages) { @@ -184,26 +184,26 @@ TEST_F(CommandQueueFixture, Sending131072Pages) { .page_size = 128, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } // TEST_F(CommandQueueFixture, FusedWriteDramBuffersInWhichRemainderBurstSizeDoesNotFitInLocalL1) { // BufferConfig config = {.num_pages = 4096, .page_size = 22016, .buftype = BufferType::DRAM}; -// EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); +// EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); // } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForDram) { BufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForDram2) { // From stable diffusion read buffer BufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { @@ -213,14 +213,14 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { // Should throw a host error due to the page size not fitting in the consumer CB BufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { BufferConfig buf_config = {.num_pages = 524270, .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, buf_config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), buf_config)); } } // end namespace dram_tests @@ -230,7 +230,7 @@ namespace l1_tests { TEST_F(CommandQueueFixture, WriteOneTileToL1Bank0) { BufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllL1Banks) { @@ -240,7 +240,7 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllL1Banks) { .page_size = 2048, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { @@ -250,13 +250,13 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { .page_size = 2048, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForL1) { BufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestBackToBackNon32BAlignedPageSize) { @@ -264,17 +264,17 @@ TEST_F(CommandQueueFixture, TestBackToBackNon32BAlignedPageSize) { Buffer bufa(device_, 125000, 100, buff_type); auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufa, src_a, false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufa, src_a, false); Buffer bufb(device_, 152000, 152, buff_type); auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufb, src_b, false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufb, src_b, false); vector result_a; - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufa, result_a, true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufa, result_a, true); vector result_b; - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufb, result_b, true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufb, result_b, true); EXPECT_EQ(src_a, result_a); EXPECT_EQ(src_b, result_b); @@ -289,7 +289,7 @@ TEST_F(CommandQueueFixture, WritesToRandomBufferTypeAndThenReads) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; EXPECT_TRUE( - local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(this->device_), config)); } TEST_F(CommandQueueFixture, StressWrapTest) { @@ -303,7 +303,7 @@ TEST_F(CommandQueueFixture, StressWrapTest) { BufferStressTestConfig config = { .page_size 
= 4096, .max_num_pages_per_buffer = 2000, .num_iterations = 10000, .num_unique_vectors = 20}; EXPECT_TRUE( - local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(this->device_, tt::tt_metal::detail::GetCommandQueue(this->device_), config)); } } // end namespace stress_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp index bdbc38b0308..0632d1a90ac 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp @@ -85,3 +85,47 @@ class CommandQueueWithDPrintFixture: public CommandQueueFixture { // device setup, we need to skip device teardown if the test is skipped. bool test_skipped = false; }; + +class BasicFastDispatchFixture : public ::testing::Test { + protected: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + } +}; + +class MultiCommandQueueFixture : public ::testing::Test { + protected: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + + num_devices_ = tt::tt_metal::Device::detect_num_available_devices(); + + if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 1) { + GTEST_SKIP(); + } + + for (unsigned int id = 0; id < num_devices_; id++) { + auto* device = tt::tt_metal::CreateDevice(id); + devices_.push_back(device); + } + } + + void TearDown() override { + for (unsigned int id = 0; id < devices_.size(); id++) { + tt::tt_metal::CloseDevice(devices_.at(id)); + } + } + + std::vector devices_; + tt::ARCH arch_; + size_t num_devices_; +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index caff8d0cd7a..2e7b76ea11d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -217,7 +217,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c } std::vector dest_buffer_data; - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); EnqueueWriteBuffer(cq, input_dram_buffer, packed_input, false); EnqueueProgram(cq, program, false); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp index b8b93462eb1..1f838e5b91c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp @@ -23,7 +23,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintMuting) { Device *device = this->device_; // Set up program and command queue - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; 
+ CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // This tests prints only on a single core diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp index cedcd513143..a96e13ff924 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp @@ -117,7 +117,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintFromAllHarts) { // Set up program and command queue constexpr CoreCoord core = {0, 0}; // Print on first core only - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // Create a CB for testing TSLICE, dimensions are 32x32 bfloat16s diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp index 015b9a3168c..09a873cbeba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp @@ -221,7 +221,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintRaiseWait) { Device *device = this->device_; // Set up program and command queue - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // Test runs on a 5x5 grid diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp index 6dbbf5f0882..efe09e11448 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp @@ -30,7 +30,7 @@ struct PipelineRowConfig { }; void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConfig& test_config) { - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); tt_metal::Program program = tt_metal::CreateProgram(); diff --git a/tt_eager/tensor/tensor_impl.hpp b/tt_eager/tensor/tensor_impl.hpp index 0782e653399..0588a791861 100644 --- a/tt_eager/tensor/tensor_impl.hpp +++ b/tt_eager/tensor/tensor_impl.hpp @@ -389,7 +389,7 @@ std::vector read_data_from_device(const Tensor &tensor, uint32_t size_in_byte if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { std::vector device_data; device_data.resize(size_in_bytes / sizeof(T)); - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, *device_buffer, device_data.data(), true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(tensor.device()), *device_buffer, device_data.data(), true); return device_data; } else { std::vector device_data; @@ -406,7 +406,7 @@ inline void write_data_to_device_buffer(const BufferType& data_to_write, Devi const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, *buffer, std::begin(data_to_write), false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(buffer->device()), *buffer, std::begin(data_to_write), false); } else { auto uint32_data = pack_vec_into_uint32_vec(data_to_write); 
::detail::WriteToBuffer(*buffer, uint32_data);
diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp
index a264ab8ff15..bf6c0b30b25 100644
--- a/tt_eager/tt_dnn/op_library/run_operation.cpp
+++ b/tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -212,11 +212,11 @@ std::vector run_device_operation(
     if (USE_FAST_DISPATCH) {
 #ifndef TTNN_ENABLE_LOGGING
-        EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
+        EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device), program, false);
 #else
         const auto start{std::chrono::steady_clock::now()};
-        EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
-        Finish(*tt::tt_metal::detail::GLOBAL_CQ);
+        EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device), program, false);
+        Finish(tt::tt_metal::detail::GetCommandQueue(device));
         const auto end{std::chrono::steady_clock::now()};
         const std::chrono::duration<double> elapsed_seconds{end - start};
         tt::log_info(
diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index ed607d34e7b..8b9b371a837 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -13,6 +13,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/detail/program.hpp"
 #include "tt_metal/llrt/watcher.hpp"
+#include "tt_metal/host_api.hpp"
 
 using std::unique_lock;
 using std::mutex;
@@ -20,9 +21,6 @@ namespace tt::tt_metal{
 
     namespace detail {
-        // To be removed at a later time, but need a global
-        // command queue for the time being.
-        inline unique_ptr<CommandQueue> GLOBAL_CQ;
 
         inline static bool DispatchStateCheck( bool isFastDispatch){
             static bool fd = isFastDispatch;
@@ -258,10 +256,24 @@ namespace tt::tt_metal{
             return true;
         }
 
-        inline void Synchronize()
+        inline CommandQueue &GetCommandQueue(Device *device)
         {
-            if (detail::GLOBAL_CQ) {
-                Finish(*detail::GLOBAL_CQ);
+            detail::DispatchStateCheck(true);
+            // For now there is only one SW CommandQueue per device
+            static std::vector<std::unique_ptr<CommandQueue>> command_queues( Device::detect_num_available_devices() );
+            chip_id_t id = device->id();
+            TT_FATAL(id < command_queues.size(), "Invalid device {} detected", id);
+            TT_FATAL(device->is_initialized(), "Cannot access command queue for closed device {}", id);
+            if (not command_queues[id] or (command_queues[id] and command_queues[id]->device != device)) {
+                command_queues[device->id()] = std::make_unique<CommandQueue>(device);
+            }
+            return *(command_queues[id]);
+        }
+
+        inline void Synchronize(Device *device)
+        {
+            if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+                Finish(GetCommandQueue(device));
             }
         }
 
@@ -270,10 +282,10 @@ namespace tt::tt_metal{
             device->deallocate_buffers();
         }
 
-        inline void ClearCommandQueueProgramCache()
+        inline void ClearCommandQueueProgramCache(Device *device)
         {
-            if (detail::GLOBAL_CQ) {
-                ClearProgramCache(*detail::GLOBAL_CQ);
+            if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+                ClearProgramCache(GetCommandQueue(device));
             }
         }
 
@@ -408,5 +420,73 @@ namespace tt::tt_metal{
                 specified_core_spec
             );
         }
+
+        // Sending dispatch kernel.
TODO(agrebenisan): Needs a refactor + inline void SendDispatchKernelToDevice(Device *device) { + ZoneScoped; + + Program dispatch_program = CreateProgram(); + auto dispatch_cores = device->dispatch_cores().begin(); + CoreCoord producer_logical_core = *dispatch_cores++; + CoreCoord consumer_logical_core = *dispatch_cores; + + CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core); + CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core); + + std::map producer_defines = { + {"IS_DISPATCH_KERNEL", ""}, + {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)}, + {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)}, + }; + std::map consumer_defines = { + {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)}, + {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)}, + }; + std::vector dispatch_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr()}; + tt::tt_metal::CreateKernel( + dispatch_program, + "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp", + producer_logical_core, + tt::tt_metal::DataMovementConfig { + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, + .compile_args = dispatch_compile_args, + .defines = producer_defines}); + + tt::tt_metal::CreateKernel( + dispatch_program, + "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp", + consumer_logical_core, + tt::tt_metal::DataMovementConfig { + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, + .compile_args = dispatch_compile_args, + .defines = consumer_defines}); + + tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2); + tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0); + + CompileProgram(device, dispatch_program); + ConfigureDeviceWithProgram(device, dispatch_program); + + uint32_t fifo_addr = (HOST_CQ_FINISH_PTR + 32) >> 4; + vector fifo_addr_vector = {fifo_addr}; + WriteToDeviceL1(device, producer_logical_core, CQ_READ_PTR, fifo_addr_vector); + WriteToDeviceL1(device, producer_logical_core, CQ_WRITE_PTR, fifo_addr_vector); + + tt::Cluster::instance().l1_barrier(device->id()); + + const std::tuple tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value(); + auto [tlb_offset, tlb_size] = tlb_data; + // std::cout << "CORE: " << device->worker_core_from_logical_core(*device->dispatch_cores().begin()).str() << std::endl; + // std::cout << "after sending pointers to device. 
my tlb_offset: " << tlb_offset << ", my tlb_size: " << tlb_size << std::endl;
+
+        launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
+
+        // TODO(pkeller): Should use detail::LaunchProgram once we have a mechanism to avoid running all RISCs
+        tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
+        tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
+        }
+    }
 }
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 510cb53f454..bb60b904664 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -279,7 +279,19 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {
         get_compile_outpath()
     );

+    // Mark initialized before compiling and sending dispatch kernels to device because compilation expects device to be initialized
     this->initialized_ = true;
+
+    // Create system memory writer for this device to have an associated interface to hardware command queue (i.e. hugepage)
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        this->sysmem_writer = std::make_unique<SystemMemoryWriter>(
+            this->id_,
+            this->dispatch_cores(),
+            [&, this](CoreCoord core) { return this->worker_core_from_logical_core(core); }
+        );
+        detail::SendDispatchKernelToDevice(this);
+    }
+
     return true;
 }

@@ -295,6 +307,9 @@ bool Device::close() {
     this->clear_l1_state();
     tt::Cluster::instance().l1_barrier(id_);
     allocator::clear(*this->allocator_);
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        this->sysmem_writer.reset(nullptr);
+    }

     this->active_devices_.deactivate_device(this->id_);
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 7a0ef070b80..b46a71c85e3 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -9,6 +9,7 @@
 #include "hostdevcommon/common_values.hpp"
 #include "tt_metal/impl/allocator/basic_allocator.hpp"
 #include "tt_metal/impl/allocator/l1_banking_allocator.hpp"
+#include "tt_metal/impl/dispatch/command_queue_interface.hpp"
 #include "llrt/tt_cluster.hpp"
 #include "dev_msgs.h"

@@ -20,6 +21,7 @@ namespace tt_metal {
 enum class BufferType;
 class Buffer;
 class Program;
+class CommandQueue;

 using on_close_device_callback = std::function<void ()>;

@@ -167,6 +169,11 @@ class Device {
     std::unique_ptr<Allocator> allocator_ = nullptr;
     bool initialized_ = false;

+    // SystemMemoryWriter is the interface to the hardware command queue
+    std::unique_ptr<SystemMemoryWriter> sysmem_writer;
+    // Allows access to sysmem_writer
+    friend class CommandQueue;
+
     std::set<CoreCoord> compute_cores;
     std::set<CoreCoord> storage_only_cores_;
     std::set<CoreCoord> dispatch_cores_;
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index da7d199d963..818e31bdf11 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -5,7 +5,6 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"

 #include "debug_tools.hpp"
-#include "device_data.hpp"
 #include "noc/noc_parameters.h"
 #include "tt_metal/detail/program.hpp"
 #include "tt_metal/detail/tt_metal.hpp"

@@ -654,83 +653,15 @@ void EnqueueWrapCommand::process() {

 EnqueueCommandType EnqueueWrapCommand::type() { return this->type_; }

-// Sending dispatch kernel. TODO(agrebenisan): Needs a refactor
-void send_dispatch_kernel_to_device(Device* device) {
-    ZoneScoped;
-    // Ideally, this should be some separate API easily accessible in
-    // TT-metal, don't like the fact that I'm writing this from scratch
-
-    Program dispatch_program = CreateProgram();
-    auto dispatch_cores = device->dispatch_cores().begin();
-    CoreCoord producer_logical_core = *dispatch_cores++;
-    CoreCoord consumer_logical_core = *dispatch_cores;
-
-    CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core);
-    CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core);
-
-    std::map<string, string> producer_defines = {
-        {"IS_DISPATCH_KERNEL", ""},
-        {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)},
-        {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)},
-    };
-    std::map<string, string> consumer_defines = {
-        {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)},
-        {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)},
-    };
-    std::vector<uint32_t> dispatch_compile_args = {DEVICE_DATA.TENSIX_SOFT_RESET_ADDR};
-    tt::tt_metal::CreateKernel(
-        dispatch_program,
-        "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp",
-        producer_logical_core,
-        tt::tt_metal::DataMovementConfig {
-            .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-            .noc = tt::tt_metal::NOC::RISCV_0_default,
-            .compile_args = dispatch_compile_args,
-            .defines = producer_defines});
-
-    tt::tt_metal::CreateKernel(
-        dispatch_program,
-        "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp",
-        consumer_logical_core,
-        tt::tt_metal::DataMovementConfig {
-            .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-            .noc = tt::tt_metal::NOC::RISCV_0_default,
-            .compile_args = dispatch_compile_args,
-            .defines = consumer_defines});
-
-    tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2);
-    tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0);
-
-    detail::CompileProgram(device, dispatch_program);
-    tt::tt_metal::detail::ConfigureDeviceWithProgram(device, dispatch_program);
-
-    uint32_t fifo_addr = (HOST_CQ_FINISH_PTR + 32) >> 4;
-    vector<uint32_t> fifo_addr_vector = {fifo_addr};
-    tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_READ_PTR, fifo_addr_vector);
-    tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_WRITE_PTR, fifo_addr_vector);
-
-    tt::Cluster::instance().l1_barrier(device->id());
-
-    const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value();
-    auto [tlb_offset, tlb_size] = tlb_data;
-    // std::cout << "CORE: " << device->worker_core_from_logical_core(*device->dispatch_cores().begin()).str() << std::endl;
-    // std::cout << "after sending pointers to device. my tlb_offset: " << tlb_offset << ", my tlb_size: " << tlb_size << std::endl;
-
-    launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
-
-    // TODO(pkeller): Should use detail::LaunchProgram once we have a mechanism to avoid running all RISCs
-    tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
-    tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
-}
-
 // CommandQueue section
-CommandQueue::CommandQueue(Device* device): sysmem_writer(device) {
+CommandQueue::CommandQueue(Device* device) {
     vector<uint32_t> pointers(CQ_START / sizeof(uint32_t), 0);
     pointers[0] = CQ_START >> 4;

-    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, 0);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
+    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);

-    send_dispatch_kernel_to_device(device);
     this->device = device;
 }

@@ -751,13 +682,13 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
     ZoneScopedN("CommandQueue_read_buffer");
     TT_FATAL(blocking, "EnqueueReadBuffer only has support for blocking mode currently");

     uint32_t read_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + buffer.size();
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + read_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + read_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(read_buffer_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START, "EnqueueReadBuffer command is too large");
         this->wrap();
     }
     tt::log_debug(tt::LogDispatch, "EnqueueReadBuffer");

-    EnqueueReadBufferCommand command(this->device, buffer, dst, this->sysmem_writer);
+    EnqueueReadBufferCommand command(this->device, buffer, dst, *this->device->sysmem_writer);

     // TODO(agrebenisan): Provide support so that we can achieve non-blocking
     // For now, make read buffer blocking since after the
@@ -770,15 +701,18 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
     uint32_t padded_page_size = align(buffer.page_size(), 32);
     uint32_t data_size_in_bytes = padded_page_size * num_pages;

+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     if ((buffer.page_size() % 32) != 0) {
         // If page size is not 32B-aligned, we cannot do a contiguous copy
         uint32_t dst_address_offset = 0;
         for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_in_bytes; sysmem_address_offset += padded_page_size) {
-            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, 0);
+            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, mmio_device_id, channel);
             dst_address_offset += buffer.page_size();
         }
     } else {
-        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, 0);
+        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, mmio_device_id, channel);
     }
 }

@@ -792,7 +726,7 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl
         "Buffer pages must fit within the command queue data section");

     uint32_t write_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + buffer.size();
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + write_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + write_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(
             write_buffer_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START,
             "EnqueueWriteBuffer command is too large: {}",
@@ -803,7 +737,7 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl

     // TODO(agrebenisan): This could just be a stack variable since we
     // are just running in one thread
-    EnqueueWriteBufferCommand command(this->device, buffer, src, this->sysmem_writer);
+    EnqueueWriteBufferCommand command(this->device, buffer, src, *this->device->sysmem_writer);

     this->enqueue_command(command, blocking);
 }

@@ -844,7 +778,7 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {
     uint32_t host_data_and_device_command_size =
         DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + (host_data_num_pages * DeviceCommand::PROGRAM_PAGE_SIZE);

-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + host_data_and_device_command_size >=
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + host_data_and_device_command_size >=
         DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(
             host_data_and_device_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START,
             "EnqueueProgram command size too large");
@@ -854,7 +788,7 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {
     EnqueueProgramCommand command(this->device,
         *this->program_to_buffer.at(program_id),
         this->program_to_dev_map.at(program_id),
-        this->sysmem_writer,
+        *this->device->sysmem_writer,
         program,
         stall);

@@ -863,30 +797,33 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {

 void CommandQueue::finish() {
     ZoneScopedN("CommandQueue_finish");
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >=
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >=
         DeviceCommand::HUGE_PAGE_SIZE) {
         this->wrap();
     }
     tt::log_debug(tt::LogDispatch, "Finish");

-    FinishCommand command(this->device, this->sysmem_writer);
+    FinishCommand command(this->device, *this->device->sysmem_writer);
     this->enqueue_command(command, false);

+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     // We then poll to check that we're done.
     uint32_t finish;
     do {
-        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
     } while (finish != 1);

     // Reset this value to 0 before moving on
     finish = 0;
-    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
 }

 void CommandQueue::wrap() {
     ZoneScopedN("CommandQueue_wrap");

     tt::log_debug(tt::LogDispatch, "EnqueueWrap");
-    EnqueueWrapCommand command(this->device, this->sysmem_writer);
+    EnqueueWrapCommand command(this->device, *this->device->sysmem_writer);
     this->enqueue_command(command, false);
 }
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp
index 9c91918fd52..ee2f15afc71 100644
--- a/tt_metal/impl/dispatch/command_queue.hpp
+++ b/tt_metal/impl/dispatch/command_queue.hpp
@@ -13,7 +13,6 @@

 #include "build_kernels_for_riscv/build_kernels_for_riscv.hpp"
-#include "tt_metal/impl/dispatch/command_queue_interface.hpp"
 #include "tt_metal/impl/dispatch/thread_safe_queue.hpp"
 #include "tt_metal/common/base.hpp"
 #include "tt_metal/common/tt_backend_api_types.hpp"

@@ -165,7 +164,10 @@ class EnqueueWrapCommand : public Command {
     EnqueueCommandType type();
 };

-void send_dispatch_kernel_to_device(Device* device);
+// Fwd declares
+namespace detail{
+    CommandQueue &GetCommandQueue(Device *device);
+}

 class CommandQueue {
    public:
@@ -175,7 +177,7 @@ class CommandQueue {
    private:
     Device* device;
-    SystemMemoryWriter sysmem_writer;
+
     // thread processing_thread;
     map<uint64_t, unique_ptr<Buffer>> program_to_buffer;

@@ -201,6 +203,7 @@ class CommandQueue {
     friend void EnqueueProgram(CommandQueue& cq, Program& program, bool blocking);
     friend void Finish(CommandQueue& cq);
     friend void ClearProgramCache(CommandQueue& cq);
+    friend CommandQueue &detail::GetCommandQueue(Device *device);
 };

 }  // namespace tt::tt_metal
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp
index 9b1fdf39271..876e272a003 100644
--- a/tt_metal/impl/dispatch/command_queue_interface.hpp
+++ b/tt_metal/impl/dispatch/command_queue_interface.hpp
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0

 #include "tt_metal/common/base.hpp"
-#include "tt_metal/impl/device/device.hpp"
 #include "tt_metal/impl/dispatch/device_command.hpp"
 #include "tt_metal/llrt/llrt.hpp"

@@ -11,7 +10,9 @@ using namespace tt::tt_metal;

 inline uint32_t get_cq_rd_ptr(chip_id_t chip_id) {
     uint32_t recv;
-    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, chip_id);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id);
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id);
+    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, mmio_device_id, channel);
     return recv;
 }

@@ -34,25 +35,30 @@ struct SystemMemoryCQWriteInterface {

 class SystemMemoryWriter {
    private:
-    Device* device;
+    chip_id_t device_id;
     // Data required for fast writes to write pointer location
     // in prefetch core's L1
     // const std::tuple tlb_data;
     const uint32_t m_dma_buf_size;
     const std::function fast_write_callable;
+    const std::set<CoreCoord> dispatch_cores;
+    const std::function<CoreCoord (CoreCoord)> worker_from_logical_callable;
     uint32_t byte_addr;
     char* hugepage_start;

    public:
     SystemMemoryCQWriteInterface cq_write_interface;
-    SystemMemoryWriter(Device* device) :
-        m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device->id())),
-        hugepage_start((char*) tt::Cluster::instance().host_dma_address(0, device->id(), 0)),
+    SystemMemoryWriter(chip_id_t device_id, const std::set<CoreCoord> &dev_dispatch_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) :
+        device_id(device_id),
+        m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
+        hugepage_start(
+            (char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
         fast_write_callable(
-            tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device->id())) {
+            tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)),
+        dispatch_cores(dev_dispatch_cores),
+        worker_from_logical_callable(worker_from_logical) {

-        this->device = device;
-        const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), this->device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value();
+        const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(*this->dispatch_cores.begin()))).value();
         auto [tlb_offset, tlb_size] = tlb_data;
         this->byte_addr = tlb_offset + CQ_WRITE_PTR % tlb_size;
     }
@@ -65,7 +71,7 @@ class SystemMemoryWriter {
         uint32_t rd_ptr;
         uint32_t rd_toggle;
         do {
-            rd_ptr_and_toggle = get_cq_rd_ptr(this->device->id());
+            rd_ptr_and_toggle = get_cq_rd_ptr(this->device_id);

             rd_ptr = rd_ptr_and_toggle & 0x7fffffff;
             rd_toggle = rd_ptr_and_toggle >> 31;
@@ -79,18 +85,19 @@ class SystemMemoryWriter {
     }

     // Ideally, data should be an array or pointer, but vector for time-being
+    // TODO ALMEET: MEASURE THIS
     void cq_write(const void* data, uint32_t size_in_bytes, uint32_t write_ptr) const {
         // There is a 50% overhead if hugepage_start is not made static.
         // Eventually when we want to have multiple hugepages, we may need to template
         // the sysmem writer to get this optimization.
-        static char* hugepage_start = this->hugepage_start;
+        /*static*/ char* hugepage_start = this->hugepage_start;
         void* user_scratchspace = hugepage_start + write_ptr;

         memcpy(user_scratchspace, data, size_in_bytes);
     }

     void send_write_ptr() const {
         static CoreCoord dispatch_core =
-            this->device->worker_core_from_logical_core(*this->device->dispatch_cores().begin());
+            this->worker_from_logical_callable(*this->dispatch_cores.begin());

         uint32_t write_ptr_and_toggle =
             this->cq_write_interface.fifo_wr_ptr | (this->cq_write_interface.fifo_wr_toggle << 31);
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 662eb5c3c53..b9e2663b2af 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -117,6 +117,8 @@ void Cluster::generate_cluster_descriptor() {

 void Cluster::initialize_device_drivers() {
     for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
+        this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);
+
         this->open_driver(mmio_device_id, controlled_devices);

         tt_device_params default_params;
@@ -128,6 +130,20 @@ void Cluster::initialize_device_drivers() {
     }
 }

+void Cluster::assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids) {
+    // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per MMIO device
+    // Metal currently assigns 1 channel per device. See https://github.com/tenstorrent-metal/tt-metal/issues/4087
+    TT_ASSERT(controlled_device_ids.size() <= 4, "Unable to assign each device to its own host memory channel!");
+    uint16_t channel = 0;
+    this->device_to_host_mem_channel_[mmio_device_id] = channel++;
+    for (const chip_id_t &device_id : controlled_device_ids) {
+        if (device_id == mmio_device_id) {
+            continue;
+        }
+        this->device_to_host_mem_channel_[device_id] = channel++;
+    }
+}
+
 void Cluster::get_metal_desc_from_tt_desc(
     const std::unordered_map<chip_id_t, tt_SocDescriptor> &input,
     const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks) {
@@ -142,9 +158,10 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
     std::unique_ptr<tt_device> device_driver;
     if (this->target_type_ == TargetDevice::Silicon) {
-        // This is the target/desired number of mem channels per arch/device. Silicon driver will attempt to open
-        // this many hugepages as channels, and assert if workload uses more than available.
-        uint32_t num_host_mem_ch_per_mmio_device = 1;
+        // This is the target/desired number of mem channels per arch/device.
+        // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available.
+        // Metal currently assigns 1 channel per device
+        uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
         std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
         dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
         // This will remove harvested rows from the soc descriptor
@@ -499,14 +516,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
     this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
 }

-void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    constexpr uint16_t channel = 0;
+void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id);
 }

-void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    // TODO: Uplift
-    constexpr uint16_t channel = 0;
+void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id);
 }

@@ -619,6 +635,10 @@ std::tuple<chip_id_t, CoreCoord> Cluster::get_connected_ethernet_core(std::tuple
         std::get<0>(connected_eth_core), soc_desc.chan_to_logical_eth_core_map.at(std::get<1>(connected_eth_core)));
 }

+uint32_t Cluster::get_tensix_soft_reset_addr() const {
+    return DEVICE_DATA.TENSIX_SOFT_RESET_ADDR;
+}
+
 }  // namespace tt

 std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram) {
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index 6f1fe463a82..ad1560fce5e 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -95,8 +95,8 @@ class Cluster {
     void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
     void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;

-    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
-    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
+    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;
+    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;

     int get_device_aiclk(const chip_id_t &chip_id) const;

@@ -127,6 +127,17 @@ class Cluster {
     // Returns connected ethernet core on the other chip
     std::tuple<chip_id_t, CoreCoord> get_connected_ethernet_core(std::tuple<chip_id_t, CoreCoord> eth_core) const;

+    // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is an MMIO device, it is returned.
+    chip_id_t get_associated_mmio_device(chip_id_t device_id) const {
+        return this->device_to_mmio_device_.at(device_id);
+    }
+
+    uint16_t get_assigned_channel_for_device(chip_id_t device_id) const {
+        return this->device_to_host_mem_channel_.at(device_id);
+    }
+
+    uint32_t get_tensix_soft_reset_addr() const;
+
    private:
     Cluster();
     ~Cluster();

@@ -134,6 +145,7 @@ class Cluster {
     void detect_arch_and_target();
     void generate_cluster_descriptor();
     void initialize_device_drivers();
+    void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids);
     void open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs = false);
     void start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const;

@@ -162,6 +174,17 @@ class Cluster {
     // Save mapping of device id to associated MMIO device id for fast lookup
     std::unordered_map<chip_id_t, chip_id_t> device_to_mmio_device_;

+    // Currently, each device is mapped to its own channel in host memory to enable fast dispatch
+    // Channels are unique within a group of devices all controlled by a particular MMIO device
+    // For example:
+    //      Two N300 cards where MMIO device IDs are 0, 1 and R chips are 2, 3
+    //      0 L controls 2 R and 1 L controls 3 R, then device_to_host_mem_channel_:
+    //          0 -> 0
+    //          2 -> 1
+    //          1 -> 0
+    //          3 -> 1
+    std::unordered_map<chip_id_t, uint16_t> device_to_host_mem_channel_;
+
     tt_device_dram_address_params dram_address_params = {
         DRAM_BARRIER_BASE
     };
diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
index 810e181e996..f60790c028e 100644
--- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
+++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
@@ -68,7 +68,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();
diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
index 44ec4de8746..5868fc5dcbf 100644
--- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
+++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();
diff --git a/tt_metal/programming_examples/loopback/loopback.cpp b/tt_metal/programming_examples/loopback/loopback.cpp
index 33de9f92008..fdfd6bf9a0a 100644
--- a/tt_metal/programming_examples/loopback/loopback.cpp
+++ b/tt_metal/programming_examples/loopback/loopback.cpp
@@ -34,7 +34,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program and command queue to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();

         constexpr CoreCoord core = {0, 0};
diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
index d6181d7c67a..9a63c9abe58 100644
--- a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
+++ b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
@@ -110,7 +110,7 @@ void matmul_multi_core(vector& a, vector& b, vector& a, vector& b, vector& a, vector& b, vect
         * Setup program to execute along with its buffers and kernels to use
         * Core range is just single core
         */
-        CommandQueue& cq = *detail::GLOBAL_CQ;
+        CommandQueue& cq = tt_metal::detail::GetCommandQueue(device);
         Program program{};

         tt::DataFormat cb_data_format = tt::DataFormat::Float16_b;
diff --git a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
index 49752b88581..c96f83d11b6 100644
--- a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
+++ b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
@@ -110,7 +110,7 @@ void matmul_single_core(vector& a, vector& b, vector &logical_c
 {
     ProfileTTMetalScope profile_this = ProfileTTMetalScope("DumpDeviceProfileResults");
     //TODO: (MO) This global is temporary need to update once the new interface is in
-    if (GLOBAL_CQ) {
-      Finish(*GLOBAL_CQ);
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+      Finish(GetCommandQueue(device));
     }
+    TT_FATAL(tt_is_print_server_running() == false, "Debug print server is running, cannot dump device profiler data");
     auto worker_cores_used_in_program =\
      device->worker_cores_from_logical_cores(logical_cores);
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index d59d3a2ea8a..d3ed04e9a2d 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -234,7 +234,6 @@ namespace detail {
         DumpDeviceProfileResults(device, program);
     }

-
     bool ConfigureDeviceWithProgram(Device *device, Program &program) {
         ZoneScoped;
         bool pass = true;
@@ -328,22 +327,14 @@ namespace detail {
         ZoneScoped;
         program.compile(device);
     }
-}
+}   // namespace detail

 Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_remap) {
     Device * dev = new Device(device_id, l1_bank_remap);
-    const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
-    if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-        detail::GLOBAL_CQ = std::make_unique<CommandQueue>(dev);
-    }
     return dev;
 }

 bool CloseDevice(Device *device) {
-    // Needed to ensure that GLOBAL_CQ doesn't contain a closed device
-    if (detail::GLOBAL_CQ) {
-        detail::GLOBAL_CQ.reset(nullptr);
-    }
     return device->close();
 }