#2934: Replace GLOBAL_CQ with one CommandQueue per device and move sy…

…smem writer (HW CQ) to be owned by Device assign each device to specific host mem channel
tenstorrent · Dec 5, 2023 · 9fc4f0e · 9fc4f0e
1 parent 0944027
commit 9fc4f0e
Show file tree

Hide file tree

Showing 42 changed files with 412 additions and 444 deletions.
diff --git a/docs/source/tt_metal/examples/dram_loopback.rst b/docs/source/tt_metal/examples/dram_loopback.rst
@@ -40,7 +40,7 @@ Program pre-compilation setup
 
 .. code-block:: cpp
 
-   CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+   CommandQueue& cq = detail::GetCommandQueue(device);
    Program program = CreateProgram();
 
 We first obtain the global ``CommandQueue`` in order to use the fast dispatch

diff --git a/docs/source/tt_metal/examples/matmul_single_core.rst b/docs/source/tt_metal/examples/matmul_single_core.rst
@@ -89,7 +89,7 @@ core at (0, 0).
 
 .. code-block:: cpp
 
-    CommandQueue& cq = *detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
     Program program{};
     CoreRange core = {.start={0, 0}, .end={0, 0}};
 

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
@@ -1197,8 +1197,8 @@ int main(int argc, char** argv) {
 
     // took from run_operation.cpp
     auto start = std::chrono::high_resolution_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::high_resolution_clock::now();
     duration = end - start;
     tt_metal::DumpDeviceProfileResults(device, program);

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
@@ -381,8 +381,8 @@ int main(int argc, char **argv) {
     std::chrono::duration<double, std::nano> duration;
     // took from run_operation.cpp
     auto start = std::chrono::high_resolution_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::high_resolution_clock::now();
     duration = end - start;
     tt_metal::DumpDeviceProfileResults(device, program);

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
@@ -272,8 +272,8 @@ int main(int argc, char **argv) {
 
     log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
     auto begin = std::chrono::steady_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::steady_clock::now();
     auto elapsed_us = duration_cast<microseconds>(end - begin).count();
     auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) /

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp
@@ -225,8 +225,8 @@ int main(int argc, char **argv) {
 
     log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
     auto begin = std::chrono::steady_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::steady_clock::now();
     auto elapsed_us = duration_cast<microseconds>(end - begin).count();
     auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) /

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp
@@ -57,7 +57,7 @@ int main(int argc, char** argv) {
     // Device Setup
     int device_id = 0;
     tt_metal::Device* device = tt_metal::CreateDevice(device_id);
-    CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+    CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);
 
     // Application Setup
     uint32_t single_tile_size = 2 * 1024;

diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp
@@ -37,7 +37,7 @@ int main(int argc, char** argv) {
     tt_metal::Device* device = tt_metal::CreateDevice(device_id);
 
 
-    CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
 
     Program programs[] = {tt_metal::CreateProgram(), tt_metal::CreateProgram(), tt_metal::CreateProgram()};
 

diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp
@@ -397,7 +397,7 @@ int main(int argc, char **argv) {
             per_core_N);
 
 
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Execute Application

diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
@@ -87,7 +87,7 @@ void test_enqueue_program(std::function<tt_metal::Program(tt_metal::Device *devi
 
     vector<uint32_t> out_vec;
     {
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);
 
         // Enqueue program inputs
         Buffer buf(device, NUM_TILES * 2048, 2048, BufferType::DRAM);

diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
@@ -340,3 +340,22 @@ TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) {
         }
     }
 }
+
+// For a given collection of MMIO device and remote devices, ensure that channels are unique
+TEST_F(DeviceFixture, TestDeviceToHostMemChannelAssignment) {
+    std::unordered_map<chip_id_t, std::set<chip_id_t>> mmio_device_to_device_group;
+    for (unsigned int dev_id = 0; dev_id < num_devices_; dev_id++) {
+        chip_id_t assoc_mmio_dev_id = tt::Cluster::instance().get_associated_mmio_device(dev_id);
+        std::set<chip_id_t> &device_ids = mmio_device_to_device_group[assoc_mmio_dev_id];
+        device_ids.insert(dev_id);
+    }
+
+    for (const auto& [mmio_dev_id, device_group] : mmio_device_to_device_group) {
+        EXPECT_EQ(tt::Cluster::instance().get_assigned_channel_for_device(mmio_dev_id), 0);
+        std::unordered_set<uint16_t> channels;
+        for (const chip_id_t &device_id : device_group) {
+            channels.insert(tt::Cluster::instance().get_assigned_channel_for_device(device_id));
+        }
+        EXPECT_EQ(channels.size(), device_group.size());
+    }
+}
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
@@ -135,246 +135,7 @@ void create_CBs_for_fused_matmul(
         auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
     }
 }
-struct SingleCoreMatmulConfig {
-    bool activations_rm = false;
-    bool outputs_rm = false;
-    size_t M = 0;
-    size_t K = 0;
-    size_t N = 0;
-    size_t out_subblock_h = 0;
-    size_t out_subblock_w = 0;
-    size_t in0_block_w = 0;
-    size_t input0_dram_channel = 0;
-    size_t input1_dram_channel = 0;
-    size_t output_dram_channel = 0;
-    CoreCoord core = {};
-};
-
-bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig& cfg) {
-    // Since running in slow dispatch mode
-    tt::tt_metal::detail::GLOBAL_CQ.reset();
-    bool pass = true;
-    ////////////////////////////////////////////////////////////////////////////
-    //                      Application Setup
-    ////////////////////////////////////////////////////////////////////////////
-    tt_metal::Program program = tt_metal::CreateProgram();
-
-    uint32_t single_tile_size = 2 * 1024;
-    TT_FATAL(
-        cfg.M * cfg.in0_block_w * single_tile_size * 2 <= 150 * 1024, "input0 block must fit within 150kB of L1");
-    TT_FATAL(
-        cfg.N * cfg.in0_block_w * single_tile_size * 2 <= 100 * 1024, "input1 block must fit within 100kB of L1");
-    TT_FATAL(cfg.M * cfg.N * single_tile_size <= 600 * 1024, "output block must fit within 600kB of L1");
-    uint32_t dram_buffer_size_input0 =
-        single_tile_size * cfg.M * cfg.K;  // num_tiles of FP16_B, hard-coded in the reader/writer kernels
-    uint32_t dram_buffer_size_input1 =
-        single_tile_size * cfg.K * cfg.N;  // num_tiles of FP16_B, hard-coded in the reader/writer kernels
-    uint32_t dram_buffer_size_output =
-        single_tile_size * cfg.M * cfg.N;  // num_tiles of FP16_B, hard-coded in the reader/writer kernels
-
-    auto input0_dram_buffer = CreateBuffer(
-        device,
-        dram_buffer_size_input0,
-        dram_buffer_size_input0,
-        tt_metal::BufferType::DRAM);
-    uint32_t input0_dram_byte_address = input0_dram_buffer.address();
-    auto input1_dram_buffer = CreateBuffer(
-        device,
-        dram_buffer_size_input1,
-        dram_buffer_size_input1,
-        tt_metal::BufferType::DRAM);
-    uint32_t input1_dram_byte_address = input1_dram_buffer.address();
-    auto output_dram_buffer = CreateBuffer(
-        device,
-        dram_buffer_size_output,
-        dram_buffer_size_output,
-        tt_metal::BufferType::DRAM);
-    uint32_t output_dram_byte_address = output_dram_buffer.address();
-
-    auto input0_dram_noc_xy = input0_dram_buffer.noc_coordinates();
-    auto input1_dram_noc_xy = input1_dram_buffer.noc_coordinates();
-    auto output_dram_noc_xy = output_dram_buffer.noc_coordinates();
-
-    std::vector<uint32_t> reader_rt_args{
-        (std::uint32_t)input0_dram_byte_address,
-        (std::uint32_t)input0_dram_noc_xy.x,
-        (std::uint32_t)input0_dram_noc_xy.y,
-        (std::uint32_t)input1_dram_byte_address,
-        (std::uint32_t)input1_dram_noc_xy.x,
-        (std::uint32_t)input1_dram_noc_xy.y,
-        (std::uint32_t)(cfg.K / cfg.in0_block_w),                      // num_blocks
-        (std::uint32_t)(cfg.M * cfg.in0_block_w),                      // input 0 block num tiles
-        (std::uint32_t)(cfg.N * cfg.in0_block_w),                      // input 1 block num tiles
-        (std::uint32_t)(cfg.M * cfg.in0_block_w * single_tile_size),   // input 0 block bytes
-        (std::uint32_t)(cfg.N * cfg.in0_block_w * single_tile_size)};  // input 1 block bytes
-    std::vector<uint32_t> writer_rt_args;
-    string writer_kernel_name;
-    if (cfg.outputs_rm) {
-        writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp";
-        writer_rt_args = {
-            (std::uint32_t)output_dram_byte_address,
-            (std::uint32_t)output_dram_noc_xy.x,
-            (std::uint32_t)output_dram_noc_xy.y,
-            uint(cfg.M * cfg.N)};
-    } else {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp";
-        writer_rt_args = {
-            (std::uint32_t)output_dram_byte_address,
-            (std::uint32_t)output_dram_noc_xy.x,
-            (std::uint32_t)output_dram_noc_xy.y,
-            (std::uint32_t)cfg.out_subblock_h,            // num tiles per sub block m
-            (std::uint32_t)cfg.out_subblock_w,            // num tiles per sub block n
-            (std::uint32_t)(cfg.M / cfg.out_subblock_h),  // num sub blocks m
-            (std::uint32_t)(cfg.N / cfg.out_subblock_w),  // num sub blocks n
-            (std::uint32_t)(
-                cfg.out_subblock_w * single_tile_size *
-                (cfg.N / cfg.out_subblock_w)),  // bytes offset to next row within sub-block
-            (std::uint32_t)(
-                cfg.out_subblock_h * cfg.out_subblock_w * single_tile_size *
-                (cfg.N / cfg.out_subblock_w)),                        // bytes offset to next row of sub-blocks
-            (std::uint32_t)(cfg.out_subblock_w * single_tile_size)};  // bytes offset to next sub-block
-    }
-    auto writer_kernel = tt_metal::CreateKernel(
-        program,
-        writer_kernel_name,
-        cfg.core,
-        tt_metal::DataMovementConfig{
-            .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
-
-    auto reader_kernel = tt_metal::CreateKernel(
-        program,
-        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
-        cfg.core,
-        tt_metal::DataMovementConfig{
-            .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
-
-    int num_blocks = (cfg.K / cfg.in0_block_w);
-    int in0_num_subblocks = (cfg.M / cfg.out_subblock_h);
-    int in0_block_num_tiles = cfg.out_subblock_h * cfg.in0_block_w * in0_num_subblocks;
-    int in0_subblock_num_tiles = cfg.out_subblock_h * cfg.in0_block_w;
-    int in1_num_subblocks = (cfg.N / cfg.out_subblock_w);
-    int in1_block_num_tiles = cfg.out_subblock_w * cfg.in0_block_w * in1_num_subblocks;
-    int in1_per_core_w = cfg.out_subblock_w * in1_num_subblocks;
-    int out_subblock_num_tiles = cfg.out_subblock_h * cfg.out_subblock_w;
-    int in0_subblock_h = (in0_block_num_tiles / in0_num_subblocks) / cfg.in0_block_w;
-
-    create_CBs_for_fused_matmul(
-        program,
-        device,
-        cfg.core,
-        cfg.activations_rm,
-        cfg.outputs_rm,
-        cfg.M,
-        cfg.N,
-        cfg.in0_block_w,
-        cfg.out_subblock_h);
-
-    TT_FATAL(
-        in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles,
-        "in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles");
-    TT_FATAL(cfg.in0_block_w == cfg.K, "Must match k tiles");
-
-    vector<uint32_t> compute_kernel_args = {
-        uint(cfg.in0_block_w),
-        uint(in0_num_subblocks),
-        uint(in0_block_num_tiles),
-        uint(in0_subblock_num_tiles),
-        uint(in0_subblock_h),
-
-        uint(in1_num_subblocks),
-        uint(in1_block_num_tiles),
-        uint(in1_per_core_w),
-
-        uint(num_blocks),
-
-        uint(cfg.out_subblock_h),
-        uint(cfg.out_subblock_w),
-        uint(out_subblock_num_tiles),
-
-        uint(cfg.activations_rm),
-        uint(cfg.outputs_rm)};
-
-    auto matmul_kernel = tt_metal::CreateKernel(
-        program,
-        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp",
-        cfg.core,
-        tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
-
-    ////////////////////////////////////////////////////////////////////////////
-    //                      Stimulus Generation
-    ////////////////////////////////////////////////////////////////////////////
-    std::vector<uint32_t> packed_identity = {};
-    std::vector<uint32_t> packed_activation = {};
-    auto activation = generate_uniform_random_vector<bfloat16>(
-        1.0f,
-        1.0f,
-        dram_buffer_size_input0 / bfloat16::SIZEOF,
-        std::chrono::system_clock::now().time_since_epoch().count());
-    if (cfg.activations_rm) {
-        packed_activation = pack_vector<uint32_t, bfloat16>(activation);
-    } else {
-        auto activations_tilized = tilize<bfloat16, 32, 32>(activation, cfg.M * 32, cfg.K * 32);
-        auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
-        packed_activation = pack_vector<uint32_t, bfloat16>(activations_tile_layout);
-    }
-    auto identity =
-        generate_strided_vector<bfloat16>(0.0f, 1.0f, cfg.N * 32 + 1, 0, dram_buffer_size_input1 / bfloat16::SIZEOF);
-    auto identity_tilized = tilize<bfloat16, 32, 32>(identity, cfg.K * 32, cfg.N * 32);
-    auto identity_tile_layout = convert_to_tile_layout(identity_tilized);
-    packed_identity = pack_vector<uint32_t, bfloat16>(identity_tile_layout);
-    ////////////////////////////////////////////////////////////////////////////
-    //                      Golden Generation
-    ////////////////////////////////////////////////////////////////////////////
-    auto packed_golden = packed_activation;  //
-
-    ////////////////////////////////////////////////////////////////////////////
-    //                      Compile and Execute Application
-    ////////////////////////////////////////////////////////////////////////////
-
-    tt_metal::detail::WriteToBuffer(input0_dram_buffer, packed_activation);
-    tt_metal::detail::WriteToBuffer(input1_dram_buffer, packed_identity);
-    std::vector<uint32_t> input0_dram_readback_packed;
-    tt_metal::detail::ReadFromBuffer(input0_dram_buffer, input0_dram_readback_packed);
-    EXPECT_TRUE(input0_dram_readback_packed == packed_activation);
-    print_vector_fixed_numel_per_row(unpack_vector<bfloat16, uint32_t>(input0_dram_readback_packed), 32);
-    std::vector<uint32_t> input1_dram_readback_packed;
-    tt_metal::detail::ReadFromBuffer(input1_dram_buffer, input1_dram_readback_packed);
-    EXPECT_TRUE(input1_dram_readback_packed == packed_identity);
-    print_vector_fixed_numel_per_row(unpack_vector<bfloat16, uint32_t>(input1_dram_readback_packed), 32);
-
-
-    tt_metal::SetRuntimeArgs(program, reader_kernel, cfg.core, reader_rt_args);
-    tt_metal::SetRuntimeArgs(program, writer_kernel, cfg.core, writer_rt_args);
-
 
-    tt_metal::detail::LaunchProgram(device, program);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //                      Comparison Checking
-    ////////////////////////////////////////////////////////////////////////////
-    std::vector<uint32_t> input0_l1_readback_packed;
-    tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 120 * 1024, 2 * single_tile_size, input0_l1_readback_packed);
-    EXPECT_TRUE(input0_l1_readback_packed == packed_activation);
-    std::vector<uint32_t> input1_l1_readback_packed;
-    tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 250 * 1024, 2 * single_tile_size, input1_l1_readback_packed);
-    EXPECT_TRUE(input1_l1_readback_packed == packed_identity);
-
-    // std::vector<uint32_t> input1_l1_readback_packed;
-    // std::vector<uint32_t> output_l1_readback_packed;
-    std::vector<uint32_t> dest_buffer_data;
-    tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data);
-    auto dest_buffer_data_unpacked = unpack_vector<bfloat16, uint32_t>(dest_buffer_data);
-    if (not cfg.outputs_rm) {
-        dest_buffer_data_unpacked = convert_to_flat_layout(dest_buffer_data_unpacked);
-        dest_buffer_data_unpacked = untilize<bfloat16, 32, 32>(dest_buffer_data_unpacked, cfg.M * 32, cfg.N * 32);
-    }
-    pass &=
-        is_close_vectors<bfloat16>(activation, dest_buffer_data_unpacked, [&](const bfloat16& a, const bfloat16& b) {
-            return is_close(a, b, 0.15f);
-        });
-
-    return pass;
-}
 bool single_tile_matmul(tt_metal::Device* device) {
 
     bool pass = true;