diff --git a/docs/source/tt_metal/examples/dram_loopback.rst b/docs/source/tt_metal/examples/dram_loopback.rst
index 050aa91b89b..8b2e1de9d23 100644
--- a/docs/source/tt_metal/examples/dram_loopback.rst
+++ b/docs/source/tt_metal/examples/dram_loopback.rst
@@ -40,7 +40,7 @@ Program pre-compilation setup
 
 .. code-block:: cpp
 
-    CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
     Program program = CreateProgram();
 
 We first obtain the global ``CommandQueue`` in order to use the fast dispatch
diff --git a/docs/source/tt_metal/examples/matmul_single_core.rst b/docs/source/tt_metal/examples/matmul_single_core.rst
index 169904b0c49..f4a237ed76f 100644
--- a/docs/source/tt_metal/examples/matmul_single_core.rst
+++ b/docs/source/tt_metal/examples/matmul_single_core.rst
@@ -89,7 +89,7 @@ core at (0, 0).
 
 .. code-block:: cpp
 
-    CommandQueue& cq = *detail::GLOBAL_CQ;
+    CommandQueue& cq = detail::GetCommandQueue(device);
     Program program{};
     CoreRange core = {.start={0, 0}, .end={0, 0}};
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
index af0b05bc969..5fbd88f2a10 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_global_l1.cpp
@@ -1197,8 +1197,8 @@ int main(int argc, char** argv) {
       // took from run_operation.cpp
       auto start = std::chrono::high_resolution_clock::now();
-      EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-      Finish(*::detail::GLOBAL_CQ);
+      EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+      Finish(::detail::GetCommandQueue(device));
       auto end = std::chrono::high_resolution_clock::now();
       duration = end - start;
       tt_metal::DumpDeviceProfileResults(device, program);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
index 5a4e3c3b8f6..e260a89384e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/matmul/matmul_local_l1.cpp
@@ -381,8 +381,8 @@ int main(int argc, char **argv) {
     std::chrono::duration duration;
     // took from run_operation.cpp
     auto start = std::chrono::high_resolution_clock::now();
-    EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-    Finish(*::detail::GLOBAL_CQ);
+    EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+    Finish(::detail::GetCommandQueue(device));
     auto end = std::chrono::high_resolution_clock::now();
     duration = end - start;
     tt_metal::DumpDeviceProfileResults(device, program);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
index 2a4490f1e6a..089cda732d3 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_global_l1.cpp
@@ -272,8 +272,8 @@ int main(int argc, char **argv) {
   log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
   auto begin = std::chrono::steady_clock::now();
-  EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
-  Finish(*::detail::GLOBAL_CQ);
+  EnqueueProgram(::detail::GetCommandQueue(device), program, false);
+  Finish(::detail::GetCommandQueue(device));
   auto end = std::chrono::steady_clock::now();
   auto elapsed_us = duration_cast(end -
begin).count(); auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) / diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp index 8e65d913e35..e1863da4681 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_read_local_l1.cpp @@ -225,8 +225,8 @@ int main(int argc, char **argv) { log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c); auto begin = std::chrono::steady_clock::now(); - EnqueueProgram(*::detail::GLOBAL_CQ, program, false); - Finish(*::detail::GLOBAL_CQ); + EnqueueProgram(::detail::GetCommandQueue(device), program, false); + Finish(::detail::GetCommandQueue(device)); auto end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(end - begin).count(); auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) / diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp index 6140296dae4..d989c7a0dad 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/pcie/test_enqueue_rw_buffer.cpp @@ -57,7 +57,7 @@ int main(int argc, char** argv) { // Device Setup int device_id = 0; tt_metal::Device* device = tt_metal::CreateDevice(device_id); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); // Application Setup uint32_t single_tile_size = 2 * 1024; diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 66823712e30..b3845862083 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -37,7 +37,7 @@ int main(int argc, char** argv) { tt_metal::Device* device = tt_metal::CreateDevice(device_id); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = detail::GetCommandQueue(device); Program programs[] = {tt_metal::CreateProgram(), tt_metal::CreateProgram(), tt_metal::CreateProgram()}; diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 984c24dfbbe..07bacfdef5f 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -397,7 +397,7 @@ int main(int argc, char **argv) { per_core_N); - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); //////////////////////////////////////////////////////////////////////////// // Execute Application diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp index 291cf516486..5162ce67809 100644 --- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp @@ -87,7 +87,7 @@ void test_enqueue_program(std::function out_vec; { - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); // Enqueue program inputs Buffer buf(device, NUM_TILES * 2048, 2048, BufferType::DRAM); diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp 
b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp index 5458fe3a3ab..3d9c6cda996 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp @@ -340,3 +340,22 @@ TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) { } } } + +// For a given collection of MMIO device and remote devices, ensure that channels are unique +TEST_F(DeviceFixture, TestDeviceToHostMemChannelAssignment) { + std::unordered_map> mmio_device_to_device_group; + for (unsigned int dev_id = 0; dev_id < num_devices_; dev_id++) { + chip_id_t assoc_mmio_dev_id = tt::Cluster::instance().get_associated_mmio_device(dev_id); + std::set &device_ids = mmio_device_to_device_group[assoc_mmio_dev_id]; + device_ids.insert(dev_id); + } + + for (const auto& [mmio_dev_id, device_group] : mmio_device_to_device_group) { + EXPECT_EQ(tt::Cluster::instance().get_assigned_channel_for_device(mmio_dev_id), 0); + std::unordered_set channels; + for (const chip_id_t &device_id : device_group) { + channels.insert(tt::Cluster::instance().get_assigned_channel_for_device(device_id)); + } + EXPECT_EQ(channels.size(), device_group.size()); + } +} diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp index a323ecf78d2..c81bfe4b51e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp @@ -135,246 +135,7 @@ void create_CBs_for_fused_matmul( auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); } } -struct SingleCoreMatmulConfig { - bool activations_rm = false; - bool outputs_rm = false; - size_t M = 0; - size_t K = 0; - size_t N = 0; - size_t out_subblock_h = 0; - size_t out_subblock_w = 0; - size_t in0_block_w = 0; - size_t input0_dram_channel = 0; - size_t input1_dram_channel = 0; - size_t output_dram_channel = 0; - CoreCoord core = {}; -}; - -bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig& cfg) { - // Since running in slow dispatch mode - tt::tt_metal::detail::GLOBAL_CQ.reset(); - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::CreateProgram(); - - uint32_t single_tile_size = 2 * 1024; - TT_FATAL( - cfg.M * cfg.in0_block_w * single_tile_size * 2 <= 150 * 1024, "input0 block must fit within 150kB of L1"); - TT_FATAL( - cfg.N * cfg.in0_block_w * single_tile_size * 2 <= 100 * 1024, "input1 block must fit within 100kB of L1"); - TT_FATAL(cfg.M * cfg.N * single_tile_size <= 600 * 1024, "output block must fit within 600kB of L1"); - uint32_t dram_buffer_size_input0 = - single_tile_size * cfg.M * cfg.K; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - uint32_t dram_buffer_size_input1 = - single_tile_size * cfg.K * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - uint32_t dram_buffer_size_output = - single_tile_size * cfg.M * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - - auto input0_dram_buffer = CreateBuffer( - device, - dram_buffer_size_input0, - dram_buffer_size_input0, - tt_metal::BufferType::DRAM); - uint32_t input0_dram_byte_address = input0_dram_buffer.address(); - auto input1_dram_buffer = CreateBuffer( - device, - 
dram_buffer_size_input1, - dram_buffer_size_input1, - tt_metal::BufferType::DRAM); - uint32_t input1_dram_byte_address = input1_dram_buffer.address(); - auto output_dram_buffer = CreateBuffer( - device, - dram_buffer_size_output, - dram_buffer_size_output, - tt_metal::BufferType::DRAM); - uint32_t output_dram_byte_address = output_dram_buffer.address(); - - auto input0_dram_noc_xy = input0_dram_buffer.noc_coordinates(); - auto input1_dram_noc_xy = input1_dram_buffer.noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer.noc_coordinates(); - - std::vector reader_rt_args{ - (std::uint32_t)input0_dram_byte_address, - (std::uint32_t)input0_dram_noc_xy.x, - (std::uint32_t)input0_dram_noc_xy.y, - (std::uint32_t)input1_dram_byte_address, - (std::uint32_t)input1_dram_noc_xy.x, - (std::uint32_t)input1_dram_noc_xy.y, - (std::uint32_t)(cfg.K / cfg.in0_block_w), // num_blocks - (std::uint32_t)(cfg.M * cfg.in0_block_w), // input 0 block num tiles - (std::uint32_t)(cfg.N * cfg.in0_block_w), // input 1 block num tiles - (std::uint32_t)(cfg.M * cfg.in0_block_w * single_tile_size), // input 0 block bytes - (std::uint32_t)(cfg.N * cfg.in0_block_w * single_tile_size)}; // input 1 block bytes - std::vector writer_rt_args; - string writer_kernel_name; - if (cfg.outputs_rm) { - writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - (std::uint32_t)output_dram_byte_address, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - uint(cfg.M * cfg.N)}; - } else { - writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; - writer_rt_args = { - (std::uint32_t)output_dram_byte_address, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - (std::uint32_t)cfg.out_subblock_h, // num tiles per sub block m - (std::uint32_t)cfg.out_subblock_w, // num tiles per sub block n - (std::uint32_t)(cfg.M / cfg.out_subblock_h), // num sub blocks m - (std::uint32_t)(cfg.N / cfg.out_subblock_w), // num sub blocks n - (std::uint32_t)( - cfg.out_subblock_w * single_tile_size * - (cfg.N / cfg.out_subblock_w)), // bytes offset to next row within sub-block - (std::uint32_t)( - cfg.out_subblock_h * cfg.out_subblock_w * single_tile_size * - (cfg.N / cfg.out_subblock_w)), // bytes offset to next row of sub-blocks - (std::uint32_t)(cfg.out_subblock_w * single_tile_size)}; // bytes offset to next sub-block - } - auto writer_kernel = tt_metal::CreateKernel( - program, - writer_kernel_name, - cfg.core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); - - auto reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", - cfg.core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); - - int num_blocks = (cfg.K / cfg.in0_block_w); - int in0_num_subblocks = (cfg.M / cfg.out_subblock_h); - int in0_block_num_tiles = cfg.out_subblock_h * cfg.in0_block_w * in0_num_subblocks; - int in0_subblock_num_tiles = cfg.out_subblock_h * cfg.in0_block_w; - int in1_num_subblocks = (cfg.N / cfg.out_subblock_w); - int in1_block_num_tiles = cfg.out_subblock_w * cfg.in0_block_w * in1_num_subblocks; - int in1_per_core_w = cfg.out_subblock_w * in1_num_subblocks; - int out_subblock_num_tiles = cfg.out_subblock_h * cfg.out_subblock_w; - int in0_subblock_h = (in0_block_num_tiles / in0_num_subblocks) / cfg.in0_block_w; - - 
create_CBs_for_fused_matmul( - program, - device, - cfg.core, - cfg.activations_rm, - cfg.outputs_rm, - cfg.M, - cfg.N, - cfg.in0_block_w, - cfg.out_subblock_h); - - TT_FATAL( - in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles, - "in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles"); - TT_FATAL(cfg.in0_block_w == cfg.K, "Must match k tiles"); - - vector compute_kernel_args = { - uint(cfg.in0_block_w), - uint(in0_num_subblocks), - uint(in0_block_num_tiles), - uint(in0_subblock_num_tiles), - uint(in0_subblock_h), - - uint(in1_num_subblocks), - uint(in1_block_num_tiles), - uint(in1_per_core_w), - - uint(num_blocks), - - uint(cfg.out_subblock_h), - uint(cfg.out_subblock_w), - uint(out_subblock_num_tiles), - - uint(cfg.activations_rm), - uint(cfg.outputs_rm)}; - - auto matmul_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp", - cfg.core, - tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); - - //////////////////////////////////////////////////////////////////////////// - // Stimulus Generation - //////////////////////////////////////////////////////////////////////////// - std::vector packed_identity = {}; - std::vector packed_activation = {}; - auto activation = generate_uniform_random_vector( - 1.0f, - 1.0f, - dram_buffer_size_input0 / bfloat16::SIZEOF, - std::chrono::system_clock::now().time_since_epoch().count()); - if (cfg.activations_rm) { - packed_activation = pack_vector(activation); - } else { - auto activations_tilized = tilize(activation, cfg.M * 32, cfg.K * 32); - auto activations_tile_layout = convert_to_tile_layout(activations_tilized); - packed_activation = pack_vector(activations_tile_layout); - } - auto identity = - generate_strided_vector(0.0f, 1.0f, cfg.N * 32 + 1, 0, dram_buffer_size_input1 / bfloat16::SIZEOF); - auto identity_tilized = tilize(identity, cfg.K * 32, cfg.N * 32); - auto identity_tile_layout = convert_to_tile_layout(identity_tilized); - packed_identity = pack_vector(identity_tile_layout); - //////////////////////////////////////////////////////////////////////////// - // Golden Generation - //////////////////////////////////////////////////////////////////////////// - auto packed_golden = packed_activation; // - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt_metal::detail::WriteToBuffer(input0_dram_buffer, packed_activation); - tt_metal::detail::WriteToBuffer(input1_dram_buffer, packed_identity); - std::vector input0_dram_readback_packed; - tt_metal::detail::ReadFromBuffer(input0_dram_buffer, input0_dram_readback_packed); - EXPECT_TRUE(input0_dram_readback_packed == packed_activation); - print_vector_fixed_numel_per_row(unpack_vector(input0_dram_readback_packed), 32); - std::vector input1_dram_readback_packed; - tt_metal::detail::ReadFromBuffer(input1_dram_buffer, input1_dram_readback_packed); - EXPECT_TRUE(input1_dram_readback_packed == packed_identity); - print_vector_fixed_numel_per_row(unpack_vector(input1_dram_readback_packed), 32); - - - tt_metal::SetRuntimeArgs(program, reader_kernel, cfg.core, reader_rt_args); - tt_metal::SetRuntimeArgs(program, writer_kernel, cfg.core, writer_rt_args); - - tt_metal::detail::LaunchProgram(device, program); - - //////////////////////////////////////////////////////////////////////////// - // Comparison Checking - 
//////////////////////////////////////////////////////////////////////////// - std::vector input0_l1_readback_packed; - tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 120 * 1024, 2 * single_tile_size, input0_l1_readback_packed); - EXPECT_TRUE(input0_l1_readback_packed == packed_activation); - std::vector input1_l1_readback_packed; - tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 250 * 1024, 2 * single_tile_size, input1_l1_readback_packed); - EXPECT_TRUE(input1_l1_readback_packed == packed_identity); - - // std::vector input1_l1_readback_packed; - // std::vector output_l1_readback_packed; - std::vector dest_buffer_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - auto dest_buffer_data_unpacked = unpack_vector(dest_buffer_data); - if (not cfg.outputs_rm) { - dest_buffer_data_unpacked = convert_to_flat_layout(dest_buffer_data_unpacked); - dest_buffer_data_unpacked = untilize(dest_buffer_data_unpacked, cfg.M * 32, cfg.N * 32); - } - pass &= - is_close_vectors(activation, dest_buffer_data_unpacked, [&](const bfloat16& a, const bfloat16& b) { - return is_close(a, b, 0.15f); - }); - - return pass; -} bool single_tile_matmul(tt_metal::Device* device) { bool pass = true; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp new file mode 100644 index 00000000000..b816d972b2a --- /dev/null +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "command_queue_fixture.hpp" +#include "command_queue_test_utils.hpp" +#include "gtest/gtest.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt::tt_metal; + +namespace host_tests { + +TEST_F(MultiCommandQueueFixture, TestAccessCommandQueue) { + for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { + EXPECT_NO_THROW(detail::GetCommandQueue(devices_[device_id])); + } +} + +TEST_F(BasicFastDispatchFixture, TestCannotAccessCommandQueueForClosedDevice) { + const unsigned int device_id = 0; + Device* device = CreateDevice(device_id); + EXPECT_NO_THROW(detail::GetCommandQueue(device)); + CloseDevice(device); + EXPECT_ANY_THROW(detail::GetCommandQueue(device)); +} + +TEST_F(MultiCommandQueueFixture, TestDirectedLoopbackToUniqueHugepage) { + std::unordered_map> golden_data; + + const uint32_t byte_size = 2048 * 16; + const uint64_t address = 0; + + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + std::vector data = + tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); + + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().write_sysmem(data.data(), data.size() * sizeof(uint32_t), address, mmio_device_id, channel); + + golden_data[device_id] = data; + } + + std::vector readback_data; + readback_data.resize(byte_size / sizeof(uint32_t)); + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = 
tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().read_sysmem(&readback_data, byte_size, address, mmio_device_id, channel); + EXPECT_EQ(readback_data, golden_data.at(device_id)); + } +} + +} // namespace host_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index a597efb0353..24e27725e9d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -299,8 +299,8 @@ TEST_F(CommandQueueFixture, TestArbiterDoesNotHang) { auto dummy_reader_kernel = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - EnqueueProgram(*::detail::GLOBAL_CQ, program, false); - Finish(*::detail::GLOBAL_CQ); + EnqueueProgram(::detail::GetCommandQueue(device_), program, false); + Finish(::detail::GetCommandQueue(device_)); } } @@ -315,7 +315,7 @@ TEST_F(CommandQueueFixture, TestSingleCbConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config} }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { @@ -330,7 +330,7 @@ TEST_F(CommandQueueFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config_0, cb_config_1, cb_config_2, cb_config_3}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { @@ -345,7 +345,7 @@ TEST_F(CommandQueueFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config_0, cb_config_1, cb_config_2, cb_config_3}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { @@ -371,9 +371,9 @@ TEST_F(CommandQueueFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { auto cb = CreateCircularBuffer(program, cr_set, cb_config); local_test_functions::initialize_dummy_kernels(program, cr_set); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); vector cb_config_vector; uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); @@ -407,7 +407,7 @@ TEST_F(CommandQueueFixture, 
TestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) DummyProgramMultiCBConfig config = {.cr_set = cr_set, .cb_config_vector = {cb_config}}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestSingleSemaphoreConfigCorrectlySentSingleCore) { @@ -416,7 +416,7 @@ TEST_F(CommandQueueFixture, TestSingleSemaphoreConfigCorrectlySentSingleCore) { DummyProgramConfig config = {.cr_set = cr_set, .num_sems = 1}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { @@ -430,8 +430,8 @@ TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode program, "tt_metal/kernels/dataflow/blank.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); } TEST_F(CommandQueueFixture, ComputeRuntimeArgs) { @@ -450,8 +450,8 @@ TEST_F(CommandQueueFixture, ComputeRuntimeArgs) { std::vector initial_runtime_args = {101, 202}; SetRuntimeArgs(program, 0, cr_set, initial_runtime_args); - EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false); - Finish(*tt::tt_metal::detail::GLOBAL_CQ); + EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device_), program, false); + Finish(tt::tt_metal::detail::GetCommandQueue(device_)); std::vector increments = {87, 216}; std::vector written_args; @@ -469,7 +469,7 @@ TEST_F(CommandQueueFixture, TestRuntimeArgsCorrectlySentSingleCore) { CoreRangeSet cr_set({cr}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; - local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, dummy_program_config); + local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), dummy_program_config); } } // end namespace single_core_tests @@ -490,7 +490,7 @@ TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentMultiCore) { DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { @@ -507,7 +507,7 @@ TEST_F(CommandQueueFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + 
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } @@ -526,7 +526,7 @@ TEST_F(CommandQueueFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) DummyProgramMultiCBConfig config = { .cr_set = cr_set, .cb_config_vector = cb_config_vector }; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_cbs_update_size(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllSemConfigsCorrectlySentMultiCore) { @@ -537,7 +537,7 @@ TEST_F(CommandQueueFixture, TestAllSemConfigsCorrectlySentMultiCore) { DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_sems(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { @@ -547,7 +547,7 @@ TEST_F(CommandQueueFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { CoreRangeSet cr_set({cr}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; - EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, dummy_program_config)); + EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), dummy_program_config)); } } // end namespace multicore_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 4e164fa602d..4dc3a4f4645 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -154,7 +154,7 @@ namespace dram_tests { TEST_F(CommandQueueFixture, WriteOneTileToDramBank0) { BufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllDramBanks) { @@ -163,7 +163,7 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllDramBanks) { .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { @@ -173,7 +173,7 @@ TEST_F(CommandQueueFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { .page_size = 2048, .buftype = BufferType::DRAM}; - 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, Sending131072Pages) { @@ -184,26 +184,26 @@ TEST_F(CommandQueueFixture, Sending131072Pages) { .page_size = 128, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } // TEST_F(CommandQueueFixture, FusedWriteDramBuffersInWhichRemainderBurstSizeDoesNotFitInLocalL1) { // BufferConfig config = {.num_pages = 4096, .page_size = 22016, .buftype = BufferType::DRAM}; -// EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); +// EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); // } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForDram) { BufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForDram2) { // From stable diffusion read buffer BufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { @@ -213,14 +213,14 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { // Should throw a host error due to the page size not fitting in the consumer CB BufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { BufferConfig buf_config = {.num_pages = 524270, .page_size = 2048, .buftype = BufferType::DRAM}; - EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, buf_config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), buf_config)); } } // end namespace dram_tests @@ -230,7 +230,7 @@ namespace l1_tests { TEST_F(CommandQueueFixture, WriteOneTileToL1Bank0) { BufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllL1Banks) { @@ -240,7 +240,7 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllL1Banks) { .page_size = 2048, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { @@ -250,13 +250,13 @@ TEST_F(CommandQueueFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { .page_size = 2048, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestNon32BAlignedPageSizeForL1) { BufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config)); } TEST_F(CommandQueueFixture, TestBackToBackNon32BAlignedPageSize) { @@ -264,17 +264,17 @@ TEST_F(CommandQueueFixture, TestBackToBackNon32BAlignedPageSize) { Buffer bufa(device_, 125000, 100, buff_type); auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufa, src_a, false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufa, src_a, false); Buffer bufb(device_, 152000, 152, buff_type); auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufb, src_b, false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufb, src_b, false); vector result_a; - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufa, result_a, true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufa, result_a, true); vector result_b; - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, bufb, result_b, true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), bufb, result_b, true); EXPECT_EQ(src_a, result_a); EXPECT_EQ(src_b, result_b); @@ -289,7 +289,7 @@ TEST_F(CommandQueueFixture, WritesToRandomBufferTypeAndThenReads) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; EXPECT_TRUE( - local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(this->device_), config)); } TEST_F(CommandQueueFixture, StressWrapTest) { @@ -303,7 +303,7 @@ TEST_F(CommandQueueFixture, StressWrapTest) { BufferStressTestConfig config = { .page_size 
= 4096, .max_num_pages_per_buffer = 2000, .num_iterations = 10000, .num_unique_vectors = 20}; EXPECT_TRUE( - local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config)); + local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(this->device_, tt::tt_metal::detail::GetCommandQueue(this->device_), config)); } } // end namespace stress_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp index bdbc38b0308..0632d1a90ac 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp @@ -85,3 +85,47 @@ class CommandQueueWithDPrintFixture: public CommandQueueFixture { // device setup, we need to skip device teardown if the test is skipped. bool test_skipped = false; }; + +class BasicFastDispatchFixture : public ::testing::Test { + protected: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + } +}; + +class MultiCommandQueueFixture : public ::testing::Test { + protected: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + + num_devices_ = tt::tt_metal::Device::detect_num_available_devices(); + + if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 1) { + GTEST_SKIP(); + } + + for (unsigned int id = 0; id < num_devices_; id++) { + auto* device = tt::tt_metal::CreateDevice(id); + devices_.push_back(device); + } + } + + void TearDown() override { + for (unsigned int id = 0; id < devices_.size(); id++) { + tt::tt_metal::CloseDevice(devices_.at(id)); + } + } + + std::vector devices_; + tt::ARCH arch_; + size_t num_devices_; +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index caff8d0cd7a..2e7b76ea11d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -217,7 +217,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c } std::vector dest_buffer_data; - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); EnqueueWriteBuffer(cq, input_dram_buffer, packed_input, false); EnqueueProgram(cq, program, false); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp index b8b93462eb1..1f838e5b91c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_mute_print_server.cpp @@ -23,7 +23,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintMuting) { Device *device = this->device_; // Set up program and command queue - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; 
+ CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // This tests prints only on a single core diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp index cedcd513143..a96e13ff924 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_print_all_harts.cpp @@ -117,7 +117,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintFromAllHarts) { // Set up program and command queue constexpr CoreCoord core = {0, 0}; // Print on first core only - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // Create a CB for testing TSLICE, dimensions are 32x32 bfloat16s diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp index 015b9a3168c..09a873cbeba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/dprint/test_raise_wait.cpp @@ -221,7 +221,7 @@ TEST_F(CommandQueueWithDPrintFixture, TestPrintRaiseWait) { Device *device = this->device_; // Set up program and command queue - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); Program program = Program(); // Test runs on a 5x5 grid diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp index 6dbbf5f0882..efe09e11448 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp @@ -30,7 +30,7 @@ struct PipelineRowConfig { }; void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConfig& test_config) { - CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ; + CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device); tt_metal::Program program = tt_metal::CreateProgram(); diff --git a/tt_eager/tensor/tensor_impl.hpp b/tt_eager/tensor/tensor_impl.hpp index 0782e653399..0588a791861 100644 --- a/tt_eager/tensor/tensor_impl.hpp +++ b/tt_eager/tensor/tensor_impl.hpp @@ -389,7 +389,7 @@ std::vector read_data_from_device(const Tensor &tensor, uint32_t size_in_byte if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { std::vector device_data; device_data.resize(size_in_bytes / sizeof(T)); - EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, *device_buffer, device_data.data(), true); + EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(tensor.device()), *device_buffer, device_data.data(), true); return device_data; } else { std::vector device_data; @@ -406,7 +406,7 @@ inline void write_data_to_device_buffer(const BufferType& data_to_write, Devi const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { - EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, *buffer, std::begin(data_to_write), false); + EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(buffer->device()), *buffer, std::begin(data_to_write), false); } else { auto uint32_data = pack_vec_into_uint32_vec(data_to_write); 
::detail::WriteToBuffer(*buffer, uint32_data);
diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp
index a264ab8ff15..bf6c0b30b25 100644
--- a/tt_eager/tt_dnn/op_library/run_operation.cpp
+++ b/tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -212,11 +212,11 @@ std::vector run_device_operation(
     if (USE_FAST_DISPATCH) {
 #ifndef TTNN_ENABLE_LOGGING
-        EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
+        EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device), program, false);
 #else
         const auto start{std::chrono::steady_clock::now()};
-        EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
-        Finish(*tt::tt_metal::detail::GLOBAL_CQ);
+        EnqueueProgram(tt::tt_metal::detail::GetCommandQueue(device), program, false);
+        Finish(tt::tt_metal::detail::GetCommandQueue(device));
         const auto end{std::chrono::steady_clock::now()};
         const std::chrono::duration<double> elapsed_seconds{end - start};
         tt::log_info(
diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index ed607d34e7b..8b9b371a837 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -13,6 +13,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/detail/program.hpp"
 #include "tt_metal/llrt/watcher.hpp"
+#include "tt_metal/host_api.hpp"
 
 using std::unique_lock;
 using std::mutex;
@@ -20,9 +21,6 @@ namespace tt::tt_metal{
 
     namespace detail {
-        // To be removed at a later time, but need a global
-        // command queue for the time being.
-        inline unique_ptr<CommandQueue> GLOBAL_CQ;
 
         inline static bool DispatchStateCheck( bool isFastDispatch){
             static bool fd = isFastDispatch;
@@ -258,10 +256,24 @@ namespace tt::tt_metal{
             return true;
         }
 
-        inline void Synchronize()
+        inline CommandQueue &GetCommandQueue(Device *device)
         {
-            if (detail::GLOBAL_CQ) {
-                Finish(*detail::GLOBAL_CQ);
+            detail::DispatchStateCheck(true);
+            // For now there is only one SW CommandQueue per device
+            static std::vector<std::unique_ptr<CommandQueue>> command_queues( Device::detect_num_available_devices() );
+            chip_id_t id = device->id();
+            TT_FATAL(id < command_queues.size(), "Invalid device {} detected", id);
+            TT_FATAL(device->is_initialized(), "Cannot access command queue for closed device {}", id);
+            if (not command_queues[id] or (command_queues[id] and command_queues[id]->device != device)) {
+                command_queues[device->id()] = std::make_unique<CommandQueue>(device);
+            }
+            return *(command_queues[id]);
+        }
+
+        inline void Synchronize(Device *device)
+        {
+            if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+                Finish(GetCommandQueue(device));
             }
         }
 
@@ -270,10 +282,10 @@ namespace tt::tt_metal{
             device->deallocate_buffers();
         }
 
-        inline void ClearCommandQueueProgramCache()
+        inline void ClearCommandQueueProgramCache(Device *device)
         {
-            if (detail::GLOBAL_CQ) {
-                ClearProgramCache(*detail::GLOBAL_CQ);
+            if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+                ClearProgramCache(GetCommandQueue(device));
             }
         }
 
@@ -408,5 +420,73 @@ namespace tt::tt_metal{
                 specified_core_spec
             );
         }
+
+        // Sending dispatch kernel.
TODO(agrebenisan): Needs a refactor + inline void SendDispatchKernelToDevice(Device *device) { + ZoneScoped; + + Program dispatch_program = CreateProgram(); + auto dispatch_cores = device->dispatch_cores().begin(); + CoreCoord producer_logical_core = *dispatch_cores++; + CoreCoord consumer_logical_core = *dispatch_cores; + + CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core); + CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core); + + std::map producer_defines = { + {"IS_DISPATCH_KERNEL", ""}, + {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)}, + {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)}, + }; + std::map consumer_defines = { + {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)}, + {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)}, + }; + std::vector dispatch_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr()}; + tt::tt_metal::CreateKernel( + dispatch_program, + "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp", + producer_logical_core, + tt::tt_metal::DataMovementConfig { + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, + .compile_args = dispatch_compile_args, + .defines = producer_defines}); + + tt::tt_metal::CreateKernel( + dispatch_program, + "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp", + consumer_logical_core, + tt::tt_metal::DataMovementConfig { + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, + .compile_args = dispatch_compile_args, + .defines = consumer_defines}); + + tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2); + tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0); + + CompileProgram(device, dispatch_program); + ConfigureDeviceWithProgram(device, dispatch_program); + + uint32_t fifo_addr = (HOST_CQ_FINISH_PTR + 32) >> 4; + vector fifo_addr_vector = {fifo_addr}; + WriteToDeviceL1(device, producer_logical_core, CQ_READ_PTR, fifo_addr_vector); + WriteToDeviceL1(device, producer_logical_core, CQ_WRITE_PTR, fifo_addr_vector); + + tt::Cluster::instance().l1_barrier(device->id()); + + const std::tuple tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value(); + auto [tlb_offset, tlb_size] = tlb_data; + // std::cout << "CORE: " << device->worker_core_from_logical_core(*device->dispatch_cores().begin()).str() << std::endl; + // std::cout << "after sending pointers to device. 
my tlb_offset: " << tlb_offset << ", my tlb_size: " << tlb_size << std::endl;
+
+        launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
+
+        // TODO(pkeller): Should use detail::LaunchProgram once we have a mechanism to avoid running all RISCs
+        tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
+        tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
+        }
+    }
 }
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 510cb53f454..bb60b904664 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -279,7 +279,19 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {
         get_compile_outpath()
     );

+    // Mark initialized before compiling and sending dispatch kernels to device because compilation expects device to be initialized
     this->initialized_ = true;
+
+    // Create system memory writer for this device to have an associated interface to hardware command queue (i.e. hugepage)
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        this->sysmem_writer = std::make_unique<SystemMemoryWriter>(
+            this->id_,
+            this->dispatch_cores(),
+            [&, this](CoreCoord core) { return this->worker_core_from_logical_core(core); }
+        );
+        detail::SendDispatchKernelToDevice(this);
+    }
+
     return true;
 }

@@ -295,6 +307,9 @@ bool Device::close() {
     this->clear_l1_state();
     tt::Cluster::instance().l1_barrier(id_);
     allocator::clear(*this->allocator_);
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        this->sysmem_writer.reset(nullptr);
+    }

     this->active_devices_.deactivate_device(this->id_);
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 7a0ef070b80..b46a71c85e3 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -9,6 +9,7 @@
 #include "hostdevcommon/common_values.hpp"
 #include "tt_metal/impl/allocator/basic_allocator.hpp"
 #include "tt_metal/impl/allocator/l1_banking_allocator.hpp"
+#include "tt_metal/impl/dispatch/command_queue_interface.hpp"
 #include "llrt/tt_cluster.hpp"
 #include "dev_msgs.h"

@@ -20,6 +21,7 @@ namespace tt_metal {
 enum class BufferType;
 class Buffer;
 class Program;
+class CommandQueue;

 using on_close_device_callback = std::function<void ()>;

@@ -167,6 +169,11 @@ class Device {
     std::unique_ptr<Allocator> allocator_ = nullptr;
     bool initialized_ = false;

+    // SystemMemoryWriter is the interface to the hardware command queue
+    std::unique_ptr<SystemMemoryWriter> sysmem_writer;
+    // Allows access to sysmem_writer
+    friend class CommandQueue;
+
     std::set<CoreCoord> compute_cores;
     std::set<CoreCoord> storage_only_cores_;
     std::set<CoreCoord> dispatch_cores_;
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index da7d199d963..818e31bdf11 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -5,7 +5,6 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"

 #include "debug_tools.hpp"
-#include "device_data.hpp"
 #include "noc/noc_parameters.h"
 #include "tt_metal/detail/program.hpp"
 #include "tt_metal/detail/tt_metal.hpp"

@@ -654,83 +653,15 @@ void EnqueueWrapCommand::process() {

 EnqueueCommandType EnqueueWrapCommand::type() { return this->type_; }

-// Sending dispatch kernel. TODO(agrebenisan): Needs a refactor
-void send_dispatch_kernel_to_device(Device* device) {
-    ZoneScoped;
-    // Ideally, this should be some separate API easily accessible in
-    // TT-metal, don't like the fact that I'm writing this from scratch
-
-    Program dispatch_program = CreateProgram();
-    auto dispatch_cores = device->dispatch_cores().begin();
-    CoreCoord producer_logical_core = *dispatch_cores++;
-    CoreCoord consumer_logical_core = *dispatch_cores;
-
-    CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core);
-    CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core);
-
-    std::map<string, string> producer_defines = {
-        {"IS_DISPATCH_KERNEL", ""},
-        {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)},
-        {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)},
-    };
-    std::map<string, string> consumer_defines = {
-        {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)},
-        {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)},
-    };
-    std::vector<uint32_t> dispatch_compile_args = {DEVICE_DATA.TENSIX_SOFT_RESET_ADDR};
-    tt::tt_metal::CreateKernel(
-        dispatch_program,
-        "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp",
-        producer_logical_core,
-        tt::tt_metal::DataMovementConfig {
-            .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-            .noc = tt::tt_metal::NOC::RISCV_0_default,
-            .compile_args = dispatch_compile_args,
-            .defines = producer_defines});
-
-    tt::tt_metal::CreateKernel(
-        dispatch_program,
-        "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp",
-        consumer_logical_core,
-        tt::tt_metal::DataMovementConfig {
-            .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-            .noc = tt::tt_metal::NOC::RISCV_0_default,
-            .compile_args = dispatch_compile_args,
-            .defines = consumer_defines});
-
-    tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2);
-    tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0);
-
-    detail::CompileProgram(device, dispatch_program);
-    tt::tt_metal::detail::ConfigureDeviceWithProgram(device, dispatch_program);
-
-    uint32_t fifo_addr = (HOST_CQ_FINISH_PTR + 32) >> 4;
-    vector<uint32_t> fifo_addr_vector = {fifo_addr};
-    tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_READ_PTR, fifo_addr_vector);
-    tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_WRITE_PTR, fifo_addr_vector);
-
-    tt::Cluster::instance().l1_barrier(device->id());
-
-    const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value();
-    auto [tlb_offset, tlb_size] = tlb_data;
-    // std::cout << "CORE: " << device->worker_core_from_logical_core(*device->dispatch_cores().begin()).str() << std::endl;
-    // std::cout << "after sending pointers to device. my tlb_offset: " << tlb_offset << ", my tlb_size: " << tlb_size << std::endl;
-
-    launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
-
-    // TODO(pkeller): Should use detail::LaunchProgram once we have a mechanism to avoid running all RISCs
-    tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
-    tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
-}
-
 // CommandQueue section
-CommandQueue::CommandQueue(Device* device): sysmem_writer(device) {
+CommandQueue::CommandQueue(Device* device) {
     vector<uint32_t> pointers(CQ_START / sizeof(uint32_t), 0);
     pointers[0] = CQ_START >> 4;

-    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, 0);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
+    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);

-    send_dispatch_kernel_to_device(device);
     this->device = device;
 }

@@ -751,13 +682,13 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
     ZoneScopedN("CommandQueue_read_buffer");
     TT_FATAL(blocking, "EnqueueReadBuffer only has support for blocking mode currently");

     uint32_t read_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + buffer.size();
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + read_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + read_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(read_buffer_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START, "EnqueueReadBuffer command is too large");
         this->wrap();
     }
     tt::log_debug(tt::LogDispatch, "EnqueueReadBuffer");

-    EnqueueReadBufferCommand command(this->device, buffer, dst, this->sysmem_writer);
+    EnqueueReadBufferCommand command(this->device, buffer, dst, *this->device->sysmem_writer);

     // TODO(agrebenisan): Provide support so that we can achieve non-blocking
     // For now, make read buffer blocking since after the
@@ -770,15 +701,18 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
     uint32_t padded_page_size = align(buffer.page_size(), 32);
     uint32_t data_size_in_bytes = padded_page_size * num_pages;

+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     if ((buffer.page_size() % 32) != 0) {
         // If page size is not 32B-aligned, we cannot do a contiguous copy
         uint32_t dst_address_offset = 0;
         for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_in_bytes; sysmem_address_offset += padded_page_size) {
-            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, 0);
+            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, mmio_device_id, channel);
             dst_address_offset += buffer.page_size();
         }
     } else {
-        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, 0);
+        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, mmio_device_id, channel);
     }
 }

@@ -792,7 +726,7 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl
         "Buffer pages must fit within the command queue data section");

     uint32_t write_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + buffer.size();
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + write_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + write_buffer_command_size >= DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(
             write_buffer_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START,
             "EnqueueWriteBuffer command is too large: {}",
@@ -803,7 +737,7 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl

     // TODO(agrebenisan): This could just be a stack variable since we
     // are just running in one thread
-    EnqueueWriteBufferCommand command(this->device, buffer, src, this->sysmem_writer);
+    EnqueueWriteBufferCommand command(this->device, buffer, src, *this->device->sysmem_writer);

     this->enqueue_command(command, blocking);
 }

@@ -844,7 +778,7 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {
     uint32_t host_data_and_device_command_size =
         DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + (host_data_num_pages * DeviceCommand::PROGRAM_PAGE_SIZE);

-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + host_data_and_device_command_size >=
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + host_data_and_device_command_size >=
         DeviceCommand::HUGE_PAGE_SIZE) {
         TT_ASSERT(
             host_data_and_device_command_size <= DeviceCommand::HUGE_PAGE_SIZE - CQ_START,
             "EnqueueProgram command size too large");
@@ -854,7 +788,7 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {
     EnqueueProgramCommand command(this->device,
         *this->program_to_buffer.at(program_id),
         this->program_to_dev_map.at(program_id),
-        this->sysmem_writer,
+        *this->device->sysmem_writer,
         program,
         stall);

@@ -863,30 +797,33 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {

 void CommandQueue::finish() {
     ZoneScopedN("CommandQueue_finish");
-    if ((this->sysmem_writer.cq_write_interface.fifo_wr_ptr << 4) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >=
+    if ((this->device->sysmem_writer->cq_write_interface.fifo_wr_ptr << 4) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >=
         DeviceCommand::HUGE_PAGE_SIZE) {
         this->wrap();
     }
     tt::log_debug(tt::LogDispatch, "Finish");

-    FinishCommand command(this->device, this->sysmem_writer);
+    FinishCommand command(this->device, *this->device->sysmem_writer);
     this->enqueue_command(command, false);

+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     // We then poll to check that we're done.
     uint32_t finish;
     do {
-        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
     } while (finish != 1);

     // Reset this value to 0 before moving on
     finish = 0;
-    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
 }

 void CommandQueue::wrap() {
     ZoneScopedN("CommandQueue_wrap");

     tt::log_debug(tt::LogDispatch, "EnqueueWrap");
-    EnqueueWrapCommand command(this->device, this->sysmem_writer);
+    EnqueueWrapCommand command(this->device, *this->device->sysmem_writer);
     this->enqueue_command(command, false);
 }
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp
index 9c91918fd52..ee2f15afc71 100644
--- a/tt_metal/impl/dispatch/command_queue.hpp
+++ b/tt_metal/impl/dispatch/command_queue.hpp
@@ -13,7 +13,6 @@

 #include "build_kernels_for_riscv/build_kernels_for_riscv.hpp"
-#include "tt_metal/impl/dispatch/command_queue_interface.hpp"
 #include "tt_metal/impl/dispatch/thread_safe_queue.hpp"
 #include "tt_metal/common/base.hpp"
 #include "tt_metal/common/tt_backend_api_types.hpp"

@@ -165,7 +164,10 @@ class EnqueueWrapCommand : public Command {
     EnqueueCommandType type();
 };

-void send_dispatch_kernel_to_device(Device* device);
+// Fwd declares
+namespace detail{
+    CommandQueue &GetCommandQueue(Device *device);
+}

 class CommandQueue {
    public:
@@ -175,7 +177,7 @@ class CommandQueue {
    private:
     Device* device;
-    SystemMemoryWriter sysmem_writer;
+
     // thread processing_thread;
     map<uint64_t, unique_ptr<Buffer>> program_to_buffer;

@@ -201,6 +203,7 @@ class CommandQueue {
     friend void EnqueueProgram(CommandQueue& cq, Program& program, bool blocking);
     friend void Finish(CommandQueue& cq);
     friend void ClearProgramCache(CommandQueue& cq);
+    friend CommandQueue &detail::GetCommandQueue(Device *device);
 };

 }  // namespace tt::tt_metal
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp
index 9b1fdf39271..876e272a003 100644
--- a/tt_metal/impl/dispatch/command_queue_interface.hpp
+++ b/tt_metal/impl/dispatch/command_queue_interface.hpp
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0

 #include "tt_metal/common/base.hpp"
-#include "tt_metal/impl/device/device.hpp"
 #include "tt_metal/impl/dispatch/device_command.hpp"
 #include "tt_metal/llrt/llrt.hpp"

@@ -11,7 +10,9 @@ using namespace tt::tt_metal;

 inline uint32_t get_cq_rd_ptr(chip_id_t chip_id) {
     uint32_t recv;
-    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, chip_id);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id);
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id);
+    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, mmio_device_id, channel);
     return recv;
 }

@@ -34,25 +35,30 @@ struct SystemMemoryCQWriteInterface {

 class SystemMemoryWriter {
    private:
-    Device* device;
+    chip_id_t device_id;
     // Data required for fast writes to write pointer location
     // in prefetch core's L1
     // const std::tuple tlb_data;
     const uint32_t m_dma_buf_size;
     const std::function fast_write_callable;
+    const std::set<CoreCoord> dispatch_cores;
+    const std::function<CoreCoord (CoreCoord)> worker_from_logical_callable;
     uint32_t byte_addr;
     char* hugepage_start;

    public:
     SystemMemoryCQWriteInterface cq_write_interface;
-    SystemMemoryWriter(Device* device) :
-        m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device->id())),
-        hugepage_start((char*) tt::Cluster::instance().host_dma_address(0, device->id(), 0)),
+    SystemMemoryWriter(chip_id_t device_id, const std::set<CoreCoord> &dev_dispatch_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) :
+        device_id(device_id),
+        m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
+        hugepage_start(
+            (char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
         fast_write_callable(
-            tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device->id())) {
+            tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)),
+        dispatch_cores(dev_dispatch_cores),
+        worker_from_logical_callable(worker_from_logical) {

-        this->device = device;
-        const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), this->device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value();
+        const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(*this->dispatch_cores.begin()))).value();
         auto [tlb_offset, tlb_size] = tlb_data;
         this->byte_addr = tlb_offset + CQ_WRITE_PTR % tlb_size;
     }
@@ -65,7 +71,7 @@ class SystemMemoryWriter {
         uint32_t rd_ptr;
         uint32_t rd_toggle;
         do {
-            rd_ptr_and_toggle = get_cq_rd_ptr(this->device->id());
+            rd_ptr_and_toggle = get_cq_rd_ptr(this->device_id);

             rd_ptr = rd_ptr_and_toggle & 0x7fffffff;
             rd_toggle = rd_ptr_and_toggle >> 31;
@@ -79,18 +85,19 @@ class SystemMemoryWriter {
     }

     // Ideally, data should be an array or pointer, but vector for time-being
+    // TODO ALMEET: MEASURE THIS
     void cq_write(const void* data, uint32_t size_in_bytes, uint32_t write_ptr) const {
         // There is a 50% overhead if hugepage_start is not made static.
         // Eventually when we want to have multiple hugepages, we may need to template
         // the sysmem writer to get this optimization.
-        static char* hugepage_start = this->hugepage_start;
+        /*static*/ char* hugepage_start = this->hugepage_start;
         void* user_scratchspace = hugepage_start + write_ptr;

         memcpy(user_scratchspace, data, size_in_bytes);
     }

     void send_write_ptr() const {
         static CoreCoord dispatch_core =
-            this->device->worker_core_from_logical_core(*this->device->dispatch_cores().begin());
+            this->worker_from_logical_callable(*this->dispatch_cores.begin());

         uint32_t write_ptr_and_toggle =
             this->cq_write_interface.fifo_wr_ptr | (this->cq_write_interface.fifo_wr_toggle << 31);
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 662eb5c3c53..b9e2663b2af 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -117,6 +117,8 @@ void Cluster::generate_cluster_descriptor() {

 void Cluster::initialize_device_drivers() {
     for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
+        this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);
+
         this->open_driver(mmio_device_id, controlled_devices);

         tt_device_params default_params;
@@ -128,6 +130,20 @@ void Cluster::initialize_device_drivers() {
     }
 }

+void Cluster::assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids) {
+    // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per MMIO device
+    // Metal currently assigns 1 channel per device. See https://github.com/tenstorrent-metal/tt-metal/issues/4087
+    TT_ASSERT(controlled_device_ids.size() <= 4, "Unable to assign each device to its own host memory channel!");
+    uint16_t channel = 0;
+    this->device_to_host_mem_channel_[mmio_device_id] = channel++;
+    for (const chip_id_t &device_id : controlled_device_ids) {
+        if (device_id == mmio_device_id) {
+            continue;
+        }
+        this->device_to_host_mem_channel_[device_id] = channel++;
+    }
+}
+
 void Cluster::get_metal_desc_from_tt_desc(
     const std::unordered_map<chip_id_t, tt_SocDescriptor> &input,
     const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks) {
@@ -142,9 +158,10 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
     std::unique_ptr<tt_device> device_driver;
     if (this->target_type_ == TargetDevice::Silicon) {
-        // This is the target/desired number of mem channels per arch/device. Silicon driver will attempt to open
-        // this many hugepages as channels, and assert if workload uses more than available.
-        uint32_t num_host_mem_ch_per_mmio_device = 1;
+        // This is the target/desired number of mem channels per arch/device.
+        // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available.
+        // Metal currently assigns 1 channel per device
+        uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
         std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
         dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
         // This will remove harvested rows from the soc descriptor
@@ -499,14 +516,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
     this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
 }

-void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    constexpr uint16_t channel = 0;
+void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id);
 }

-void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    // TODO: Uplift
-    constexpr uint16_t channel = 0;
+void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id);
 }

@@ -619,6 +635,10 @@ std::tuple<chip_id_t, CoreCoord> Cluster::get_connected_ethernet_core(std::tuple
         std::get<0>(connected_eth_core), soc_desc.chan_to_logical_eth_core_map.at(std::get<1>(connected_eth_core)));
 }

+uint32_t Cluster::get_tensix_soft_reset_addr() const {
+    return DEVICE_DATA.TENSIX_SOFT_RESET_ADDR;
+}
+
 }  // namespace tt

 std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram) {
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index 6f1fe463a82..ad1560fce5e 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -95,8 +95,8 @@ class Cluster {
     void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
     void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;

-    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
-    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
+    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;
+    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;

     int get_device_aiclk(const chip_id_t &chip_id) const;

@@ -127,6 +127,17 @@ class Cluster {
     // Returns connected ethernet core on the other chip
     std::tuple<chip_id_t, CoreCoord> get_connected_ethernet_core(std::tuple<chip_id_t, CoreCoord> eth_core) const;

+    // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is an MMIO device, it is returned.
+    chip_id_t get_associated_mmio_device(chip_id_t device_id) const {
+        return this->device_to_mmio_device_.at(device_id);
+    }
+
+    uint16_t get_assigned_channel_for_device(chip_id_t device_id) const {
+        return this->device_to_host_mem_channel_.at(device_id);
+    }
+
+    uint32_t get_tensix_soft_reset_addr() const;
+
    private:
     Cluster();
     ~Cluster();

@@ -134,6 +145,7 @@ class Cluster {
     void detect_arch_and_target();
     void generate_cluster_descriptor();
     void initialize_device_drivers();
+    void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids);
     void open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs = false);
     void start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const;

@@ -162,6 +174,17 @@ class Cluster {
     // Save mapping of device id to associated MMIO device id for fast lookup
     std::unordered_map<chip_id_t, chip_id_t> device_to_mmio_device_;

+    // Currently, each device is mapped to its own channel in host memory to enable fast dispatch
+    // Channels are unique within a group of devices all controlled by a particular MMIO device
+    // For example:
+    //      Two N300 cards where MMIO device IDs are 0, 1 and R chips are 2, 3
+    //      0 L controls 2 R and 1 L controls 3 R, then device_to_host_mem_channel_:
+    //          0 -> 0
+    //          2 -> 1
+    //          1 -> 0
+    //          3 -> 1
+    std::unordered_map<chip_id_t, uint16_t> device_to_host_mem_channel_;
+
     tt_device_dram_address_params dram_address_params = {
         DRAM_BARRIER_BASE
     };
diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
index 810e181e996..f60790c028e 100644
--- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
+++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
@@ -68,7 +68,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();
diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
index 44ec4de8746..5868fc5dcbf 100644
--- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
+++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();
diff --git a/tt_metal/programming_examples/loopback/loopback.cpp b/tt_metal/programming_examples/loopback/loopback.cpp
index 33de9f92008..fdfd6bf9a0a 100644
--- a/tt_metal/programming_examples/loopback/loopback.cpp
+++ b/tt_metal/programming_examples/loopback/loopback.cpp
@@ -34,7 +34,7 @@ int main(int argc, char **argv) {
         /*
         * Setup program and command queue to execute along with its buffers and kernels to use
         */
-        CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
+        CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

         Program program = CreateProgram();

         constexpr CoreCoord core = {0, 0};
diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
index d6181d7c67a..9a63c9abe58 100644
--- a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
+++ b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
@@ -110,7 +110,7 @@ void matmul_multi_core(vector& a, vector& b, vector& a, vector& b, vector& a, vector& b, vect
         * Setup program to execute along with its buffers and kernels to use
         * Core range is just single core
         */
-        CommandQueue& cq = *detail::GLOBAL_CQ;
+        CommandQueue& cq = tt_metal::detail::GetCommandQueue(device);
         Program program{};

         tt::DataFormat cb_data_format = tt::DataFormat::Float16_b;
diff --git a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
index 49752b88581..c96f83d11b6 100644
--- a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
+++ b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
@@ -110,7 +110,7 @@ void matmul_single_core(vector& a, vector& b, vector &logical_c
 {
     ProfileTTMetalScope profile_this = ProfileTTMetalScope("DumpDeviceProfileResults");
     //TODO: (MO) This global is temporary need to update once the new interface is in
-    if (GLOBAL_CQ) {
-      Finish(*GLOBAL_CQ);
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+      Finish(GetCommandQueue(device));
     }
+    TT_FATAL(tt_is_print_server_running() == false, "Debug print server is running, cannot dump device profiler data");
     auto worker_cores_used_in_program =\
      device->worker_cores_from_logical_cores(logical_cores);
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index d59d3a2ea8a..d3ed04e9a2d 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -234,7 +234,6 @@ namespace detail {
         DumpDeviceProfileResults(device, program);
     }

-
     bool ConfigureDeviceWithProgram(Device *device, Program &program) {
         ZoneScoped;
         bool pass = true;
@@ -328,22 +327,14 @@ namespace detail {
         ZoneScoped;
         program.compile(device);
     }
-}
+}   // namespace detail

 Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_remap) {
     Device * dev = new Device(device_id, l1_bank_remap);
-    const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
-    if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-        detail::GLOBAL_CQ = std::make_unique<CommandQueue>(dev);
-    }
     return dev;
 }

 bool CloseDevice(Device *device) {
-    // Needed to ensure that GLOBAL_CQ doesn't contain a closed device
-    if (detail::GLOBAL_CQ) {
-        detail::GLOBAL_CQ.reset(nullptr);
-    }
     return device->close();
 }