diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 65fad37937b..43466f36f65 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -94,11 +94,11 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue // Clear out command queue uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs(); + uint32_t cq_size = device->sysmem_manager().get_cq_size(); std::vector<uint32_t> cq_zeros((cq_size - CQ_START) / sizeof(uint32_t), 0); - tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), CQ_START, mmio_device_id, channel); + tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), get_absolute_cq_offset(channel, 0, cq_size) + CQ_START, mmio_device_id, channel); for (const bool cq_write : {true, false}) { for (const bool cq_read : {true, false}) { @@ -327,6 +327,7 @@ namespace dram_tests { TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; for (Device *device : devices_) { + tt::log_info("Running on Device {}", device->id()); local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } @@ -428,11 +429,9 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { // Requires enqueue write buffer TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { for (Device *device : this->devices_) { + tt::log_info("Running on Device {}", device->id()); uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_issue_region_size = 805310400; + uint32_t command_issue_region_size = device->sysmem_manager().get_issue_queue_size(0); uint32_t max_command_size = command_issue_region_size - CQ_START; uint32_t buffer = 14240; @@ -446,10 +445,9 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { for (Device *device : this->devices_) { + tt::log_info("Running on Device {}", device->id()); uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); + uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); uint32_t num_pages = command_queue_size / page_size; TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; @@ -465,10 +463,8 @@ TEST_F(CommandQueueSingleCardFixture, 
TestWrapCompletionQOnInsufficientSpace) { uint32_t small_page_size = 2048; // page size for second read for (Device *device : devices_) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_completion_region_size = 268431360; + tt::log_info("Running on Device {}", device->id()); + uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0); uint32_t first_buffer_size = tt::round_up(command_completion_region_size * 0.95, large_page_size); @@ -504,10 +500,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { // Using default 75-25 issue and completion queue split for (Device *device : devices_) { - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); - uint32_t command_completion_region_size = 268431360; + tt::log_info("Running on Device {}", device->id()); + uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0); uint32_t num_pages_buff_1 = 9; uint32_t page_size_buff_1 = 2048; @@ -653,6 +647,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; for (Device *device : devices_) { + tt::log_info("Running on Device {}", device->id()); EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( device, device->command_queue(), config)); } } @@ -672,34 +667,34 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo // TODO: Split this into separate tests TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { + std::map<std::string, std::vector<std::array<uint32_t, 2>>> test_params; + for (Device *device : devices_) { - for (const std::array<uint32_t, 2> cores : - {std::array<uint32_t, 2>{1, 1}, - std::array<uint32_t, 2>{5, 1}, - std::array<uint32_t, 2>{1, 5}, - std::array<uint32_t, 2>{5, 3}, - std::array<uint32_t, 2>{3, 5}, - std::array<uint32_t, 2>{5, 5}, - std::array<uint32_t, 2>{ - static_cast<uint32_t>(device->compute_with_storage_grid_size().x), - static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}) { - for (const std::array<uint32_t, 2> num_pages : { - std::array<uint32_t, 2>{1, 1}, - std::array<uint32_t, 2>{2, 1}, - std::array<uint32_t, 2>{1, 2}, - std::array<uint32_t, 2>{2, 2}, - std::array<uint32_t, 2>{7, 11}, - std::array<uint32_t, 2>{3, 65}, - std::array<uint32_t, 2>{67, 4}, - std::array<uint32_t, 2>{3, 137}, - }) { - for (const std::array<uint32_t, 2> page_shape : { - std::array<uint32_t, 2>{32, 32}, - std::array<uint32_t, 2>{1, 4}, - std::array<uint32_t, 2>{1, 120}, - std::array<uint32_t, 2>{1, 1024}, - std::array<uint32_t, 2>{1, 2048}, - }) { + if (tt::Cluster::instance().is_galaxy_cluster()) { + test_params = { + {"cores", + {{1, 1}, + {static_cast<uint32_t>(device->compute_with_storage_grid_size().x), + static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}}, + {"num_pages", {{3, 65}}}, + {"page_shape", {{32, 32}}}}; + } else { + test_params = { + {"cores", + {{1, 1}, + {5, 1}, + {1, 5}, + {5, 3}, + {3, 5}, + {5, 5}, + {static_cast<uint32_t>(device->compute_with_storage_grid_size().x), + static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}}, + {"num_pages", {{1, 1}, {2, 1}, {1, 2}, {2, 2}, {7, 11}, {3, 65}, {67, 4}, {3, 137}}}, + {"page_shape", {{32, 32}, {1, 4}, {1, 120}, {1, 1024}, 
{1, 2048}}}}; + } + for (const std::array<uint32_t, 2> cores : test_params.at("cores")) { + for (const std::array<uint32_t, 2> num_pages : test_params.at("num_pages")) { + for (const std::array<uint32_t, 2> page_shape : test_params.at("page_shape")) { for (const TensorMemoryLayout shard_strategy : {TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::WIDTH_SHARDED, @@ -712,7 +707,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { config.num_iterations = num_iterations; config.mem_config = shard_strategy; config.page_shape = page_shape; - tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); + tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( device, device->command_queue(), config, BufferType::L1, false); } @@ -800,7 +795,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { config.num_iterations = num_iterations; config.mem_config = shard_strategy; config.page_shape = page_shape; - tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); + tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str()); local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( device, device->command_queue(), config, BufferType::L1, true); } diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index 66c0c92e0b6..3c5db2a7312 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -119,6 +119,10 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id, auto compute_with_storage_start = desc_yaml["compute_with_storage_grid_range"]["start"]; auto compute_with_storage_end = desc_yaml["compute_with_storage_grid_range"]["end"]; + if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") { + compute_with_storage_start = desc_yaml["tg_compute_with_storage_grid_range"]["start"]; + compute_with_storage_end = desc_yaml["tg_compute_with_storage_grid_range"]["end"]; + } TT_ASSERT(compute_with_storage_start.IsSequence() and compute_with_storage_end.IsSequence()); TT_ASSERT(compute_with_storage_end[0].as<size_t>() >= compute_with_storage_start[0].as<size_t>()); TT_ASSERT(compute_with_storage_end[1].as<size_t>() >= compute_with_storage_start[1].as<size_t>()); @@ -136,7 +140,11 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id, } std::vector<RelativeCoreCoord> dispatch_cores; - for (const auto& core_node : desc_yaml["dispatch_cores"]) { + auto dispatch_cores_string = "dispatch_cores"; + if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") { + 
dispatch_cores_string = "tg_dispatch_cores"; + } + for (const auto& core_node : desc_yaml[dispatch_cores_string]) { RelativeCoreCoord coord = {}; if (core_node.IsSequence()) { // Logical coord diff --git a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml index 9afdd48ccd2..68eb9d8df87 100644 --- a/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml +++ b/tt_metal/core_descriptors/wormhole_b0_80_arch.yaml @@ -49,12 +49,21 @@ nebula_x1: start: [0, 0] end: [7, 7] + tg_compute_with_storage_grid_range: # Logical only start and end [x, y] + start: [0, 0] + end: [7, 3] + storage_cores: [] dispatch_cores: [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]] + tg_dispatch_cores: + [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1], + [0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2], + [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]] + dispatch_core_type: "tensix" diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index e73c9efbdbb..1194b4790de 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -73,6 +73,7 @@ Device::Device( ZoneScoped; TT_ASSERT(num_hw_cqs > 0 and num_hw_cqs < 3, "num_hw_cqs can be between 1 and 2"); this->build_key_ = tt::Cluster::instance().get_harvesting_mask(device_id); + tunnel_device_dispatch_workers_ = {}; this->initialize(l1_small_size, l1_bank_remap, minimal); } @@ -384,6 +385,737 @@ void Device::configure_kernel_variant( } } +void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>> &device_worker_variants) { + for (uint32_t dwv = 0; dwv < device_worker_variants.size(); dwv++) + { + if (device_worker_variants[dwv].size() == 0) { + continue; + } + log_debug(tt::LogMetal, "Setting up {} Arguments", magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + switch(dwv) { + case PREFETCH: + { + uint32_t num_prefetchers = device_worker_variants[PREFETCH].size(); + TT_ASSERT(device_worker_variants[MUX].size() == 1, "Cannot have more than one Mux."); + auto mux_settings = std::get<1>(device_worker_variants[MUX][0]); + TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Expected = {}. 
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + uint32_t mux_sem = mux_settings.consumer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + auto dispatch_core_type = settings.dispatch_core_type; + uint32_t downstream_cb_base = mux_settings.cb_start_address + mux_settings.cb_size_bytes * mux_sem; + settings.upstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.downstream_cores.push_back(mux_settings.worker_physical_core); + settings.compile_args.resize(23); + auto& compile_args = settings.compile_args; + compile_args[0] = downstream_cb_base; + compile_args[1] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + compile_args[2] = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); + compile_args[3] = settings.producer_semaphore_id; + compile_args[4] = mux_sem++; + compile_args[5] = settings.issue_queue_start_addr; + compile_args[6] = settings.issue_queue_size; + compile_args[7] = dispatch_constants::PREFETCH_Q_BASE; + compile_args[8] = dispatch_constants::get(dispatch_core_type).prefetch_q_size(); + compile_args[9] = CQ_PREFETCH_Q_RD_PTR; + compile_args[10] = dispatch_constants::get(dispatch_core_type).cmddat_q_base(); + compile_args[11] = dispatch_constants::get(dispatch_core_type).cmddat_q_size(); + compile_args[12] = dispatch_constants::get(dispatch_core_type).scratch_db_base(); // unused for prefetch_h + compile_args[13] = dispatch_constants::get(dispatch_core_type).scratch_db_size(); // unused for prefetch_h + compile_args[14] = 0; //prefetch_sync_sem unused for prefetch_h + compile_args[15] = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); // prefetch_d only + compile_args[16] = 0; // prefetch_d only + compile_args[17] = 0; //prefetch_downstream_cb_sem, // prefetch_d only + compile_args[18] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + compile_args[19] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only + compile_args[20] = 2; //prefetch_h_exec_buf_sem, + compile_args[21] = false; // is_dram_variant + compile_args[22] = true; // is_host_variant + } + break; + } + case MUX: + { + uint32_t num_prefetchers = device_worker_variants[PREFETCH].size(); + TT_ASSERT(device_worker_variants[MUX].size() == 1, "Cannot have more than one Mux."); + auto &mux_settings = std::get<1>(device_worker_variants[MUX][0]); + TT_ASSERT(num_prefetchers == mux_settings.semaphores.size(), "Mux does not have required number of semaphores for Prefetchers. Expected = {}. 
Found = {}", num_prefetchers, mux_settings.semaphores.size()); + uint32_t mux_sem = mux_settings.consumer_semaphore_id; + + auto& compile_args = mux_settings.compile_args; + compile_args.resize(25); + compile_args[0] = 0; // 0: reserved + compile_args[1] = mux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = mux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = num_prefetchers; // 3: mux_fan_in + uint32_t arg_index = 4; + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 1, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + } + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + + compile_args[8] = tunneler_settings.cb_start_address >> 4; // 8: remote_tx_queue_start_addr_words + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words + compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[12] = 0; // 12: remote_tx_queue_id + compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type + compile_args[14] = BRISC_L1_RESULT_BASE; // 14: test_results_addr + compile_args[15] = 1024; // 15: test_results_size + compile_args[16] = 0; // 16: timeout_cycles + compile_args[17] = 0x0; // 17: output_depacketize + compile_args[18] = 0x0; // 18: output_depacketize info + arg_index = 19; // 19, 20, 21, 22: input x packetize info: + for (auto&[core, settings] : device_worker_variants[PREFETCH]) { + compile_args[arg_index++] = packet_switch_4B_pack(0x1, + dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, + settings.producer_semaphore_id, // upstream sem + mux_sem++); // local sem + } + compile_args[23] = packet_switch_4B_pack(0xA1, 0xA2, 0xA3, 0xA4); // 23: packetized input src id + compile_args[24] = packet_switch_4B_pack(0xB1, 0xB2, 0xB3, 0xB4); // 24: packetized input dest id + break; + } + case US_TUNNELER_REMOTE: + { + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + bool is_tunnel_start = tunneler_settings.tunnel_stop == 0; + auto &compile_args = tunneler_settings.compile_args; + compile_args.resize(16); + compile_args[0] = 0xDACADACA; // 0: endpoint_id_start_index + compile_args[1] = 2; // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
+ compile_args[2] = tunneler_settings.cb_start_address >> 4; // 2: rx_queue_start_addr_words + compile_args[3] = tunneler_settings.cb_size_bytes >> 4; // 3: rx_queue_size_words + + compile_args[4] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::ETH); // 4: remote_receiver_0_info + compile_args[6] = tunneler_settings.cb_start_address >> 4; // 6: remote_receiver_queue_start_addr_words 0 + compile_args[7] = tunneler_settings.cb_size_bytes >> 4; // 7: remote_receiver_queue_size_words 0 + + if (is_tunnel_start) { + auto &demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + auto &mux_settings = std::get<1>(device_worker_variants[MUX][0]); + + compile_args[5] = packet_switch_4B_pack(demux_settings.worker_physical_core.x, + demux_settings.worker_physical_core.y, + device_worker_variants[DISPATCH].size(),//num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver_1_info + compile_args[8] = demux_settings.cb_start_address >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = demux_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + compile_args[10] = packet_switch_4B_pack(mux_settings.worker_physical_core.x, + mux_settings.worker_physical_core.y, + device_worker_variants[PREFETCH].size(), // mux output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender_0_info + } else { + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + + compile_args[5] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, + mux_d_settings.worker_physical_core.y, + 1,//num_dest_endpoints, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver_1_info + compile_args[8] = (mux_d_settings.cb_start_address + mux_d_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = mux_d_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + compile_args[10] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + 1, // demux output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender_0_info + } + + compile_args[11] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 3, // r tunneler output queue id + (uint32_t)DispatchRemoteNetworkType::ETH); // 11: remote_sender_1_info + + compile_args[12] = 0x39000; // 12: test_results_addr + compile_args[13] = 0x7000; // 13: test_results_size + compile_args[14] = 0; // 14: timeout_cycles + + break; + } + case DEMUX: + { + TT_ASSERT(device_worker_variants[DEMUX].size() == 1, "Unexpected number of demux cores."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + auto &demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + auto &dispatch_settings = std::get<1>(device_worker_variants[DISPATCH][0]); + + auto &compile_args = demux_settings.compile_args; + compile_args.resize(30); + + compile_args[0] = 0xD1; // 0: endpoint_id_start_index + compile_args[1] = demux_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = device_worker_variants[DISPATCH].size(); // 3: demux_fan_out + + uint32_t arg_index = 4; + for 
(auto&[core, settings] : device_worker_variants[DISPATCH]) { + compile_args[arg_index++] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, + (uint32_t)settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info + } + arg_index = 8; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + compile_args[arg_index++] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x + compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x + } + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = 3; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {0, 1, 2, 3}; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = BRISC_L1_RESULT_BASE; // 22: test_results_addr + compile_args[23] = 1024; // 23: test_results_size + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0xF; // 25: output_depacketize_mask + arg_index = 26; + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + // 26, 27, 28, 29: output x depacketize info: + compile_args[arg_index++] = packet_switch_4B_pack(settings.cb_log_page_size, + settings.consumer_semaphore_id, // downstream sem + demux_sem++, // local sem + 1); // remove header + } + break; + } + case DISPATCH: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH].size(); + TT_ASSERT(device_worker_variants[DEMUX].size() == 1, "Cannot have more than one Demux."); + auto demux_settings = std::get<1>(device_worker_variants[DEMUX][0]); + TT_ASSERT(num_dispatchers == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Expected = {}. 
Found = {}", num_dispatchers, demux_settings.semaphores.size()); + uint32_t demux_sem = demux_settings.producer_semaphore_id; + for (auto&[core, settings] : device_worker_variants[DISPATCH]) { + auto dispatch_core_type = settings.dispatch_core_type; + settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); + settings.compile_args.resize(17); + auto& compile_args = settings.compile_args; + compile_args[0] = settings.cb_start_address; + compile_args[1] = settings.cb_log_page_size; + compile_args[2] = settings.cb_pages; + compile_args[3] = settings.consumer_semaphore_id; + compile_args[4] = demux_sem++; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; //unused prefetch_sync_sem + compile_args[7] = settings.command_queue_start_addr; + compile_args[8] = settings.completion_queue_start_addr; + compile_args[9] = settings.completion_queue_size; + compile_args[10] = dispatch_constants::DISPATCH_BUFFER_BASE; // unused + compile_args[11] = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); // unused + compile_args[12] = 0; // unused: local ds semaphore + compile_args[13] = 0; // unused: remote ds semaphore + compile_args[14] = 0; // preamble size + compile_args[15] = false; // is_dram_variant + compile_args[16] = true; // is_host_variant + } + break; + } + case US_TUNNELER_LOCAL: + { + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + TT_ASSERT(device_worker_variants[US_TUNNELER_LOCAL].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + + auto &compile_args = tunneler_settings.compile_args; + compile_args.resize(16); + compile_args[0] = 0xDACADACA; // 0: endpoint_id_start_index + compile_args[1] = 2; // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. + compile_args[2] = tunneler_settings.cb_start_address >> 4; // 2: rx_queue_start_addr_words + compile_args[3] = tunneler_settings.cb_size_bytes >> 4; // 3: rx_queue_size_words + + compile_args[4] = packet_switch_4B_pack(demux_d_settings.worker_physical_core.x, + demux_d_settings.worker_physical_core.y, + is_tunnel_end ? 
1 : 2, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_receiver_0_info + + compile_args[5] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 1, // input q id of remote ethernet tunneler + (uint32_t)DispatchRemoteNetworkType::ETH); // 5: remote_receiver_1_info + + compile_args[6] = demux_d_settings.cb_start_address >> 4; // 6: remote_receiver_queue_start_addr_words 0 + compile_args[7] = demux_d_settings.cb_size_bytes >> 4; // 7: remote_receiver_queue_size_words 0 + compile_args[8] = (tunneler_settings.cb_start_address + tunneler_settings.cb_size_bytes) >> 4; // 8: remote_receiver_queue_start_addr_words 1 + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_receiver_queue_size_words 1 + + compile_args[10] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, + tunneler_settings.eth_partner_physical_core.y, + 2, // queue id of remote eth tunneler sender + (uint32_t)DispatchRemoteNetworkType::ETH); // 10: remote_sender_0_info + compile_args[11] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, + mux_d_settings.worker_physical_core.y, + is_tunnel_end ? 1 : 2, // mux_d output queue id + (uint32_t)DispatchRemoteNetworkType::NOC0); // 11: remote_sender_1_info + compile_args[12] = 0x39000; // 12: test_results_addr + compile_args[13] = 0x7000; // 13: test_results_size + compile_args[14] = 0; // 14: timeout_cycles + if (!is_tunnel_end && tunneler_settings.tunnel_stop > 1) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + auto mux_d_sender = us_tunneler_remote_settings.worker_physical_core; + compile_args[15] = (0x3 << 16) | (mux_d_sender.y << 8) | (mux_d_sender.x); + log_debug(tt::LogMetal, "Tunneler Inner Device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); + } + + break; + } + case DEMUX_D: + { + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + TT_ASSERT(device_worker_variants[DEMUX_D].size() == 1, "Unexpected number of device demux."); + + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + auto &demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto &prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + + TT_ASSERT(demux_d_settings.tunnel_stop > 0 && demux_d_settings.tunnel_stop <= 4, "Invalid Demux D tunnel stop."); + + auto &compile_args = demux_d_settings.compile_args; + compile_args.resize(30); + + compile_args[0] = 0xB1; // 0: endpoint_id_start_index + compile_args[1] = demux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = demux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = is_tunnel_end ? 
1 : 2; // 3: demux_fan_out + + compile_args[4] = packet_switch_4B_pack(prefetch_d_settings.worker_physical_core.x, + prefetch_d_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info + + compile_args[8] = prefetch_d_settings.cb_start_address >> 4; // 8: remote_tx_queue_start_addr_words 0 + compile_args[9] = prefetch_d_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words 0 + + if(!is_tunnel_end) { + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + compile_args[5] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, + (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, + 0, + (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info + + compile_args[10] = us_tunneler_remote_settings.cb_start_address >> 4; // 10: remote_tx_queue_start_addr_words 1 + compile_args[11] = us_tunneler_remote_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words 1 + } + + compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[18] = 2; // 18: remote_rx_queue_id + compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type + uint32_t dest_map_array[4] = {1, 1, 1, 1}; // needs to be based on tunnel stop. + dest_map_array[demux_d_settings.tunnel_stop-1] = 0; + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); + compile_args[20] = (uint32_t)(dest_endpoint_output_map >> 32); // 20: dest_endpoint_output_map_hi + compile_args[21] = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); // 21: dest_endpoint_output_map_lo + compile_args[22] = BRISC_L1_RESULT_BASE; // 22: test_results_addr + compile_args[23] = 1024; // 23: test_results_size + compile_args[24] = 0; // 24: timeout_cycles + compile_args[25] = 0x1; // 25: output_depacketize_mask + compile_args[26] = packet_switch_4B_pack(prefetch_d_settings.cb_log_page_size, + prefetch_d_settings.consumer_semaphore_id, // downstream sem + demux_d_settings.producer_semaphore_id, // local sem + 0); // remove header + break; + } + case PREFETCH_D: + { + + uint32_t num_prefetchers = device_worker_variants[PREFETCH_D].size(); + TT_ASSERT(device_worker_variants[DEMUX_D].size() == 1, "Cannot have more than one Demux D."); + auto &prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + auto demux_d_settings = std::get<1>(device_worker_variants[DEMUX_D][0]); + auto dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + + TT_ASSERT(num_prefetchers == demux_d_settings.semaphores.size(), "Demux D does not have required number of semaphores for Prefetcher D. Expected = {}. Found = {}", num_prefetchers, demux_d_settings.semaphores.size()); + + auto dispatch_core_type = prefetch_d_settings.dispatch_core_type; + prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_physical_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); + + uint32_t scratch_db_base = (prefetch_d_settings.cb_start_address + prefetch_d_settings.cb_size_bytes + PCIE_ALIGNMENT - 1) & (~(PCIE_ALIGNMENT - 1)); + uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); + const uint32_t l1_size = dispatch_core_type == CoreType::WORKER ? 
MEM_L1_SIZE : MEM_ETH_SIZE; + TT_ASSERT(scratch_db_base + scratch_db_size <= l1_size); + + auto& compile_args = prefetch_d_settings.compile_args; + compile_args.resize(23); + compile_args[0] = dispatch_d_settings.cb_start_address; + compile_args[1] = dispatch_d_settings.cb_log_page_size; + compile_args[2] = dispatch_d_settings.cb_pages; + compile_args[3] = prefetch_d_settings.producer_semaphore_id; + compile_args[4] = dispatch_d_settings.consumer_semaphore_id; + compile_args[5] = 0; + compile_args[6] = 0; + compile_args[7] = 0; + compile_args[8] = dispatch_constants::get(dispatch_core_type).prefetch_q_size(); + compile_args[9] = CQ_PREFETCH_Q_RD_PTR; + compile_args[10] = prefetch_d_settings.cb_start_address; + compile_args[11] = prefetch_d_settings.cb_size_bytes; + compile_args[12] = scratch_db_base; + compile_args[13] = scratch_db_size; + compile_args[14] = 0; //prefetch_sync_sem + compile_args[15] = prefetch_d_settings.cb_pages; // prefetch_d only + compile_args[16] = prefetch_d_settings.consumer_semaphore_id; // prefetch_d only + compile_args[17] = demux_d_settings.producer_semaphore_id; //prefetch_downstream_cb_sem, // prefetch_d only + compile_args[18] = prefetch_d_settings.cb_log_page_size; + compile_args[19] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only + compile_args[20] = 2; //prefetch_h_exec_buf_sem, + compile_args[21] = true; // is_dram_variant + compile_args[22] = false; // is_host_variant + break; + } + case DISPATCH_D: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH_D].size(); + TT_ASSERT(device_worker_variants[MUX_D].size() == 1, "Cannot have more than one Mux D."); + auto mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + TT_ASSERT(num_dispatchers == mux_d_settings.semaphores.size(), "Mux D does not have required number of semaphores for Dispatchers. Expected = {}. 
Found = {}", num_dispatchers, mux_d_settings.semaphores.size()); + uint32_t sem = 0; + auto &dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + auto prefetch_d_settings = std::get<1>(device_worker_variants[PREFETCH_D][0]); + + auto dispatch_core_type = dispatch_d_settings.dispatch_core_type; + dispatch_d_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core); + dispatch_d_settings.downstream_cores.push_back(mux_d_settings.worker_physical_core); + dispatch_d_settings.compile_args.resize(17); + auto& compile_args = dispatch_d_settings.compile_args; + compile_args[0] = dispatch_d_settings.cb_start_address; + compile_args[1] = dispatch_d_settings.cb_log_page_size; + compile_args[2] = dispatch_d_settings.cb_pages; + compile_args[3] = dispatch_d_settings.consumer_semaphore_id; + compile_args[4] = prefetch_d_settings.producer_semaphore_id; + compile_args[5] = dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS; + compile_args[6] = 0; + compile_args[7] = dispatch_d_settings.command_queue_start_addr; + compile_args[8] = dispatch_d_settings.completion_queue_start_addr; + compile_args[9] = dispatch_d_settings.completion_queue_size; + compile_args[10] = mux_d_settings.cb_start_address; + compile_args[11] = mux_d_settings.cb_size_bytes; + compile_args[12] = dispatch_d_settings.producer_semaphore_id; // unused: local ds semaphore + compile_args[13] = mux_d_settings.consumer_semaphore_id; // unused: remote ds semaphore + compile_args[14] = sizeof(dispatch_packet_header_t); // preamble size + compile_args[15] = true; // is_dram_variant + compile_args[16] = false; // is_host_variant + break; + } + case MUX_D: + { + uint32_t num_dispatchers = device_worker_variants[DISPATCH_D].size(); + TT_ASSERT(device_worker_variants[MUX_D].size() == 1, "Cannot have more than one Mux D."); + auto &mux_d_settings = std::get<1>(device_worker_variants[MUX_D][0]); + auto dispatch_d_settings = std::get<1>(device_worker_variants[DISPATCH_D][0]); + + TT_ASSERT(num_dispatchers == mux_d_settings.semaphores.size(), "Mux D does not have required number of semaphores for Dispatchers. Expected = {}. Found = {}", num_dispatchers, mux_d_settings.semaphores.size()); + uint32_t sem = 0; + bool is_tunnel_end = device_worker_variants[US_TUNNELER_REMOTE].size() == 0; + + auto& compile_args = mux_d_settings.compile_args; + compile_args.resize(25); + compile_args[0] = 0; // 0: reserved + compile_args[1] = mux_d_settings.cb_start_address >> 4; // 1: rx_queue_start_addr_words + compile_args[2] = mux_d_settings.cb_size_bytes >> 4; // 2: rx_queue_size_words + compile_args[3] = is_tunnel_end ? 
1 : 2; // 3: mux_fan_in + uint32_t arg_index = 4; + compile_args[4] = packet_switch_4B_pack(dispatch_d_settings.worker_physical_core.x, + dispatch_d_settings.worker_physical_core.y, + 1, + DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + + if (!is_tunnel_end) { + TT_ASSERT(device_worker_variants[US_TUNNELER_REMOTE].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[US_TUNNELER_REMOTE][0]); + compile_args[5] = packet_switch_4B_pack(us_tunneler_remote_settings.worker_physical_core.x, + us_tunneler_remote_settings.worker_physical_core.y, + 3, + DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info + + } + + TT_ASSERT(device_worker_variants[US_TUNNELER_LOCAL].size() == 1, "Unexpected number of ethernet tunnelers."); + auto &tunneler_settings = std::get<1>(device_worker_variants[US_TUNNELER_LOCAL][0]); + + compile_args[8] = (tunneler_settings.cb_start_address + tunneler_settings.cb_size_bytes) >> 4; // 8: remote_tx_queue_start_addr_words + compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words + compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[12] = 1; // 12: remote_tx_queue_id + compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type + compile_args[14] = BRISC_L1_RESULT_BASE; // 14: test_results_addr + compile_args[15] = 1024; // 15: test_results_size + compile_args[16] = 0; // 16: timeout_cycles + compile_args[17] = 0x0; // 17: output_depacketize + compile_args[18] = 0x0; // 18: output_depacketize info + + compile_args[19] = packet_switch_4B_pack(0x1, + dispatch_d_settings.cb_log_page_size, + dispatch_d_settings.producer_semaphore_id, // upstream sem + mux_d_settings.consumer_semaphore_id); // local sem + uint32_t src_id = 0xC1 + mux_d_settings.tunnel_stop - 1; + uint32_t dest_id = 0xD1 + mux_d_settings.tunnel_stop - 1; + compile_args[23] = packet_switch_4B_pack(src_id, src_id, src_id, src_id); // 23: packetized input src id + compile_args[24] = packet_switch_4B_pack(dest_id, dest_id, dest_id, dest_id); // 24: packetized input dest id + break; + } + } + } +} + +void Device::setup_tunnel_for_remote_devices() { + chip_id_t mmio_device_id = this->id_; + uint32_t num_tunnels = tt::Cluster::instance().get_mmio_device_tunnel_count(mmio_device_id); + if (num_tunnels == 0) { + //no remote device connected to this mmio device. + return; + } + + + tunnels_from_mmio_ = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id); + uint32_t index = 0; + for (auto tunnel : tunnels_from_mmio_) { + for (auto remote_dev : tunnel) { + log_info(tt::LogMetal, "MMIO Device {} : Tunnel {} : Device {}", mmio_device_id, index, remote_dev); + } + index++; + } + + std::map<uint32_t, std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>>> tunnel_dispatch_core_allocations = {}; + + uint32_t tunnel_id = 0; + for (auto &tunnel: tunnels_from_mmio_) { + std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>> tunnel_core_allocations = {}; + tunnel_core_allocations.resize(tt::tt_metal::DispatchWorkerType::COUNT); + + for (uint32_t tunnel_stop = 1; tunnel_stop < tunnel.size(); tunnel_stop++) { + //uint32_t tunnel_stop = tt::Cluster::instance().get_device_tunnel_depth(device_id); + chip_id_t device_id = tunnel[tunnel_stop]; + // a remote device. + // tunnel_stop hops away. 
uint8_t num_hw_cqs = 1; + uint32_t cq_id = 0; + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id); + + worker_build_settings_t settings = {}; + //allocations below are on mmio chip. + settings.tunnel_stop = 0; + uint32_t cq_size = this->sysmem_manager().get_cq_size(); + settings.command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); + settings.issue_queue_start_addr = settings.command_queue_start_addr + CQ_START; + settings.issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id); + settings.completion_queue_start_addr = settings.issue_queue_start_addr + settings.issue_queue_size; + settings.completion_queue_size = this->sysmem_manager_->get_completion_queue_size(cq_id); + settings.dispatch_core_type = dispatch_core_type; + + tt_cxy_pair prefetch_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(prefetch_location.chip, get_physical_core_coordinate(prefetch_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; + //prefetch needs three semaphores. + settings.semaphores.push_back(0); + settings.semaphores.push_back(dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages()); + settings.semaphores.push_back(0); + settings.producer_semaphore_id = 1; + tunnel_core_allocations[PREFETCH].push_back(std::make_tuple(prefetch_location, settings)); + + settings.semaphores.clear(); + tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(dispatch_location.chip, get_physical_core_coordinate(dispatch_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; + //dispatch needs one semaphore. + settings.semaphores.push_back(0); + settings.producer_semaphore_id = 0; + settings.consumer_semaphore_id = 0; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tunnel_core_allocations[DISPATCH].push_back(std::make_tuple(dispatch_location, settings)); + log_debug(LogMetal, "Device {} Channel {} : Dispatch: Issue Q Start Addr: {} - Completion Q Start Addr: {}", device_id, channel, settings.issue_queue_start_addr, settings.completion_queue_start_addr); + + if (tunnel_stop == 1) { + //need to allocate mux/demux on mmio chip only once. + //all tunnel stops share the same mux/demux on mmio chip. + settings.semaphores.clear(); + //mux/demux need a semaphore per remote device in the tunnel. + //Tunnel includes the mmio device as well, so tunnel.size() - 1 is the number of remote devices. 
settings.semaphores.resize(tunnel.size()-1); + settings.producer_semaphore_id = 0; + settings.consumer_semaphore_id = 0; + tt_cxy_pair mux_location = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); + + tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); + + tt_cxy_pair demux_location = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); + } + + settings.tunnel_stop = tunnel_stop - 1; + settings.semaphores.clear(); + chip_id_t us_device = tunnel[tunnel_stop - 1]; + tt_cxy_pair us_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(us_device, device_id, channel, cq_id); + tt_cxy_pair local_location = dispatch_core_manager::get(num_hw_cqs).us_tunneler_core_local(device_id, channel, cq_id); + + settings.worker_physical_core = tt_cxy_pair(us_location.chip, get_physical_core_coordinate(us_location, CoreType::ETH)); + settings.eth_partner_physical_core = tt_cxy_pair(local_location.chip, get_physical_core_coordinate(local_location, CoreType::ETH)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp"; + settings.cb_start_address = 0x19000; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[US_TUNNELER_REMOTE].push_back(std::make_tuple(us_location, settings)); + + //all allocations below this are on a remote chip. + settings.tunnel_stop = tunnel_stop; + + //swap the two ethernet link pair cores for downstream chip on the link pair. 
+ tt_cxy_pair temp = settings.worker_physical_core; + settings.worker_physical_core = settings.eth_partner_physical_core; + settings.eth_partner_physical_core = temp; + settings.kernel_file = "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp"; + tunnel_core_allocations[US_TUNNELER_LOCAL].push_back(std::make_tuple(local_location, settings)); + + TT_ASSERT(us_location.chip == us_device, + "Upstream Tunneler is on device {} but it is expected to be on device {}", us_location.chip, us_device); + TT_ASSERT(local_location.chip == device_id, + "Upstream Local Tunneler is on device {} but it is expected to be on device {}", local_location.chip, device_id); + + dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); + settings.dispatch_core_type = dispatch_core_type; + + tt_cxy_pair mux_d_location = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(mux_d_location.chip, get_physical_core_coordinate(mux_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; + settings.semaphores.push_back(0); + settings.consumer_semaphore_id = 0; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tunnel_core_allocations[MUX_D].push_back(std::make_tuple(mux_d_location, settings)); + + tt_cxy_pair demux_d_location = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; + settings.producer_semaphore_id = 0; + settings.cb_start_address = L1_UNRESERVED_BASE; + settings.cb_size_bytes = 0x10000; + tunnel_core_allocations[DEMUX_D].push_back(std::make_tuple(demux_d_location, settings)); + + settings.semaphores.clear(); + uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.semaphores.push_back(0);// prefetch_d_sync_sem + settings.semaphores.push_back(0);// prefetch_d_upstream_cb_sem + settings.semaphores.push_back(dispatch_buffer_pages);// prefetch_d_downstream_cb_sem + settings.consumer_semaphore_id = 1; + settings.producer_semaphore_id = 2; + + tt_cxy_pair prefetch_d_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(prefetch_d_location.chip, get_physical_core_coordinate(prefetch_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); + settings.cb_pages = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(); + settings.cb_log_page_size = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + tunnel_core_allocations[PREFETCH_D].push_back(std::make_tuple(prefetch_d_location, settings)); + + settings.semaphores.clear(); + settings.semaphores.push_back(0);// dispatch_sem + settings.semaphores.push_back(dispatch_buffer_pages);// dispatch_downstream_cb_sem + 
settings.consumer_semaphore_id = 0; + settings.producer_semaphore_id = 1; + settings.cb_start_address = dispatch_constants::DISPATCH_BUFFER_BASE; + settings.cb_log_page_size = dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE; + settings.cb_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); + settings.cb_size_bytes = (1 << settings.cb_log_page_size) * settings.cb_pages; + tt_cxy_pair dispatch_d_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); + settings.worker_physical_core = tt_cxy_pair(dispatch_d_location.chip, get_physical_core_coordinate(dispatch_d_location, dispatch_core_type)); + settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; + tunnel_core_allocations[DISPATCH_D].push_back(std::make_tuple(dispatch_d_location, settings)); + } + tunnel_dispatch_core_allocations.insert(std::make_pair(tunnel_id, tunnel_core_allocations)); + tunnel_id++; + } + + //separate out all the dispatch workers on the tunnel into individual devices. + for (const auto& pair : tunnel_dispatch_core_allocations) { + std::map<chip_id_t, std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>>> device_dispatch_workers = {}; + for (uint32_t i = 0; i < pair.second.size(); i++) { + if (pair.second[i].size()) { + //some workers are allocated. + auto tunnel_workers = pair.second[i]; + for (auto &[worker, settings] : tunnel_workers) { + if (device_dispatch_workers.find(worker.chip) == device_dispatch_workers.end()) { + std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>> temp = {}; + temp.resize(tt::tt_metal::DispatchWorkerType::COUNT); + temp[i].push_back(std::make_tuple(worker, settings)); + device_dispatch_workers.insert(std::make_pair(worker.chip, temp)); + } else { + device_dispatch_workers[worker.chip][i].push_back(std::make_tuple(worker, settings)); + } + } + } + } + tunnel_device_dispatch_workers_.insert(std::make_pair(pair.first, device_dispatch_workers)); + } + + log_debug(LogMetal, "{} tunnels found.", tunnel_device_dispatch_workers_.size()); + + for (const auto& tunnel : tunnel_device_dispatch_workers_) { + for (const auto& pair : tunnel.second) { + for (uint32_t i = 0; i < pair.second.size(); i++) { + for (auto [core, settings] : pair.second[i]) { + log_debug(LogMetal, "Tunnel {} Device {} has {} on core {}.", tunnel.first, pair.first, magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)i), core.str()); + } + } + } + } + + for (uint32_t t = 0; t < tunnels_from_mmio_.size(); t++) { + auto tunnel = tunnels_from_mmio_[t]; + TT_ASSERT(tunnel_device_dispatch_workers_.find(t) != tunnel_device_dispatch_workers_.end(), + "Tunnel {} not found on MMIO Device {}", t, mmio_device_id); + auto &tunnel_devices = tunnel_device_dispatch_workers_[t]; + for (uint32_t tunnel_stop = 0; tunnel_stop < tunnel.size(); tunnel_stop++) { + //last iteration is used to loop in tunnel workers that run on mmio device. + auto tunnel_device = tunnel[tunnel_stop]; + TT_ASSERT(tunnel_devices.find(tunnel_device) != tunnel_devices.end(), + "Device {} not found in Tunnel {} on MMIO Device {}", tunnel_device, t, mmio_device_id); + auto &device_worker_variants = tunnel_devices[tunnel_device]; + update_workers_build_settings(device_worker_variants); + + for (uint32_t dwv = 0; dwv < device_worker_variants.size(); dwv++) + { + if (device_worker_variants[dwv].size()) { + for (auto &[core, settings] : device_worker_variants[dwv]) { + log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. 
Core {} - Physical {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_physical_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + for (uint32_t arg = 0; arg < settings.compile_args.size(); arg++) { + log_debug(LogMetal, "CompileArgs[{}] = {}", arg, settings.compile_args[arg]); + } + + } + } + } + } + } +} + void Device::compile_command_queue_programs() { ZoneScoped; unique_ptr<Program> command_queue_program_ptr(new Program); @@ -508,388 +1240,143 @@ void Device::compile_command_queue_programs() { } detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); + this->setup_tunnel_for_remote_devices(); } else { - /////////////////Following section is for mmio device serving Remote Device - uint8_t num_hw_cqs = 1; - uint32_t cq_id = 0; chip_id_t device_id = this->id(); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - uint32_t cq_size = mmio_device->sysmem_manager().get_cq_size(); - - - CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id); - tt_cxy_pair prefetch_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id); - tt_cxy_pair dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id); - bool dispatch_on_eth = dispatch_core_type == CoreType::ETH; - - TT_ASSERT(prefetch_core.chip == mmio_device_id and dispatch_core.chip == mmio_device_id, - "Prefetcher is on device {} and Dispatcher is on device {} but they are expected to be on device {}", prefetch_core.chip, dispatch_core.chip, mmio_device_id); - - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); - - log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch H logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch H logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); - - uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); - uint32_t issue_queue_start_addr = command_queue_start_addr + CQ_START; - uint32_t issue_queue_size = mmio_device->sysmem_manager_->get_issue_queue_size(cq_id); - uint32_t completion_queue_start_addr = issue_queue_start_addr + issue_queue_size; - uint32_t completion_queue_size = mmio_device->sysmem_manager_->get_completion_queue_size(cq_id); - - tt_cxy_pair mux_core = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id); - tt_cxy_pair demux_core = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id); - tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(device_id, channel, cq_id); - CoreCoord tunneler_core = CoreCoord(tunneler_location.x, tunneler_location.y); - TT_ASSERT(tunneler_location.chip == mmio_device_id, - "Tunneler is on device {} but it is expected to be on device {}", tunneler_location.chip, mmio_device_id); - CoreCoord r_tunneler_core = 
std::get<1>(tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(tunneler_location.chip, tunneler_core))); - CoreCoord r_tunneler_physical_core = this->ethernet_core_from_logical_core(r_tunneler_core); - - CoreCoord tunneler_physical_core = mmio_device->ethernet_core_from_logical_core(tunneler_location); - CoreCoord mux_physical_core = get_physical_core_coordinate(mux_core, dispatch_core_type); - CoreCoord demux_physical_core = get_physical_core_coordinate(demux_core, dispatch_core_type); - - uint32_t tunneler_queue_start_addr = 0x19000; - uint32_t tunneler_queue_size_bytes = 0x10000; - uint32_t tunneler_test_results_addr = 0x39000; - uint32_t tunneler_test_results_size = 0x7000; - constexpr uint32_t packetized_path_test_results_addr = BRISC_L1_RESULT_BASE; - constexpr uint32_t packetized_path_test_results_size = 1024; - - constexpr uint32_t src_endpoint_start_id = 0xaa; - constexpr uint32_t dest_endpoint_start_id = 0xbb; - - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_sync_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), dispatch_core_type); // prefetch_sem - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, 0, dispatch_core_type); // prefetch_h_exec_buf_sem - - tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, 0, dispatch_core_type); // mux_sem - - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, 0, dispatch_core_type); //demux_sem - - constexpr uint32_t dispatch_h_cb_sem = 0; // remove it. - constexpr uint32_t dispatch_sem = 0; - tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem - - std::vector<uint32_t> prefetch_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), - prefetch_sem, - mux_sem, - issue_queue_start_addr, - issue_queue_size, - dispatch_constants::PREFETCH_Q_BASE, - dispatch_constants::get(dispatch_core_type).prefetch_q_size(), - CQ_PREFETCH_Q_RD_PTR, - dispatch_constants::get(dispatch_core_type).cmddat_q_base(), - dispatch_constants::get(dispatch_core_type).cmddat_q_size(), - dispatch_constants::get(dispatch_core_type).scratch_db_base(), // unused for prefetch_h - dispatch_constants::get(dispatch_core_type).scratch_db_size(), // unused for prefetch_h - prefetch_sync_sem, // unused for prefetch_h - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - 0, // prefetch_d only - 0, //prefetch_downstream_cb_sem, // prefetch_d only - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - prefetch_h_exec_buf_sem, - false, // is_dram_variant - true // is_host_variant - }; - - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", - prefetch_compile_args, - prefetch_core, - prefetch_physical_core, - dispatch_core_type, - CoreCoord{0, 0}, - mux_physical_core, - std::map<string, string> {} - ); - - log_debug(LogDevice, "run prefetch_h {}", prefetch_core.str()); - - std::vector<uint32_t> mux_compile_args = - { - 0, // 0: reserved - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 1: rx_queue_start_addr_words - (dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4), // 2: 
rx_queue_size_words - 1, // 3: mux_fan_in - packet_switch_4B_pack((uint32_t)prefetch_physical_core.x, - (uint32_t)prefetch_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - (tunneler_queue_start_addr >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)tunneler_physical_core.x, // 10: remote_tx_x - (uint32_t)tunneler_physical_core.y, // 11: remote_tx_y - 0, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - packetized_path_test_results_addr, // 14: test_results_addr - packetized_path_test_results_size, // 15: test_results_size - 0, // 16: timeout_cycles - 0x0,// 17: output_depacketize - 0x0,// 18: output_depacketize info - // 19: input 0 packetize info: - packet_switch_4B_pack(0x1, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - prefetch_sem, // upstream sem - mux_sem), // local sem - packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info - packet_switch_4B_pack(src_endpoint_start_id, 0, 0, 0), // 23: packetized input src id - packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id - }; - - log_debug(LogDevice, "run mux at {}", mux_core.str()); - - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_compile_args, - mux_core, - CoreCoord{0, 0}, - dispatch_core_type, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}} - ); - - std::vector tunneler_l_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. 
- (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack(r_tunneler_physical_core.x, - r_tunneler_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::ETH), // 4: remote_receiver_0_info - packet_switch_4B_pack(demux_physical_core.x, - demux_physical_core.y, - 1,//num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_receiver_1_info - tunneler_queue_start_addr >> 4, // 6: remote_receiver_queue_start_addr_words 0 - tunneler_queue_size_bytes >> 4, // 7: remote_receiver_queue_size_words 0 - (L1_UNRESERVED_BASE >> 4), // 8: remote_receiver_queue_start_addr_words 1 - (0x10000 >> 4), // 9: remote_receiver_queue_size_words 1 - packet_switch_4B_pack(mux_physical_core.x, - mux_physical_core.y, - 1, // mux output queue id - (uint32_t)DispatchRemoteNetworkType::NOC0), // 10: remote_sender_0_info - packet_switch_4B_pack(r_tunneler_physical_core.x, - r_tunneler_physical_core.y, - 3, // r tunneler output queue id - (uint32_t)DispatchRemoteNetworkType::ETH), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - 0, // 14: timeout_cycles - }; - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_l_compile_args, - tunneler_core, - CoreCoord{0, 0}, - CoreType::ETH, - CoreCoord{0, 0}, - CoreCoord{0, 0}, - std::map {{"SKIP_NOC_LOGGING", "1"}}, - true - ); + auto &tunnel_device_dispatch_workers = mmio_device->tunnel_device_dispatch_workers_; + auto &tunnels_from_mmio = mmio_device->tunnels_from_mmio_; - log_debug(LogDevice, "run tunneler at {}", tunneler_location.str()); - - uint32_t dest_map_array[4] = {0, 1, 2, 3}; - uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); - std::vector demux_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words - (0x10000 >> 4), // 2: rx_queue_size_words - 1, // 3: demux_fan_out - packet_switch_4B_pack(dispatch_physical_core.x, - dispatch_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 8: remote_tx_queue_start_addr_words 0 - ((1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE)*dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages()) >> 4, // 9: remote_tx_queue_size_words 0 - 0, // 10: remote_tx_queue_start_addr_words 1 - 0, // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - (uint32_t)tunneler_physical_core.x, // 16: remote_rx_x - (uint32_t)tunneler_physical_core.y, // 17: remote_rx_y - 3, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - packetized_path_test_results_addr, // 22: 
test_results_addr
-            packetized_path_test_results_size, // 23: test_results_size
-            0, // 24: timeout_cycles
-            0x1, // 25: output_depacketize_mask
-            // 26: output 0 packetize info:
-            packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE,
-                                  dispatch_sem, // downstream sem
-                                  demux_sem, // local sem
-                                  1), // remove header
-            packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info
-            packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info
-            packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info
-        };
+        std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>> device_worker_variants;
+        std::vector<std::vector<std::tuple<tt_cxy_pair, worker_build_settings_t>>> mmio_device_worker_variants;
+        uint32_t tunnel_id = 0;
+        for (auto tunnel : tunnel_device_dispatch_workers) {
+            TT_ASSERT(tunnel.second.find(mmio_device_id) != tunnel.second.end(), "MMIO Device {} not found in tunnel map.", mmio_device_id);
+            if (tunnel.second.find(device_id) != tunnel.second.end()) {
+                tunnel_id = tunnel.first;
+                device_worker_variants = tunnel.second[device_id];
+                mmio_device_worker_variants = tunnel.second[mmio_device_id];
+                break;
+            }
+        }
+        TT_ASSERT(device_worker_variants.size() != 0, "No worker variants found for Device {}.", device_id);
+
+        //Determine if this is the first tunnel stop.
+        //FD2 kernels running on the mmio device are launched with the first tunnel stop.
+        bool first_tunnel_stop = true;
+        auto tunnel = tunnels_from_mmio[tunnel_id];
+        for (uint32_t ts = 1; ts < tunnel.size(); ts++) {
+            if (tunnel[ts] == device_id) {
+                first_tunnel_stop = ts == 1;
+                break;
+            }
+            TT_ASSERT(ts < (tunnel.size() - 1), "Device {} tunnel stop cannot be determined on tunnel {}.", device_id, tunnel_id);
+        }
-        configure_kernel_variant(
-            *mmio_command_queue_program_ptr,
-            "tt_metal/impl/dispatch/kernels/packet_demux.cpp",
-            demux_compile_args,
-            demux_core,
-            CoreCoord{0, 0},
-            dispatch_core_type,
-            CoreCoord{0, 0},
-            CoreCoord{0, 0},
-            std::map<string, string> {{"SKIP_NOC_LOGGING", "1"}}
-        );
+        if (first_tunnel_stop) {
+            /////////////////Following section is for mmio device serving Remote Device
+            for (auto [prefetch_core, prefetch_settings] : mmio_device_worker_variants[PREFETCH]) {
+                //auto [prefetch_core, prefetch_settings] = mmio_device_worker_variants[PREFETCH][0];
+                for (auto sem : prefetch_settings.semaphores) {
+                    //size of semaphores vector is number of needed semaphores on the core.
+                    //Value of each vector entry is the initialization value for the semaphore.
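+                    //e.g. semaphores = {0, pages, 0} presumably creates semaphore ids 0, 1 and 2 on
+                    //this core, initialized to 0, pages and 0 respectively (ids follow creation order).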
+ tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, prefetch_core, sem, prefetch_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + prefetch_settings.kernel_file, + prefetch_settings.compile_args, + prefetch_core, + prefetch_settings.worker_physical_core, + prefetch_settings.dispatch_core_type, + prefetch_settings.upstream_cores[0], + prefetch_settings.downstream_cores[0], + std::map {} + ); + } - log_debug(LogDevice, "run dispatch demux at {}", demux_core.str()); - - std::vector dispatch_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - dispatch_sem, - demux_sem, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, - prefetch_sync_sem, - command_queue_start_addr, - completion_queue_start_addr, - completion_queue_size, - dispatch_constants::DISPATCH_BUFFER_BASE, - (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), - 0, // unused: local ds semaphore - 0, // unused: remote ds semaphore - 0, // preamble size. unused unless tunneler is between h and d - false, // is_dram_variant - true // is_host_variant - }; + auto [mux_core, mux_settings] = mmio_device_worker_variants[MUX][0]; + for (auto sem : mux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, mux_core, sem, mux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + mux_settings.kernel_file, + mux_settings.compile_args, + mux_core, + CoreCoord{0, 0}, + mux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} + ); - configure_kernel_variant( - *mmio_command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_compile_args, - dispatch_core, - dispatch_physical_core, - dispatch_core_type, - demux_physical_core, - CoreCoord{0xffffffff, 0xffffffff}, - std::map {} - ); + auto [tunneler_core, tunneler_settings] = mmio_device_worker_variants[US_TUNNELER_REMOTE][0]; + configure_kernel_variant( + *mmio_command_queue_program_ptr, + tunneler_settings.kernel_file, + tunneler_settings.compile_args, + tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + true + ); - log_debug(LogDevice, "run dispatch_h at {}", dispatch_core.str()); + auto [demux_core, demux_settings] = mmio_device_worker_variants[DEMUX][0]; + for (auto sem : demux_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, demux_core, sem, demux_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + demux_settings.kernel_file, + demux_settings.compile_args, + demux_core, + CoreCoord{0, 0}, + demux_settings.dispatch_core_type, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}} + ); + for (auto [dispatch_core, dispatch_settings] : mmio_device_worker_variants[DISPATCH]) { + //auto [dispatch_core, dispatch_settings] = mmio_device_worker_variants[DISPATCH][0]; + for (auto sem : dispatch_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*mmio_command_queue_program_ptr, dispatch_core, sem, dispatch_settings.dispatch_core_type); + } + configure_kernel_variant( + *mmio_command_queue_program_ptr, + dispatch_settings.kernel_file, + dispatch_settings.compile_args, + dispatch_core, + dispatch_settings.worker_physical_core, + dispatch_settings.dispatch_core_type, + dispatch_settings.upstream_cores[0], + CoreCoord{0xffffffff, 0xffffffff}, + std::map {} + ); + } + } /////////////////Following section is for Remote Device - dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); - dispatch_on_eth = dispatch_core_type == CoreType::ETH; - - uint32_t dispatch_buffer_pages = dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(); - uint32_t mux_queue_start_addr = dispatch_constants::DISPATCH_BUFFER_BASE; - uint32_t mux_queue_size_bytes = (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE)*dispatch_buffer_pages; - // Packetized path buffer, can be at any available address. 
- constexpr uint32_t demux_queue_start_addr = L1_UNRESERVED_BASE; - constexpr uint32_t demux_queue_size_bytes = 0x10000; - - tt_cxy_pair mux_d_core = dispatch_core_manager::get(num_hw_cqs).mux_d_core(device_id, channel, cq_id); - CoreCoord mux_d_physical_core = get_physical_core_coordinate(mux_d_core, dispatch_core_type); - tt_cxy_pair demux_d_core = dispatch_core_manager::get(num_hw_cqs).demux_d_core(device_id, channel, cq_id); - CoreCoord demux_d_physical_core = get_physical_core_coordinate(demux_d_core, dispatch_core_type); - - tt_cxy_pair prefetch_d_core = dispatch_core_manager::get(num_hw_cqs).prefetcher_d_core(device_id, channel, cq_id); - CoreCoord prefetch_d_physical_core = get_physical_core_coordinate(prefetch_d_core, dispatch_core_type); - - dispatch_core = dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(device_id, channel, cq_id); - dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_sync_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, 0, dispatch_core_type); // prefetch_d_upstream_cb_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, dispatch_buffer_pages, dispatch_core_type); // prefetch_d_downstream_cb_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, {demux_d_core}, 0, dispatch_core_type); // demux_sem - - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, 0, dispatch_core_type); // dispatch_sem - tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_core, dispatch_buffer_pages, dispatch_core_type); // dispatch_downstream_cb_sem - - tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_core, 0, dispatch_core_type); // mux_sem - - std::vector tunneler_r_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - 2, // tunnel_lanes. 1 => Unidirectional. 2 => Bidirectional. - (tunneler_queue_start_addr >> 4), // 2: rx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 3: rx_queue_size_words - packet_switch_4B_pack(demux_d_physical_core.x, - demux_d_physical_core.y, - 1, //num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_receiver_0_info - packet_switch_4B_pack(tunneler_physical_core.x, - tunneler_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::ETH), // 5: remote_receiver_1_info - (L1_UNRESERVED_BASE >> 4), // 6: remote_receiver_queue_start_addr_words 0 - (0x10000 >> 4), // 7: remote_receiver_queue_size_words 0 - (tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4, // 8: remote_receiver_queue_start_addr_words 1 - tunneler_queue_size_bytes >> 4, // 9: remote_receiver_queue_size_words 1 - packet_switch_4B_pack(tunneler_physical_core.x, - tunneler_physical_core.y, - 2, - (uint32_t)DispatchRemoteNetworkType::ETH), // 10: remote_sender_0_info - packet_switch_4B_pack(mux_d_physical_core.x, - mux_d_physical_core.y, - 1, //num_dest_endpoints, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 11: remote_sender_1_info - tunneler_test_results_addr, // 12: test_results_addr - tunneler_test_results_size, // 13: test_results_size - 0, // 14: timeout_cycles - }; + //Upstream device tunneler. Goes towards MMIO Device. 
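+        //Note: for a remote device, US_TUNNELER_LOCAL is its tunneler endpoint facing the MMIO device,
+        //while US_TUNNELER_REMOTE (configured below when present) faces the next device down the tunnel.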
+ auto [us_tunneler_core, us_tunneler_settings] = device_worker_variants[US_TUNNELER_LOCAL][0]; configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/eth_tunneler.cpp", - tunneler_r_compile_args, - r_tunneler_core, + us_tunneler_settings.kernel_file, + us_tunneler_settings.compile_args, + us_tunneler_core, CoreCoord{0, 0}, CoreType::ETH, CoreCoord{0, 0}, @@ -898,217 +1385,101 @@ void Device::compile_command_queue_programs() { true ); - log_debug(LogDevice, "run tunneler at device {} Core {}", this->id(), r_tunneler_core.str()); - - std::vector demux_d_compile_args = - { - dest_endpoint_start_id, // 0: endpoint_id_start_index - (L1_UNRESERVED_BASE >> 4), // 1: rx_queue_start_addr_words - (0x10000 >> 4), // 2: rx_queue_size_words - 1, // 3: demux_fan_out - packet_switch_4B_pack(prefetch_d_physical_core.x, - prefetch_d_physical_core.y, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: remote_tx_0_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: remote_tx_1_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: remote_tx_2_info - packet_switch_4B_pack(0, - 0, - 0, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: remote_tx_3_info - (dispatch_constants::DISPATCH_BUFFER_BASE >> 4), // 8: remote_tx_queue_start_addr_words 0 - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() >> 4, // 9: remote_tx_queue_size_words 0 - 0, // 10: remote_tx_queue_start_addr_words 1 - 0, // 11: remote_tx_queue_size_words 1 - 0, // 12: remote_tx_queue_start_addr_words 2 - 0, // 13: remote_tx_queue_size_words 2 - 0, // 14: remote_tx_queue_start_addr_words 3 - 0, // 15: remote_tx_queue_size_words 3 - (uint32_t)r_tunneler_physical_core.x, // 16: remote_rx_x - (uint32_t)r_tunneler_physical_core.y, // 17: remote_rx_y - 2, // 18: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 19: tx_network_type - (uint32_t)(dest_endpoint_output_map >> 32), // 20: dest_endpoint_output_map_hi - (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF), // 21: dest_endpoint_output_map_lo - packetized_path_test_results_addr, // 22: test_results_addr - packetized_path_test_results_size, // 23: test_results_size - 0, // 24: timeout_cycles - 0x1, // 25: output_depacketize_mask - // 26: output 0 packetize info: - packet_switch_4B_pack(dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - prefetch_d_upstream_cb_sem, // downstream sem - demux_sem, // local sem - 0), - packet_switch_4B_pack(0, 0, 0, 0), // 27: output 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 28: output 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 29: output 3 packetize info - }; + //Downstream device tunneler. Goes towards tunnel end. + if (device_worker_variants[US_TUNNELER_REMOTE].size()) { + auto [ds_tunneler_core, ds_tunneler_settings] = device_worker_variants[US_TUNNELER_REMOTE][0]; + configure_kernel_variant( + *command_queue_program_ptr, + ds_tunneler_settings.kernel_file, + ds_tunneler_settings.compile_args, + ds_tunneler_core, + CoreCoord{0, 0}, + CoreType::ETH, + CoreCoord{0, 0}, + CoreCoord{0, 0}, + std::map {{"SKIP_NOC_LOGGING", "1"}}, + true + ); + } + auto [demux_d_core, demux_d_settings] = device_worker_variants[DEMUX_D][0]; + for (auto sem : demux_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, demux_d_core, sem, demux_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_demux.cpp", - demux_d_compile_args, + demux_d_settings.kernel_file, + demux_d_settings.compile_args, demux_d_core, CoreCoord{0, 0}, - dispatch_core_type, + demux_d_settings.dispatch_core_type, CoreCoord{0, 0}, CoreCoord{0, 0}, std::map {{"SKIP_NOC_LOGGING", "1"}} ); - log_debug(LogDevice, "run demux at {}", demux_d_core.str()); - - // prefetch_d - uint32_t scratch_db_base = (dispatch_constants::DISPATCH_BUFFER_BASE + dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size() - + PCIE_ALIGNMENT - 1) & (~(PCIE_ALIGNMENT - 1)); - uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); - const uint32_t l1_size = dispatch_core_type == CoreType::WORKER ? MEM_L1_SIZE : MEM_ETH_SIZE; - - TT_ASSERT(scratch_db_base + scratch_db_size <= l1_size); - - std::vector prefetch_d_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, // overridden below for prefetch_h - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, // overridden below for prefetch_h - dispatch_buffer_pages, // overridden below for prefetch_h - prefetch_d_downstream_cb_sem, // overridden below for prefetch_d - dispatch_sem, // overridden below for prefetch_h - 0, //issue_queue_start_addr, - 0, //issue_queue_size, - 0, //prefetch_q_base, - dispatch_constants::get(dispatch_core_type).prefetch_q_size(), - CQ_PREFETCH_Q_RD_PTR, - dispatch_constants::DISPATCH_BUFFER_BASE, // overridden for split below - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(), // overridden for split below - scratch_db_base, // scratch_db_base filled in below if used - scratch_db_size, - prefetch_d_sync_sem, - dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_pages(), // prefetch_d only - prefetch_d_upstream_cb_sem, // prefetch_d only my upstream - demux_sem, // prefetch_d only upstream - dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, - dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - prefetch_h_exec_buf_sem, - true, - false - }; - + auto [prefetch_d_core, prefetch_d_settings] = device_worker_variants[PREFETCH_D][0]; + for (auto sem : prefetch_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. 
+ tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_d_core, sem, prefetch_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", - prefetch_d_compile_args, + prefetch_d_settings.kernel_file, + prefetch_d_settings.compile_args, prefetch_d_core, - prefetch_d_physical_core, - dispatch_core_type, - demux_d_physical_core, - dispatch_physical_core, + prefetch_d_settings.worker_physical_core, + prefetch_d_settings.dispatch_core_type, + prefetch_d_settings.upstream_cores[0], + prefetch_d_settings.downstream_cores[0], std::map {} ); - log_debug(LogDevice, "run prefertch_d at {}", prefetch_d_core.str()); - - std::vector dispatch_d_compile_args = { - dispatch_constants::DISPATCH_BUFFER_BASE, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_buffer_pages, - dispatch_sem, - prefetch_d_downstream_cb_sem, - dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS, - prefetch_d_sync_sem, - 128, - 128 + 256 * 1024 * 1024, - 256 * 1024 * 1024, - dispatch_constants::DISPATCH_BUFFER_BASE, - (1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_buffer_pages, - dispatch_downstream_cb_sem, // unused on hd, filled in below for h and d - mux_sem, // unused on hd, filled in below for h and d - sizeof(dispatch_packet_header_t), // unused unless tunneler is between h and d - true, // is_dram_variant - false // is_host_variant - }; - + auto [dispatch_d_core, dispatch_d_settings] = device_worker_variants[DISPATCH_D][0]; + for (auto sem : dispatch_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, dispatch_d_core, sem, dispatch_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_d_compile_args, - dispatch_core, - dispatch_physical_core, - dispatch_core_type, - prefetch_d_physical_core, - mux_d_physical_core, + dispatch_d_settings.kernel_file, + dispatch_d_settings.compile_args, + dispatch_d_core, + dispatch_d_settings.worker_physical_core, + dispatch_d_settings.dispatch_core_type, + dispatch_d_settings.upstream_cores[0], + dispatch_d_settings.downstream_cores[0], std::map {} ); - log_debug(LogDevice, "run dispatch at {}", dispatch_core.str()); - - std::vector mux_d_compile_args = - { - 0, // 0: reserved - (mux_queue_start_addr >> 4), // 1: rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 2: rx_queue_size_words - 1, // 3: mux_fan_in - packet_switch_4B_pack((uint32_t)dispatch_physical_core.x, - (uint32_t)dispatch_physical_core.y, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 4: src 0 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 5: src 1 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 6: src 2 info - packet_switch_4B_pack(0, - 0, - 1, - (uint32_t)DispatchRemoteNetworkType::NOC0), // 7: src 3 info - ((tunneler_queue_start_addr + tunneler_queue_size_bytes) >> 4), // 8: remote_tx_queue_start_addr_words - (tunneler_queue_size_bytes >> 4), // 9: remote_tx_queue_size_words - (uint32_t)r_tunneler_physical_core.x, // 10: remote_tx_x - (uint32_t)r_tunneler_physical_core.y, // 11: remote_tx_y - 1, // 12: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 13: tx_network_type - 
packetized_path_test_results_addr, // 14: test_results_addr - packetized_path_test_results_size, // 15: test_results_size - 0, // 16: timeout_cycles - 0x0,// 17: output_depacketize - 0x0,// 18: output_depacketize info - // 19: input 0 packetize info: - packet_switch_4B_pack(0x1, - dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE, - dispatch_downstream_cb_sem, // upstream sem - mux_sem), // local sem - packet_switch_4B_pack(0, 0, 0, 0), // 20: input 1 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 21: input 2 packetize info - packet_switch_4B_pack(0, 0, 0, 0), // 22: input 3 packetize info - packet_switch_4B_pack(src_endpoint_start_id, 0, 0, 0), // 23: packetized input src id - packet_switch_4B_pack(dest_endpoint_start_id, 0, 0, 0), // 24: packetized input dest id - }; - + auto [mux_d_core, mux_d_settings] = device_worker_variants[MUX_D][0]; + for (auto sem : mux_d_settings.semaphores) { + //size of semaphores vector is number of needed semaphores on the core. + //Value of each vector entry is the initialization value for the semaphore. + tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, mux_d_core, sem, mux_d_settings.dispatch_core_type); + } configure_kernel_variant( *command_queue_program_ptr, - "tt_metal/impl/dispatch/kernels/packet_mux.cpp", - mux_d_compile_args, + mux_d_settings.kernel_file, + mux_d_settings.compile_args, mux_d_core, CoreCoord{0, 0}, - dispatch_core_type, + mux_d_settings.dispatch_core_type, CoreCoord{0, 0}, CoreCoord{0, 0}, std::map {{"SKIP_NOC_LOGGING", "1"}} ); - log_debug(LogDevice, "run mux at {}", mux_d_core.str()); - detail::CompileProgram(this, *command_queue_program_ptr); this->command_queue_programs.push_back(std::move(command_queue_program_ptr)); - detail::CompileProgram(mmio_device, *mmio_command_queue_program_ptr); - this->command_queue_programs.push_back(std::move(mmio_command_queue_program_ptr)); + if (first_tunnel_stop) { + detail::CompileProgram(mmio_device, *mmio_command_queue_program_ptr); + this->command_queue_programs.push_back(std::move(mmio_command_queue_program_ptr)); + } } } @@ -1118,6 +1489,7 @@ void Device::configure_command_queue_programs() { chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + log_debug(tt::LogMetal, "Device {} - Channel {}", this->id_, channel); std::vector zero = {0x0}; // Reset state in case L1 Clear is disabled. std::vector pointers(CQ_START / sizeof(uint32_t), 0); @@ -1126,7 +1498,8 @@ void Device::configure_command_queue_programs() { if (this->is_mmio_capable()) { TT_ASSERT(this->command_queue_programs.size() == 1); } else { - TT_ASSERT(this->command_queue_programs.size() == 2); + uint32_t program_size = tt::Cluster::instance().get_device_tunnel_depth(device_id) == 1 ? 
2 : 1; + TT_ASSERT(this->command_queue_programs.size() == program_size); } Program& command_queue_program = *this->command_queue_programs[0]; @@ -1138,7 +1511,7 @@ void Device::configure_command_queue_programs() { pointers[HOST_CQ_ISSUE_READ_PTR / sizeof(uint32_t)] = (CQ_START + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4; pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = (CQ_START + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4; - tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), cq_id * cq_size, mmio_device_id, channel); + tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), get_absolute_cq_offset(channel, cq_id, cq_size), mmio_device_id, get_umd_channel(channel)); } uint8_t num_hw_cqs = device_id == mmio_device_id ? this->num_hw_cqs() : 1; @@ -1160,7 +1533,7 @@ void Device::configure_command_queue_programs() { detail::WriteToDeviceL1(mmio_device, prefetch_location, dispatch_constants::PREFETCH_Q_BASE, prefetch_q, dispatch_core_type); // Initialize completion queue write pointer and read pointer copy - uint32_t issue_queue_size = mmio_device->sysmem_manager_->get_issue_queue_size(cq_id); + uint32_t issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id); uint32_t completion_queue_start_addr = CQ_START + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size); uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4; vector completion_queue_wr_ptr = {completion_queue_start_addr_16B}; @@ -1182,9 +1555,12 @@ void Device::configure_command_queue_programs() { detail::ConfigureDeviceWithProgram(this, command_queue_program, true); tt::Cluster::instance().l1_barrier(this->id()); if (device_id != mmio_device_id) { - Program& mmio_command_queue_program = *this->command_queue_programs[1]; - detail::ConfigureDeviceWithProgram(mmio_device, mmio_command_queue_program, true); - tt::Cluster::instance().l1_barrier(mmio_device_id); + if (tt::Cluster::instance().get_device_tunnel_depth(device_id) == 1) { + //first or only remote device on the tunnel, launch fd2 kernels on mmio device for all remote devices. + Program& mmio_command_queue_program = *this->command_queue_programs[1]; + detail::ConfigureDeviceWithProgram(mmio_device, mmio_command_queue_program, true); + tt::Cluster::instance().l1_barrier(mmio_device_id); + } } } @@ -1203,7 +1579,8 @@ void Device::initialize_command_queue() { if (this->is_mmio_capable()) { TT_ASSERT(this->command_queue_programs.size() == 1); } else { - TT_ASSERT(this->command_queue_programs.size() == 2); + uint32_t program_size = tt::Cluster::instance().get_device_tunnel_depth(this->id()) == 1 ? 
2 : 1; + TT_ASSERT(this->command_queue_programs.size() == program_size); } this->configure_command_queue_programs(); Program& command_queue_program = *this->command_queue_programs[0]; @@ -1218,13 +1595,15 @@ void Device::initialize_command_queue() { } if (!this->is_mmio_capable()) { - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id()); - Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); - Program& mmio_command_queue_program = *this->command_queue_programs[1]; - for (const auto &[core_type, logical_dispatch_cores] : mmio_command_queue_program.logical_cores()) { - for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { - launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, core_type)->launch_msg; - tt::llrt::write_launch_msg_to_core(mmio_device_id, mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type), &msg); + if (tt::Cluster::instance().get_device_tunnel_depth(this->id()) == 1) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id()); + Device *mmio_device = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id); + Program& mmio_command_queue_program = *this->command_queue_programs[1]; + for (const auto &[core_type, logical_dispatch_cores] : mmio_command_queue_program.logical_cores()) { + for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { + launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, core_type)->launch_msg; + tt::llrt::write_launch_msg_to_core(mmio_device_id, mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type), &msg); + } } } } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 7b054f03068..9169041050b 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -158,6 +158,10 @@ class Device { return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_; } + void setup_tunnel_for_remote_devices(); + + void update_workers_build_settings(std::vector>> &device_worker_variants); + uint32_t num_banks(const BufferType &buffer_type) const; uint32_t bank_size(const BufferType &buffer_type) const; @@ -259,6 +263,8 @@ class Device { uint32_t build_key_; std::unique_ptr allocator_ = nullptr; bool initialized_ = false; + std::map>>>> tunnel_device_dispatch_workers_; + std::vector> tunnels_from_mmio_; JitBuildEnv build_env_; JitBuildStateSet firmware_build_states_; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 5df863d7b3b..77f79b6af2a 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1220,7 +1220,10 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id) : chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); this->size_B = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs(); - + if (tt::Cluster::instance().is_galaxy_cluster()) { + //Galaxy puts 4 devices per host channel until umd can provide one channel per device. 
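+        //With a 1 GB hugepage and a single HW CQ this leaves 256 MB of sysmem per device,
+        //matching MAX_DEV_CHANNEL_SIZE in command_queue_interface.hpp.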
+ this->size_B = this->size_B / 4; + } tt_cxy_pair completion_q_writer_location = dispatch_core_manager::get(device->num_hw_cqs()).completion_queue_writer_core(device->id(), channel, this->id); diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 574c2db6a40..872036ba7ab 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -17,7 +17,10 @@ using namespace tt::tt_metal; // todo consider moving these to dispatch_addr_map static constexpr uint32_t PCIE_ALIGNMENT = 32; -static constexpr uint32_t MAX_HUGEPAGE_SIZE = 1 << 30; // 1GB; +static constexpr uint32_t MAX_HUGEPAGE_SIZE = 1 << 30; // 1GB; +static constexpr uint32_t MAX_DEV_CHANNEL_SIZE = 1 << 28; // 256 MB; +static constexpr uint32_t DEVICES_PER_UMD_CHANNEL = MAX_HUGEPAGE_SIZE / MAX_DEV_CHANNEL_SIZE; // 256 MB; + static constexpr uint32_t MEMCPY_ALIGNMENT = sizeof(__m128i); @@ -85,7 +88,7 @@ struct dispatch_constants { // make this 2^N as required by the packetized stages uint32_t dispatch_buffer_block_size; if (core_type == CoreType::WORKER) { - prefetch_q_entries_ = 2048; + prefetch_q_entries_ = 1534; max_prefetch_command_size_ = 128 * 1024; cmddat_q_size_ = 256 * 1024; scratch_db_size_ = 128 * 1024; @@ -135,13 +138,17 @@ struct dispatch_constants { /// @return uint32_t relative offset inline uint32_t get_relative_cq_offset(uint8_t cq_id, uint32_t cq_size) { return cq_id * cq_size; } +inline uint16_t get_umd_channel(uint16_t channel) { + return channel & 0x3; +} + /// @brief Get absolute offset of the command queue /// @param channel uint16_t channel ID (hugepage) /// @param cq_id uint8_t ID the command queue /// @param cq_size uint32_t size of the command queue /// @return uint32_t absolute offset inline uint32_t get_absolute_cq_offset(uint16_t channel, uint8_t cq_id, uint32_t cq_size) { - return (MAX_HUGEPAGE_SIZE * channel) + get_relative_cq_offset(cq_id, cq_size); + return (MAX_HUGEPAGE_SIZE * get_umd_channel(channel)) + ((channel >> 2) * MAX_DEV_CHANNEL_SIZE) + get_relative_cq_offset(cq_id, cq_size); } template @@ -149,10 +156,11 @@ inline uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t c uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * MAX_DEV_CHANNEL_SIZE; tt::Cluster::instance().read_sysmem( &recv, sizeof(uint32_t), - HOST_CQ_ISSUE_READ_PTR + get_relative_cq_offset(cq_id, cq_size), + HOST_CQ_ISSUE_READ_PTR + channel_offset + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); if (not addr_16B) { @@ -166,10 +174,11 @@ inline uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint3 uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * MAX_DEV_CHANNEL_SIZE; tt::Cluster::instance().read_sysmem( &recv, sizeof(uint32_t), - HOST_CQ_COMPLETION_WRITE_PTR + get_relative_cq_offset(cq_id, cq_size), + HOST_CQ_COMPLETION_WRITE_PTR + channel_offset + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); if (not addr_16B) { @@ -330,6 +339,7 @@ class SystemMemoryManager { chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); uint16_t channel = 
tt::Cluster::instance().get_assigned_channel_for_device(device_id); char *hugepage_start = (char *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); + hugepage_start += (channel >> 2) * MAX_DEV_CHANNEL_SIZE; this->cq_sysmem_start = hugepage_start; // TODO(abhullar): Remove env var and expose sizing at the API level @@ -339,8 +349,12 @@ class SystemMemoryManager { this->cq_size = cq_size_override; } else { this->cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / num_hw_cqs; + if (tt::Cluster::instance().is_galaxy_cluster()) { + //We put 4 galaxy devices per huge page since number of hugepages available is less than number of devices. + this->cq_size = this->cq_size / DEVICES_PER_UMD_CHANNEL; + } } - this->channel_offset = MAX_HUGEPAGE_SIZE * channel; + this->channel_offset = MAX_HUGEPAGE_SIZE * get_umd_channel(channel) + (channel >> 2) * MAX_DEV_CHANNEL_SIZE; CoreType core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id); for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index 0a7a8297e1d..a184c1083c4 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -19,6 +19,45 @@ namespace tt::tt_metal { // One core dispatches commands to worker cores on the device `dispatcher` // The `remote_x` cores are used for remote fast dispatch and receive / transmit fast dispatch packets from ethernet cores +enum DispatchWorkerType : uint32_t { + PREFETCH = 0, + PREFETCH_D = 1, + DISPATCH = 2, + DISPATCH_D = 3, + MUX = 4, + MUX_D = 5, + DEMUX = 6, + DEMUX_D = 7, + US_TUNNELER_LOCAL = 8, + US_TUNNELER_REMOTE = 9, + DS_TUNNELER_LOCAL = 10, + DS_TUNNELER_REMOTE = 11, + COUNT = 12 +}; + +struct worker_build_settings_t{ + std::string kernel_file; + std::vector compile_args; + std::vector upstream_cores; + std::vector downstream_cores; + tt_cxy_pair worker_physical_core; + tt_cxy_pair eth_partner_physical_core; + CoreType dispatch_core_type; + uint32_t command_queue_start_addr; + uint32_t issue_queue_start_addr; + uint32_t issue_queue_size; + uint32_t completion_queue_start_addr; + uint32_t completion_queue_size; + std::vector semaphores; + uint32_t producer_semaphore_id; + uint32_t consumer_semaphore_id; + uint32_t cb_start_address; + uint32_t cb_size_bytes; + uint32_t cb_log_page_size; + uint32_t cb_pages; + uint32_t tunnel_stop; +}; + // std::optional is used to determine whether core has been assigned // tt_cxy_pair is used over CoreCoord to denote location because remote device command queue interface cores are on the associated MMIO device struct dispatch_core_types_t { @@ -35,6 +74,7 @@ struct dispatch_core_types_t { std::optional tunneler_d = std::nullopt; // ethernet tunneler }; + class dispatch_core_manager { public: dispatch_core_manager &operator=(const dispatch_core_manager &) = delete; @@ -193,20 +233,30 @@ class dispatch_core_manager { /// @param channel assigned to the command queue where commands are enqueued /// @param cq_id ID of the command queue within the channel /// @return tt_cxy_pair logical location (chip + core coordinate) of the ethernet tunnel core - const tt_cxy_pair &tunneler_core(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { + const tt_cxy_pair &tunneler_core(chip_id_t upstream_device_id, chip_id_t device_id, uint16_t channel, uint8_t cq_id) { dispatch_core_types_t &assignment = 
this->dispatch_core_assignments[device_id][channel][cq_id]; if (assignment.tunneler.has_value()) { return assignment.tunneler.value(); } - TT_ASSERT(assignment.mux.has_value(), " Mux core not assigned for device {}. Must assign a Mux core before getting a tunneler core.", device_id); - tt_cxy_pair tunneler_location = tt::Cluster::instance().get_eth_core_for_dispatch_core( - assignment.mux.value(), EthRouterMode::BI_DIR_TUNNELING, device_id); - assignment.tunneler = tunneler_location; - log_debug(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", tunneler_location.str(), device_id); + auto[us_core, ds_core] = tt::Cluster::instance().get_eth_tunnel_core(upstream_device_id, device_id, EthRouterMode::BI_DIR_TUNNELING); + + assignment.tunneler = us_core; + assignment.tunneler_d = ds_core; + + log_debug(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", us_core.str(), device_id); return assignment.tunneler.value(); } + const tt_cxy_pair &us_tunneler_core_local(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { + dispatch_core_types_t &assignment = this->dispatch_core_assignments[device_id][channel][cq_id]; + if (assignment.tunneler_d.has_value()) { + return assignment.tunneler_d.value(); + } + TT_ASSERT(false, "Device {} has no allocation for Local Upstream Tunneler Core.", device_id); + assignment.tunneler_d = tt_cxy_pair(0, 0, 0); + return assignment.tunneler_d.value(); + } /// @brief Gets the location of the kernel desginated to write to the completion queue region for a particular command queue diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp index 971afc15f8d..3f1193021be 100644 --- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp @@ -92,6 +92,7 @@ tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_buf_addr_arg); constexpr uint32_t timeout_cycles = get_compile_time_arg_val(14); +constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(15); void kernel_main() { @@ -151,6 +152,11 @@ void kernel_main() { output_queues[i].prev_words_in_flight_check_flush(); bool output_finished = output_queues[i].is_remote_finished(); if (output_finished) { + if ((i == 1) && (inner_stop_mux_d_bypass != 0)) { + input_queues[1].remote_x = inner_stop_mux_d_bypass & 0xFF; + input_queues[1].remote_y = (inner_stop_mux_d_bypass >> 8) & 0xFF; + input_queues[1].set_remote_ready_status_addr((inner_stop_mux_d_bypass >> 16) & 0xFF); + } input_queues[i].send_remote_finished_notification(); } all_outputs_finished &= output_finished; diff --git a/tt_metal/impl/dispatch/kernels/packet_queue.hpp b/tt_metal/impl/dispatch/kernels/packet_queue.hpp index 0be25837726..9b70187b665 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue.hpp @@ -293,6 +293,10 @@ class packet_queue_state_t { PACKET_QUEUE_REMOTE_READY_FLAG); } + inline void set_remote_ready_status_addr(uint8_t remote_queue_id) { + this->remote_ready_status_addr = STREAM_REG_ADDR(remote_queue_id, STREAM_REMOTE_SRC_REG_INDEX); + } + inline void send_remote_finished_notification() { this->remote_reg_update(this->remote_ready_status_addr, PACKET_QUEUE_REMOTE_FINISHED_FLAG); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 8d4b94cb366..c8cf461e24b 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -20,6 +20,9 @@ #include "tt_metal/llrt/rtoptions.hpp" #include "tt_metal/llrt/tlb_config.hpp" 
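+// Channel-to-hugepage mapping assumed throughout this change: four device channels share
+// each 1 GB hugepage. (channel & HOST_MEM_CHANNELS_MASK) selects the hugepage (UMD channel)
+// and (channel >> 2) * 256 MB is the device's offset within it, e.g. device channel 9 maps
+// to UMD channel 1 at offset 512 MB.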
+static constexpr uint32_t HOST_MEM_CHANNELS = 4;
+static constexpr uint32_t HOST_MEM_CHANNELS_MASK = HOST_MEM_CHANNELS - 1;
+
 namespace tt {

 const Cluster &Cluster::instance() {
@@ -120,6 +123,10 @@ std::filesystem::path get_cluster_desc_yaml() {
     return fs::absolute(cluster_desc_path);
 }

+bool Cluster::is_galaxy_cluster() const {
+    return this->is_tg_cluster_;
+}
+
 void Cluster::generate_cluster_descriptor() {
     this->cluster_desc_path_ = (this->target_type_ == TargetDevice::Silicon and this->arch_ == tt::ARCH::WORMHOLE_B0) ? get_cluster_desc_yaml().string()
@@ -138,6 +145,12 @@ void Cluster::generate_cluster_descriptor() {
         tt_ClusterDescriptor::create_for_grayskull_cluster(logical_mmio_device_ids, physical_mmio_device_ids);
     } else {
         this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(this->cluster_desc_path_);
+        for (const auto &chip_id : this->cluster_desc_->get_all_chips()) {
+            if (this->cluster_desc_->get_board_type(chip_id) == BoardType::GALAXY) {
+                this->is_tg_cluster_ = true;
+                break;
+            }
+        }
     }

     // Use cluster descriptor to map MMIO device id to all devices on the same card (including the MMIO device)
@@ -155,13 +168,24 @@ void Cluster::generate_cluster_descriptor() {
     }

     uint32_t total_num_hugepages = get_num_hugepages();
+    if (this->is_tg_cluster_) {
+        // TODO: don't think this check is correct, we want to have total num hugepages == num chips even for Galaxy
+        TT_FATAL(
+            this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size()/4,
+            "Machine setup error: Insufficient number of hugepages available, expected >= {} for {} devices but have {}. "
+            "Increase number of hugepages!",
+            this->cluster_desc_->get_all_chips().size()/4,
+            this->cluster_desc_->get_all_chips().size(),
+            total_num_hugepages);
+    } else {
     // TODO (abhullar): ignore hugepage set up for BH bringup
-    TT_FATAL(
-        this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size(),
-        "Machine setup error: Insufficient number of hugepages available, expected one per device ({}) but have {}. "
-        "Increase number of hugepages!",
-        this->cluster_desc_->get_all_chips().size(),
-        total_num_hugepages);
+        TT_FATAL(
+            this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size(),
+            "Machine setup error: Insufficient number of hugepages available, expected one per device ({}) but have {}. "
+            "Increase number of hugepages!",
+            this->cluster_desc_->get_all_chips().size(),
+            total_num_hugepages);
+    }
 }

 void Cluster::initialize_device_drivers() {
@@ -189,7 +213,8 @@ void Cluster::assign_mem_channels_to_devices(
     chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids) {
     // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per
     // MMIO device Metal currently assigns 1 channel per device. See https://github.com/tenstorrent/tt-metal/issues/4087
-    TT_ASSERT(controlled_device_ids.size() <= 4, "Unable to assign each device to its own host memory channel!");
+    // One WH gateway should have 8 remote devices in its control group.
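+    // (8 remote devices plus the gateway itself gives the limit of 9 below.)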
+ TT_ASSERT(controlled_device_ids.size() <= 9, "Unable to assign each device to its own host memory channel!"); uint16_t channel = 0; this->device_to_host_mem_channel_[mmio_device_id] = channel++; for (const chip_id_t &device_id : controlled_device_ids) { @@ -197,6 +222,7 @@ void Cluster::assign_mem_channels_to_devices( continue; } this->device_to_host_mem_channel_[device_id] = channel++; + if ((channel + 1) % 4 == 0) channel++; } } @@ -220,6 +246,9 @@ void Cluster::open_driver( // available. Metal currently uses assigns 1 channel per device uint32_t num_host_mem_ch_per_mmio_device = this->arch_ == tt::ARCH::BLACKHOLE ? 0 : controlled_device_ids.size(); + if (is_tg_cluster_) { + num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS; + } std::unordered_map dynamic_tlb_config = ll_api::get_dynamic_tlb_config(); // This will remove harvested rows from the soc descriptor const bool perform_harvesting = true; @@ -275,6 +304,10 @@ Cluster::~Cluster() { this->mmio_device_id_to_driver_.clear(); this->sdesc_per_chip_.clear(); + this->devices_grouped_by_assoc_mmio_device_.clear(); + this->device_to_mmio_device_.clear(); + this->device_to_host_mem_channel_.clear(); + this->device_eth_routing_info_.clear(); } tt_device &Cluster::get_driver(chip_id_t device_id) const { @@ -455,13 +488,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr void Cluster::write_sysmem( const void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id); + this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id); } void Cluster::read_sysmem( void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id); + this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id); } void Cluster::verify_sw_fw_versions( @@ -510,12 +543,12 @@ uint32_t Cluster::get_num_host_channels(chip_id_t device_id) const { uint32_t Cluster::get_host_channel_size(chip_id_t device_id, uint32_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(device_id)); - return this->get_driver(device_id).get_host_channel_size(device_id, channel); + return this->get_driver(device_id).get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK); } void *Cluster::host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id)); - return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel); + return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK); } uint64_t Cluster::get_pcie_base_addr_from_device(chip_id_t chip_id) const { @@ -548,6 +581,81 @@ std::unordered_map> Cluster::get_ethernet_core } return connected_chips; } +#define MAX_TUNNEL_DEPTH 4 +std::vector> Cluster::get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const { + std::vector> tunnels_from_mmio = {}; + const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections(); + 
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));
+
+    if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
+        return {};
+    }
+
+    std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
+    device_ids.erase(mmio_chip_id);
+
+    if (device_ids.size() == 0) {
+        return {};
+    }
+
+    for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
+        const auto &other_chip_id = std::get<0>(connected_chip_chan);
+        if (device_ids.find(other_chip_id) != device_ids.end()) {
+            //mmio chip is connected to a remote chip in its mmio group.
+            //erase from the pool so multiple ethernet connections to the same remote device do not
+            //pollute the counts.
+            device_ids.erase(other_chip_id);
+            std::vector<chip_id_t> first_stop = {other_chip_id};
+            auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
+            TT_ASSERT(it == tunnels_from_mmio.end(), "Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
+            tunnels_from_mmio.push_back(first_stop);
+        }
+    }
+
+    log_info(tt::LogMetal, "Found {} FD Tunnels originating from MMIO Device {}", tunnels_from_mmio.size(), mmio_chip_id);
+
+    device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
+    device_ids.erase(mmio_chip_id);
+
+    for (auto &tunnel : tunnels_from_mmio) {
+        TT_ASSERT(tunnel.size() == 1, "Tunnel depth must be 1 when it has only 1 stop in it.");
+        device_ids.erase(tunnel[0]);
+    }
+
+    bool tunneled_device_hit;
+    for (auto it = device_ids.begin(); it != device_ids.end();) {
+        tunneled_device_hit = false;
+        for (auto &dev_vec : tunnels_from_mmio) {
+            for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
+                const auto &other_chip_id = std::get<0>(connected_chip_chan);
+                auto id_iter = device_ids.find(other_chip_id);
+                if (id_iter != device_ids.end()) {
+                    it = device_ids.erase(id_iter);
+                    dev_vec.push_back(other_chip_id);
+                    tunneled_device_hit = true;
+                    break;
+                }
+            }
+        }
+        TT_ASSERT(tunneled_device_hit || (it == device_ids.end()), "Loop Exit Error.");
+    }
+
+    TT_ASSERT(tunnels_from_mmio.size() != 0, "Must have at least 1 tunnel from MMIO Device.");
+    uint32_t tunnel_depth = tunnels_from_mmio[0].size();
+    log_info(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);
+
+    for (auto &dev_vec : tunnels_from_mmio) {
+        TT_ASSERT(dev_vec.size() == tunnel_depth, "All tunnels from mmio device must have same depth. Found {}. Expected {}.", dev_vec.size(), tunnel_depth);
+        //Now that all remote chips have been added to respective tunnels,
+        //add mmio device at start of each of the tunnels.
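+        //Tunnels longer than MAX_TUNNEL_DEPTH are truncated before the mmio device is
+        //prepended, so each tunnel vector holds at most MAX_TUNNEL_DEPTH + 1 entries.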
+ if (dev_vec.size() > MAX_TUNNEL_DEPTH) { + dev_vec.resize(dev_vec.size() - (dev_vec.size() - MAX_TUNNEL_DEPTH)); + } + dev_vec.insert(dev_vec.begin(), mmio_chip_id); + } + return tunnels_from_mmio; +} + // Ethernet cluster api void Cluster::initialize_ethernet_sockets() { @@ -583,6 +691,71 @@ void Cluster::initialize_ethernet_sockets() { } } +void Cluster::reserve_ethernet_cores_for_tunneling() { + const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); + const uint32_t routing_info_addr = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE; + for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) { + for (const auto &chip_id : devices) { + if (this->device_eth_routing_info_.find(chip_id) == this->device_eth_routing_info_.end()) { + this->device_eth_routing_info_.insert({chip_id, {}}); + } + } + std::map, bool> reserved_chip_connections = {}; + for (const auto &chip_id : devices) { + if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { + for (const auto &[connected_chip_id, active_eth_cores] : + this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { + for (const auto ð_core : active_eth_cores) { + const auto connected_eth_core = + std::get<1>(this->get_connected_ethernet_core(std::make_tuple(chip_id, eth_core))); + if (this->device_eth_routing_info_.at(chip_id).find(eth_core) == + this->device_eth_routing_info_.at(chip_id).end()) { + if (devices.find(connected_chip_id) != devices.end() && + reserved_chip_connections.find(std::make_tuple(chip_id, connected_chip_id)) == + reserved_chip_connections.end() && + this->cluster_desc_->get_ethernet_link_distance(chip_id, assoc_mmio_device) != + this->cluster_desc_->get_ethernet_link_distance( + connected_chip_id, assoc_mmio_device)) { + // only setup fd tunneling for devices grouped with same mmio device and if no bi dir + // tunnel found between the two chips and if link distance between both chips to mmio + // chip is not the same + tt_cxy_pair(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + log_debug( + LogDevice, + "Reserving {} for tunneling", + tt_cxy_pair(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)).str()); + log_debug( + LogDevice, + "Reserving {} for tunneling", + tt_cxy_pair( + connected_chip_id, + ethernet_core_from_logical_core(connected_chip_id, connected_eth_core)) + .str()); + this->device_eth_routing_info_.at(chip_id).insert( + {eth_core, EthRouterMode::BI_DIR_TUNNELING}); + this->device_eth_routing_info_.at(connected_chip_id) + .insert({connected_eth_core, EthRouterMode::BI_DIR_TUNNELING}); + reserved_chip_connections.insert({std::make_tuple(chip_id, connected_chip_id), true}); + reserved_chip_connections.insert({std::make_tuple(connected_chip_id, chip_id), true}); + } else { + this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); + } + } + } + } + } else { + // Slow dispatch mode + for (const auto &[connected_chip_id, active_eth_cores] : + this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) { + for (const auto ð_core : active_eth_cores) { + this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE}); + } + } + } + } + } +} + std::unordered_set Cluster::get_ethernet_connected_device_ids(chip_id_t chip_id) const { std::unordered_set device_ids; const auto &connected_chips = this->get_ethernet_cores_grouped_by_connected_chips(chip_id); @@ -658,56 +831,6 @@ CoreCoord Cluster::ethernet_core_from_logical_core(chip_id_t chip_id, const Core return 
 }
 
-void Cluster::reserve_ethernet_cores_for_tunneling() {
-    const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
-    const uint32_t routing_info_addr = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE;
-    for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) {
-        for (const auto &chip_id : devices) {
-            if (this->device_eth_routing_info_.find(chip_id) == this->device_eth_routing_info_.end()) {
-                this->device_eth_routing_info_.insert({chip_id, {}});
-            }
-        }
-        std::map<std::tuple<chip_id_t, chip_id_t>, bool> reserved_chip_connections = {};
-        for (const auto &chip_id : devices) {
-            if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-                for (const auto &[connected_chip_id, active_eth_cores] :
-                     this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) {
-                    for (const auto &eth_core : active_eth_cores) {
-                        const auto connected_eth_core =
-                            std::get<1>(this->get_connected_ethernet_core(std::make_tuple(chip_id, eth_core)));
-                        if (this->device_eth_routing_info_.at(chip_id).find(eth_core) ==
-                            this->device_eth_routing_info_.at(chip_id).end()) {
-                            tt_cxy_pair this_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core));
-                            if (devices.find(connected_chip_id) != devices.end() &&
-                                reserved_chip_connections.find(std::make_tuple(chip_id, connected_chip_id)) ==
-                                    reserved_chip_connections.end()) {
-                                // only setup fd tunneling for devices grouped with same mmio device and if no bi dir
-                                // tunnel found between the two chips
-                                this->device_eth_routing_info_.at(chip_id).insert(
-                                    {eth_core, EthRouterMode::BI_DIR_TUNNELING});
-                                this->device_eth_routing_info_.at(connected_chip_id)
-                                    .insert({connected_eth_core, EthRouterMode::BI_DIR_TUNNELING});
-                                reserved_chip_connections.insert({std::make_tuple(chip_id, connected_chip_id), true});
-                                reserved_chip_connections.insert({std::make_tuple(connected_chip_id, chip_id), true});
-                            } else {
-                                this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE});
-                            }
-                        }
-                    }
-                }
-            } else {
-                // Slow dispatch mode
-                for (const auto &[connected_chip_id, active_eth_cores] :
-                     this->get_ethernet_cores_grouped_by_connected_chips(chip_id)) {
-                    for (const auto &eth_core : active_eth_cores) {
-                        this->device_eth_routing_info_.at(chip_id).insert({eth_core, EthRouterMode::IDLE});
-                    }
-                }
-            }
-        }
-    }
-}
-
 tt_cxy_pair Cluster::get_eth_core_for_dispatch_core(
     tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const {
     const auto &local_chip_id = logical_dispatch_core.chip;
@@ -723,6 +846,20 @@ tt_cxy_pair Cluster::get_eth_core_for_dispatch_core(
     return {};
 }
 
+std::tuple<tt_cxy_pair, tt_cxy_pair> Cluster::get_eth_tunnel_core(
+    chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const {
+    for (const auto &[eth_core, router_mode] : this->device_eth_routing_info_.at(downstream_chip_id)) {
+        // Check the connected chip id since one chip can be bi-directionally tunneling to multiple chips
+        const auto [tunnel_chip_id, tunnel_eth_core] =
+            this->get_connected_ethernet_core(std::make_tuple(downstream_chip_id, eth_core));
+        if (router_mode == mode and tunnel_chip_id == upstream_chip_id) {
+            return std::make_tuple(
+                tt_cxy_pair(tunnel_chip_id, tunnel_eth_core), tt_cxy_pair(downstream_chip_id, eth_core));
+        }
+    }
+    TT_ASSERT(false, "Cluster does not contain requested eth routing core");
+    return {};
+}
+
 // TODO: ALLAN Can change to write one bit
 void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing) const {
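+    // Note: CloseDevices() in tt_metal.cpp calls this with enable_internal_routing = false
+    // before any tunneled device is closed, i.e. routing is disabled while remote devices
+    // are still open (see the CloseDevices() change below).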
     log_debug(tt::LogDevice, "Set internal routing bit {}", enable_internal_routing);
@@ -785,6 +922,34 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_
     }
 }
 
+uint32_t Cluster::get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const {
+    // Assumes that the tunnel depth is the same across all tunnels.
+    TT_ASSERT(
+        (this->get_associated_mmio_device(mmio_device) == mmio_device), "Called mmio device api on non-mmio device");
+    uint32_t depth = 0;
+    for (const auto &[assoc_mmio_device, devices] : this->devices_grouped_by_assoc_mmio_device_) {
+        for (const auto &chip_id : devices) {
+            depth =
+                std::max(depth, uint32_t(this->cluster_desc_->get_ethernet_link_distance(chip_id, assoc_mmio_device)));
+        }
+    }
+    return depth;
+}
+
+uint32_t Cluster::get_mmio_device_tunnel_count(chip_id_t mmio_device) const {
+    TT_ASSERT(
+        (this->get_associated_mmio_device(mmio_device) == mmio_device), "Called mmio device api on non-mmio device");
+    const auto &chip_eth_core_modes = this->device_eth_routing_info_.at(mmio_device);
+    uint32_t tunnel_count = std::count_if(chip_eth_core_modes.begin(), chip_eth_core_modes.end(), [](const auto &e) {
+        return e.second == EthRouterMode::BI_DIR_TUNNELING;
+    });
+    return tunnel_count;
+}
+
+uint32_t Cluster::get_device_tunnel_depth(chip_id_t chip_id) const {
+    return this->cluster_desc_->get_ethernet_link_distance(chip_id, this->get_associated_mmio_device(chip_id));
+}
+
 uint32_t Cluster::get_tensix_soft_reset_addr() const { return DEVICE_DATA.TENSIX_SOFT_RESET_ADDR; }
 
 }  // namespace tt
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index c1a7901dbf0..f1fa68605db 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -149,16 +149,18 @@ class Cluster {
     // Converts logical ethernet core coord to physical ethernet core coord
     CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord &logical_core) const;
 
-    // Configures routing mapping of ethernet cores
-    void initialize_routing_info_for_ethernet_cores();
-
-    void reserve_ethernet_cores_for_tunneling();
+    // Bookkeeping for mmio device tunnels
+    uint32_t get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const;
+    uint32_t get_mmio_device_tunnel_count(chip_id_t mmio_device) const;
+    uint32_t get_device_tunnel_depth(chip_id_t chip_id) const;
 
     // Dispatch core is managed by device, so this is an api for device to get the each eth core used in FD tunneling.
     // Returns logical eth core that communicates with specified dispatch core
     tt_cxy_pair get_eth_core_for_dispatch_core(
         tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const;
 
+    std::tuple<tt_cxy_pair, tt_cxy_pair> get_eth_tunnel_core(
+        chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const;
+
     // Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all devices in the
     // cluster. When using multiple devices in a cluster, this should be the flow:
     // CreateDevice(0)
@@ -186,6 +188,12 @@ class Cluster {
         return this->devices_grouped_by_assoc_mmio_device_.at(mmio_device_id);
     }
 
+    // Returns a vector of unique tunnels originating from the mmio device.
+    // Each vector entry is another vector of remote devices on that tunnel.
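+    // e.g. a system with one gateway chip and two 2-deep tunnels (ids illustrative) yields
+    // { {mmio_id, a, b}, {mmio_id, c, d} }, each tunnel ordered from the mmio chip outward.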
+    std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const;
+
+    bool is_galaxy_cluster() const;
+
    private:
     Cluster();
     ~Cluster();
@@ -202,6 +210,8 @@ class Cluster {
     void get_metal_desc_from_tt_desc(const std::unordered_map<chip_id_t, tt_SocDescriptor> &input,
         const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks);
     tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const;
 
+    // Reserves ethernet cores in cluster for tunneling
+    void reserve_ethernet_cores_for_tunneling();
     // Returns map of connected chip ids to active ethernet cores
     std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped_by_connected_chips(
         chip_id_t chip_id) const;
@@ -226,6 +236,10 @@ class Cluster {
     // Save mapping of device id to associated MMIO device id for fast lookup
     std::unordered_map<chip_id_t, chip_id_t> device_to_mmio_device_;
 
+    // Flag telling whether we are on a TG-type system.
+    // If any device has a board type of GALAXY, we are on a TG cluster.
+    bool is_tg_cluster_;
+
     // Currently, each device is mapped to its own channel in host memory to enable fast dispatch
     // Channels are unique within a group of devices all controlled by a particular MMIO device
     // For example:
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index 4ce64b5b07a..cc4eae72993 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -257,22 +257,52 @@ std::map<chip_id_t, Device *> CreateDevices(
     std::unordered_set<uint32_t> free_cores = {};
     auto cpu_cores_per_numa_node = device_cpu_allocator::get_cpu_cores_per_numa_node(free_cores);
 
+    bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster();
     for (const auto &device_id : device_ids) {
+        // For Galaxy init, we only need to loop over mmio devices
        const auto &mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
+        if (is_galaxy and mmio_device_id != device_id) {
+            continue;
+        }
         if (active_devices.find(mmio_device_id) == active_devices.end()) {
-            for (const auto &mmio_controlled_device_id :
-                 tt::Cluster::instance().get_devices_controlled_by_mmio_device(mmio_device_id)) {
-                int core_assigned_to_device = device_cpu_allocator::get_cpu_core_for_device_worker_thread(
-                    mmio_controlled_device_id, cpu_cores_per_numa_node, free_cores);
-                Device *dev = new Device(
-                    mmio_controlled_device_id,
-                    num_hw_cqs,
-                    l1_small_size,
-                    l1_bank_remap,
-                    false,
-                    core_assigned_to_device);
-                active_devices.insert({mmio_controlled_device_id, dev});
-                detail::InitDeviceProfiler(dev);
+            log_debug(tt::LogMetal, "MMIO Device {} Tunnel Count: {}", mmio_device_id, tt::Cluster::instance().get_mmio_device_tunnel_count(mmio_device_id));
+            log_debug(tt::LogMetal, "MMIO Device {} Tunnel Depth: {}", mmio_device_id, tt::Cluster::instance().get_mmio_device_max_tunnel_depth(mmio_device_id));
+            log_debug(tt::LogMetal, "MMIO Device {} Tunnel Stop: {}", mmio_device_id, tt::Cluster::instance().get_device_tunnel_depth(mmio_device_id));
+            int core_assigned_to_device = device_cpu_allocator::get_cpu_core_for_device_worker_thread(
+                mmio_device_id, cpu_cores_per_numa_node, free_cores);
+            Device *mmio_device = new Device(
+                mmio_device_id,
+                num_hw_cqs,
+                l1_small_size,
+                l1_bank_remap,
+                false,
+                core_assigned_to_device);
+            // Only include the mmio device in the active devices set returned to the caller if we are
+            // not running on a Galaxy cluster.
+            // On Galaxy, gateway (mmio devices) cannot run compute workloads.
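+            // The gateway Device is still constructed above even on Galaxy, since it must stay
+            // up to service dispatch for the tunneled devices behind it; it is just not handed
+            // back to the caller.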
+            if (!is_galaxy) {
+                active_devices.insert({mmio_device_id, mmio_device});
+                detail::InitDeviceProfiler(mmio_device);
+            }
+
+            auto tunnels_from_mmio = mmio_device->tunnels_from_mmio_;
+            for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) {
+                // Need to create devices from the farthest to the closest.
+                for (uint32_t ts = tunnels_from_mmio[t].size() - 1; ts > 0; ts--) {
+                    uint32_t mmio_controlled_device_id = tunnels_from_mmio[t][ts];
+                    log_debug(tt::LogMetal, "Tunnel {} Device {} Tunnel Stop: {}", t, mmio_controlled_device_id, ts);
+                    int core_assigned_to_device = device_cpu_allocator::get_cpu_core_for_device_worker_thread(
+                        mmio_controlled_device_id, cpu_cores_per_numa_node, free_cores);
+                    Device *dev = new Device(
+                        mmio_controlled_device_id,
+                        num_hw_cqs,
+                        l1_small_size,
+                        l1_bank_remap,
+                        false,
+                        core_assigned_to_device);
+                    active_devices.insert({mmio_controlled_device_id, dev});
+                    detail::InitDeviceProfiler(dev);
+                }
             }
         }
     }
@@ -286,7 +316,45 @@ std::map<chip_id_t, Device *> CreateDevices(
 
 void CloseDevices(std::map<chip_id_t, Device *> devices) {
     tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false);
-    for (const auto &[device_id, dev] : devices) {
+    std::map<chip_id_t, Device *> mmio_devices = {};
+    bool is_galaxy = tt::Cluster::instance().is_galaxy_cluster();
+
+    if (is_galaxy) {
+        // On Galaxy, gateway wormhole devices (mmio devices) are not included in the set of devices
+        // created by CreateDevices(). So when closing devices, we need to find the corresponding
+        // gateway chips for all the tunneled devices.
+        for (const auto &[device_id, dev] : devices) {
+            const auto &mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
+            if (mmio_devices.find(mmio_device_id) == mmio_devices.end()) {
+                auto dev_handle = tt::tt_metal::detail::GetDeviceHandle(mmio_device_id);
+                mmio_devices.insert({mmio_device_id, dev_handle});
+            }
+        }
+    } else {
+        for (const auto &[device_id, dev] : devices) {
+            if (dev->is_mmio_capable()) {
+                mmio_devices.insert({device_id, dev});
+            }
+        }
+        for (const auto &[device_id, dev] : mmio_devices) {
+            devices.erase(device_id);
+        }
+    }
+
+    for (const auto &[device_id, dev] : mmio_devices) {
+        // For each mmio device, first close all the remote tunneled devices,
+        // farthest tunneled device first.
+        auto tunnels_from_mmio = dev->tunnels_from_mmio_;
+        // Iterate over all tunnels originating from this mmio device
+        for (auto t : tunnels_from_mmio) {
+            // Iterate over all tunneled devices (tunnel stops) in this tunnel and close them.
+            for (uint32_t ts = t.size() - 1; ts > 0; ts--) {
+                if (devices.find(t[ts]) != devices.end()) {
+                    devices[t[ts]]->close();
+                }
+            }
+        }
+        // Finally close the mmio device
         dev->close();
     }
 }
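
To sanity-check the new topology bookkeeping end to end, here is a minimal sketch (not part of the diff; the helper name and log wording are illustrative, and it assumes tt::Cluster has been initialized so routing info is populated):

    // Dump every FD tunnel hanging off one MMIO device using the new Cluster APIs.
    // `mmio_id` is assumed to be a valid MMIO-capable chip id.
    void dump_tunnels_for_mmio_device(chip_id_t mmio_id) {
        const auto &cluster = tt::Cluster::instance();
        auto tunnels = cluster.get_tunnels_from_mmio_device(mmio_id);
        log_info(
            tt::LogMetal,
            "MMIO Device {}: {} tunnel(s), max depth {}",
            mmio_id,
            cluster.get_mmio_device_tunnel_count(mmio_id),
            cluster.get_mmio_device_max_tunnel_depth(mmio_id));
        for (const auto &tunnel : tunnels) {
            // tunnel[0] is the mmio chip itself; remote stops follow in hop order.
            for (const auto &chip : tunnel) {
                log_info(tt::LogMetal, "  stop {} -> device {}", cluster.get_device_tunnel_depth(chip), chip);
            }
        }
    }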