diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index cecc88db..2373b162 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,20 +1,19 @@ set(UMD_DEVICE_SRCS architecture_implementation.cpp - blackhole_implementation.cpp cpuset_lib.cpp - grayskull_implementation.cpp tlb.cpp tt_cluster_descriptor.cpp tt_device.cpp - tt_emulation_stub.cpp tt_silicon_driver.cpp tt_silicon_driver_common.cpp tt_soc_descriptor.cpp - tt_versim_stub.cpp - wormhole_implementation.cpp simulation/tt_simulation_device.cpp simulation/tt_simulation_host.cpp + blackhole/blackhole_implementation.cpp + grayskull/grayskull_implementation.cpp + wormhole/wormhole_implementation.cpp + pci_device.cpp ) add_library(umd_device SHARED ${UMD_DEVICE_SRCS}) target_link_libraries(umd_device diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 96117d96..d55d3e29 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -4,9 +4,9 @@ #include "device/architecture_implementation.h" -#include "device/blackhole_implementation.h" -#include "device/grayskull_implementation.h" -#include "device/wormhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" +#include "device/grayskull/grayskull_implementation.h" +#include "device/wormhole/wormhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp similarity index 98% rename from device/blackhole_implementation.cpp rename to device/blackhole/blackhole_implementation.cpp index 4c36838c..eda2f140 100644 --- a/device/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/blackhole_implementation.h" +#include "blackhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h similarity 
index 100% rename from device/blackhole_implementation.h rename to device/blackhole/blackhole_implementation.h diff --git a/device/blackhole/impl_device.hpp b/device/blackhole/impl_device.hpp deleted file mode 100644 index afb4091c..00000000 --- a/device/blackhole/impl_device.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 803ee8eb..123b5fd0 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -7,39 +7,13 @@ #include "cpuset_lib.hpp" #include "common/logger.hpp" #include -#include "device/device_api.h" +#include "device/tt_device.h" #include namespace tt { namespace fs = std::filesystem; namespace cpuset { -// Unrelated to hwloc binding of threads, instead to query cpu affinity to find reasonable number of threads to parallelize over. 
-int get_allowed_num_threads(){ - unsigned int num_pus_in_system = sysconf(_SC_NPROCESSORS_ONLN); - unsigned int num_threads = num_pus_in_system; - - cpu_set_t mask; - if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - log_warning(LogSiliconDriver, "Could not detect current process cpu id affinity for calculating num_threads, will use default num_threads: {}.", num_threads); - } else{ - unsigned int visible_pu_count = CPU_COUNT(&mask); - if (visible_pu_count < num_pus_in_system){ - num_threads = visible_pu_count; - } - log_trace(LogSiliconDriver, "Detected (allowed) visible_pu_count: {}, setting num_threads: {}", visible_pu_count, num_threads); - } - - char const* override_thread_count = std::getenv("TT_BACKEND_COMPILE_THREADS"); - if (override_thread_count != nullptr && std::atoi(override_thread_count) > 0){ - num_threads = std::atoi(override_thread_count); - log_trace(LogSiliconDriver, "Overriding via env-var to num_threads: {}", num_threads); - } - - return num_threads; -} - - ///////////////////////////////////////////////////////////////////////// // Initialization Functions ///////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// @@ -49,7 +23,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_pid = getpid(); m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; - m_skip_singlify = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SKIP_SINGLIFY") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? 
true : false; @@ -72,7 +45,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { if (is_cpu_supported){ m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - m_enable_cpuset_allocator &= init_populate_physical_mmio_device_id_map(); }else{ m_enable_cpuset_allocator = false; } @@ -351,206 +323,10 @@ bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ } -// Step 6 - Populate map of logical to physical mmio device map. -bool tt_cpuset_allocator::init_populate_physical_mmio_device_id_map(){ - - if (!m_enable_cpuset_allocator){ - return false; - } - - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::populate_physical_mmio_device_id_map()"); - - // Get map of logical to physical device ids - FIXME: This is not accurate for some WHB0 clusters. - std::vector available_device_ids = tt_SiliconDevice::detect_available_device_ids(); - m_logical_to_physical_mmio_device_id_map = tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(available_device_ids); - - for (auto &d: m_logical_to_physical_mmio_device_id_map){ - auto logical_device_id = d.first; - auto physical_device_id = d.second; - log_debug(LogSiliconDriver, "populate_physical_mmio_device_id_map() -- available_devices: {} logical_device_id: {} => physical_device_id: {}", available_device_ids.size(), (int) logical_device_id, (int) physical_device_id); - m_num_threads_pinned_per_tt_device.insert({physical_device_id, 0}); - } - - return true; // Success -} - - ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Idea - Something to compare cpuset from Slurm to cpuset picked by this function. -hwloc_cpuset_t tt_cpuset_allocator::allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify){ - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. 
- const std::lock_guard lock(allocate_cpu_id_mutex); - - int num_alloc_slots_for_tt_device = m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); - int tt_device_alloc_idx = m_num_threads_pinned_per_tt_device.at(physical_device_id) % num_alloc_slots_for_tt_device; - - // Check if 2CCX-PER-CCD Optimization can be enabled. For AMD EPYC models : There is 1 L3Cache per CCX and 2 CCX per CCD. - // Better perf to first allocate to unique CCD's if we have enough per device. Expand to other CPU types? - bool enable_special_case = true; - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - auto num_l3_per_ccx = m_package_id_to_num_l3_per_ccx_map.at(package_id); - auto num_ccx_per_ccd = m_package_id_to_num_ccx_per_ccd_map.at(package_id); - - if (enable_special_case && num_l3_per_ccx == 1 && num_ccx_per_ccd == 2 && num_alloc_slots_for_tt_device > num_ccx_per_ccd && m_object_per_alloc_slot == HWLOC_OBJ_L3CACHE){ - int alloc_idx_for_device = m_num_threads_pinned_per_tt_device.at(physical_device_id); - int ccx_in_ccd = (alloc_idx_for_device % num_alloc_slots_for_tt_device) < num_alloc_slots_for_tt_device/num_ccx_per_ccd ? 0 : 1; - tt_device_alloc_idx = (ccx_in_ccd + (alloc_idx_for_device * num_ccx_per_ccd)) % num_alloc_slots_for_tt_device; - log_debug(LogSiliconDriver,"Special L3Cache case physical_device_id: {} alloc_idx_for_device: {} ccx_in_ccd: {} tt_device_alloc_idx: {}", physical_device_id, alloc_idx_for_device, ccx_in_ccd, tt_device_alloc_idx); - } - - - // Get the desired cpuset and prevent migration between different PU's in set by singlifying to single PU. 
- hwloc_cpuset_t cpuset = hwloc_bitmap_dup(m_physical_device_id_to_cpusets_map.at(physical_device_id).at(tt_device_alloc_idx)); - if (!m_skip_singlify && !skip_singlify){ - hwloc_bitmap_singlify(cpuset); - } - - // Debug - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Allocating for physical_device_id: {} num_alloc_slots: {} num_threads_pinned: {} alloc_idx: {} skip_singlify: {} (pid: {} tid: {}) => {} PU's {}", - physical_device_id, num_alloc_slots_for_tt_device, m_num_threads_pinned_per_tt_device.at(physical_device_id), tt_device_alloc_idx, skip_singlify, - m_pid, tid, hwloc_bitmap_weight(cpuset), get_hwloc_bitmap_vector(cpuset)); - - // Increment counter to keep track of number of pinned thread per device, to get unique cpuset per thread. - m_num_threads_pinned_per_tt_device.at(physical_device_id)++; - - return cpuset; -} - -void tt_cpuset_allocator::store_thread_original_cpuset(){ - - auto tid = std::this_thread::get_id(); - hwloc_cpuset_t orig_cpuset = hwloc_bitmap_alloc(); - - if (hwloc_get_cpubind(m_topology, orig_cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"store_thread_original_cpuset() calling hwloc_get_cpubind() failed with errno: {} (pid: {} tid:{})", strerror(errno), m_pid, tid); - }else{ - auto orig_cpuset_vector = get_hwloc_bitmap_vector(orig_cpuset); - log_debug(LogSiliconDriver, "store_thread_original_cpuset() success - got orig cpuset: {} PU's: {} (pid: {} tid: {})", orig_cpuset_vector.size(), orig_cpuset_vector, m_pid, tid); - m_global_thread_id_to_original_cpuset_map.insert({tid, hwloc_bitmap_dup(orig_cpuset)}); - } - hwloc_bitmap_free(orig_cpuset); -} - - - -// Given a logical device_id, determine the right cpu_ids associated with it and pin this thread to them. -void tt_cpuset_allocator::bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t logical_device_id, bool skip_singlify){ - - auto tid = std::this_thread::get_id(); - - // This needed to be protected by not-empty otherwise arithmetic error. 
- if ((!m_global_thread_ids_pinned.empty() && m_global_thread_ids_pinned.count(tid)) || (!m_enable_cpuset_allocator)){ - return; - }else{ - - if (!ndesc->is_chip_mmio_capable(logical_device_id)){ - logical_device_id = ndesc->get_closest_mmio_capable_chip(logical_device_id); - } - - log_debug(LogSiliconDriver,"bind_thread_cpuset_cpuset() for logical_device_id: {} m_logical_to_physical_mmio_device_id_map.size(): {}", logical_device_id, m_logical_to_physical_mmio_device_id_map.size()); - - // If a main thread ID was captured, make sure it is not attempted to be pinned. Only IO API sub threads are expected to be pinned today. - if (m_stored_main_thread_id && tid == m_main_thread_id){ - log_warning(LogSiliconDriver, "bind_thread_cpuset() - Skipping cpubind for runtime main thread_id: {} to prevent undesired inheritence. Consider moving device IO (ie. push/pop/get) to sub-threads for binding to be supported.", m_main_thread_id); - return; - } - - if (m_logical_to_physical_mmio_device_id_map.count(logical_device_id) > 0){ - - auto physical_device_id = m_logical_to_physical_mmio_device_id_map.at(logical_device_id); - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - - store_thread_original_cpuset(); // Store original cpuset for later unbinding if necessary. - - // Get the cpuset, and attempt to bind thread to it. 
- hwloc_cpuset_t cpuset = allocate_cpu_set_for_thread(physical_device_id, skip_singlify); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT )){; // HWLOC_CPUBIND_NOMEMBIND - log_warning(LogSiliconDriver,"bind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"bind_thread_cpuset() binding success skip: {} for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - skip_singlify, physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - // Record that this thread is pinned, no need to repeat on subsequent IO API calls. - m_global_thread_ids_pinned.insert(tid); - m_global_thread_id_to_physical_device_id_map.insert({tid, physical_device_id}); - } - - }else{ - log_warning(LogSiliconDriver,"Could not find logical_device_id: {} in m_logical_to_physical_mmio_device_id_map. This shouldn't happen.", logical_device_id); - } - } -} - - -// Restore thread's original cpubind. Perhaps could be simplified to not require physical_device_id or previous binding, and just always bind to MACHINE cpuset. -void tt_cpuset_allocator::unbind_thread_cpuset(){ - - if (m_enable_cpuset_allocator){ - auto tid = std::this_thread::get_id(); - - // Make sure this thread was successfully and previously binded to a cpuset. - if (!m_global_thread_id_to_original_cpuset_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no original cpuset for this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - if (!m_global_thread_id_to_physical_device_id_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no physical_device_id this thread found. 
Previous cpu binding skipped or failed?", tid); - return; - } - - // Handle the case where something goes wrong during original binding above, don't want to error out. - auto cpuset = m_global_thread_id_to_original_cpuset_map.at(tid); - auto physical_device_id = m_global_thread_id_to_physical_device_id_map.at(tid); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); // Can tighten this up and remove, it's purely for debug anyways. - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"unbind_thread_cpuset() binding success for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - // Update book-keeping by removing entry, so this thread can be re-pinned in the future. - m_num_threads_pinned_per_tt_device.at(physical_device_id)--; - m_global_thread_ids_pinned.erase(tid); - m_global_thread_id_to_physical_device_id_map.erase(tid); - } - } -} - -// Teardown/Cleanup for end of process. Don't do anything if feature disabled. Probably don't even need this if process is going to be ended. 
-void tt_cpuset_allocator::clear_state(){ - if (m_enable_cpuset_allocator){ - - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Clearing state and unbinding entire process' cpuset (pid: {} tid: {}).", m_pid, tid); - - // Reset state variables so that next time the thread can be freshly pinned - m_global_thread_ids_pinned.clear(); - for (auto &device: m_num_threads_pinned_per_tt_device){ - device.second = 0; - } - - // Undo previous pinning, by binding to full machine cpuset. Alternatively could have saved and restored orig cpuset per thread. - auto machine_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_MACHINE, 0); - if (hwloc_set_cpubind(m_topology, machine_obj->cpuset, HWLOC_CPUBIND_PROCESS)){ - log_warning(LogSiliconDriver,"clear_state() binding failed (errno: {}) to Machine cpuset (pid: {} tid: {})", strerror(errno), m_pid, tid); - } - } -} - - // Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ @@ -580,14 +356,6 @@ bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, return true; // Success } - -// For checking purposes, to make sure main thread is not cpubinded accidentally. -void tt_cpuset_allocator::_set_main_thread_id(){ - m_main_thread_id = std::this_thread::get_id(); - m_stored_main_thread_id = true; - log_debug(LogSiliconDriver,"Captured main_thread_id: {}", m_main_thread_id); -} - int tt_cpuset_allocator::_get_num_tt_pci_devices() { for (auto &d : m_physical_device_id_to_package_id_map) { diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index 65e31eaa..a14a4f33 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -24,8 +24,6 @@ namespace tt { //! 
Utility functions for various backend paramsf namespace cpuset { -int get_allowed_num_threads(); - // CPU ID allocator for pinning threads to cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { @@ -34,39 +32,12 @@ struct tt_cpuset_allocator { tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; void operator=(tt_cpuset_allocator const&) = delete; - static void bind_thread_to_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify=false){ - auto& instance = tt_cpuset_allocator::get(); - instance.bind_thread_cpuset(ndesc, device_id, skip_singlify); - } - - static void unbind_thread_from_cpuset(){ - auto& instance = tt_cpuset_allocator::get(); - instance.unbind_thread_cpuset(); - } - - static void clear_state_and_cpuset_pins(){ - auto& instance = tt_cpuset_allocator::get(); - instance.clear_state(); - } - // Bind an already allocated memory region to particular numa nodes static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ auto& instance = tt_cpuset_allocator::get(); return instance.bind_area_memory_nodeset(physical_device_id, addr, len); } - // Store process' main thread_id (not required, mainly for checking purposes to ensure no cpubinds on it occur). - static void set_main_thread_id(){ - auto& instance = tt_cpuset_allocator::get(); - instance._set_main_thread_id(); - } - - static int get_num_cpu_cores_allocated_to_device(chip_id_t physical_device_id){ - auto& instance = tt_cpuset_allocator::get(); - auto num_cores = instance.m_enable_cpuset_allocator ? 
instance.m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) : get_allowed_num_threads(); - return num_cores; - } - static int get_num_tt_pci_devices(){ auto& instance = tt_cpuset_allocator::get(); return instance._get_num_tt_pci_devices(); @@ -88,17 +59,10 @@ struct tt_cpuset_allocator { int TENSTORRENT_VENDOR_ID = 0x1e52; - void bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify); - void unbind_thread_cpuset(); - void store_thread_original_cpuset(); bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - void _set_main_thread_id(); int _get_num_tt_pci_devices(); int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - void clear_state(); - hwloc_cpuset_t allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify); - // Series of init functions, must be called in this order. Seperated out to support // early exit in case of errors. bool init_topology_init_and_load(); @@ -106,7 +70,6 @@ struct tt_cpuset_allocator { bool init_get_number_of_packages(); bool init_is_cpu_model_supported(); bool init_determine_cpuset_allocations(); - bool init_populate_physical_mmio_device_id_map(); // Helper Functions std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); @@ -122,11 +85,8 @@ struct tt_cpuset_allocator { std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); hwloc_topology_t m_topology; bool m_debug; - bool m_skip_singlify; pid_t m_pid; - std::unordered_map m_logical_to_physical_mmio_device_id_map; - // Items calculated by parsing system info, used by allocation algorithm: std::map> m_package_id_to_devices_map; std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info @@ -135,30 +95,16 @@ struct tt_cpuset_allocator { std::map> m_physical_device_id_to_cpusets_map; std::map m_physical_device_id_to_package_id_map; - std::mutex allocate_cpu_id_mutex; - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. 
int m_num_packages = 0; std::vector m_all_tt_devices = {}; hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default - // For 2CCX-PER-CCD Optimization detection. std::map m_package_id_to_num_l3_per_ccx_map; std::map m_package_id_to_num_ccx_per_ccd_map; - std::map m_num_threads_pinned_per_tt_device; - std::unordered_set m_global_thread_ids_pinned = {}; - std::thread::id m_main_thread_id; - bool m_stored_main_thread_id = false; - - // For quicker unbinding of threads, record the physical_device_id during binding. - std::map m_global_thread_id_to_physical_device_id_map = {}; - - // For storing original cpuset during binding, to restore during unbinding. - std::map m_global_thread_id_to_original_cpuset_map = {}; - // Memory Binding std::map m_physical_device_id_to_numa_nodeset_map; diff --git a/device/device_api.h b/device/device_api.h deleted file mode 100644 index a2728e7a..00000000 --- a/device/device_api.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once -#include "device/tt_device.h" -#include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/device_api_metal.h b/device/device_api_metal.h index a2728e7a..0fc7820c 100644 --- a/device/device_api_metal.h +++ b/device/device_api_metal.h @@ -7,4 +7,3 @@ #pragma once #include "device/tt_device.h" #include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp similarity index 98% rename from device/grayskull_implementation.cpp rename to device/grayskull/grayskull_implementation.cpp index 9d773166..6ed7aaaf 100644 --- a/device/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/grayskull_implementation.h" +#include "grayskull_implementation.h" namespace tt::umd { diff --git a/device/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h similarity index 99% rename from device/grayskull_implementation.h rename to device/grayskull/grayskull_implementation.h index 79bdfdee..c014350a 100644 --- a/device/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -99,7 +99,6 @@ enum class arc_message_type { ARC_GO_LONG_IDLE = 0x54, ARC_GET_HARVESTING = 0x57, TEST = 0x90, - NOC_DMA_TRANSFER = 0x9A, SETUP_IATU_FOR_PEER_TO_PEER = 0x97, DEASSERT_RISCV_RESET = 0xba }; diff --git a/device/grayskull/impl_device.hpp b/device/grayskull/impl_device.hpp deleted file mode 100644 index 21a18125..00000000 --- a/device/grayskull/impl_device.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/grayskull/pci/tlb.yaml -// 1M -// local_offset: [ 0, 11, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -// 2M -// local_offset: [ 0, 10, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 16, 11, "" ] -// y_end : [ 0, 22, 17, "" ] -// x_start : [ 0, 28, 23, "" ] -// y_start : [ 0, 34, 29, "" ] -// noc_sel: [ 0, 35, 35, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 36, 36, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 38, 37, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 39, 39, "linked"] - -// 16M -// local_offset: [ 0, 7 , 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 13, 8, "" ] -// y_end : [ 0, 19, 14, "" ] -// x_start : [ 0, 25, 20, "" ] -// y_start : [ 0, 31, 26, "" ] -// noc_sel: [ 0, 32, 32, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 33, 33, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 35, 34, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 36, 36, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 11, - .y_end = 17, - .x_start = 23, - .y_start = 29, - .noc_sel = 35, - .mcast = 36, - .ordering = 37, - .linked = 39, - .static_vc = 40, - .static_vc_end = 41 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 8, - .y_end = 14, - .x_start = 20, - .y_start = 26, - .noc_sel = 32, - .mcast = 33, - .ordering = 34, - .linked = 36, - .static_vc = 37, - .static_vc_end = 38 -}; diff --git a/device/kmdif.h b/device/kmdif.h index 32596d55..c013202b 100644 --- a/device/kmdif.h +++ b/device/kmdif.h @@ -9,15 +9,6 @@ typedef std::uint32_t DWORD; -const uint32_t MAX_DMA_BYTES = 4*1024*1024; - -// DMA -struct DMAbuffer { - void *pBuf = NULL; - std::uint64_t pDma = 0; - std::uint64_t size; -}; - struct TTDevice; struct PCIdevice { diff --git a/device/pci_device.cpp b/device/pci_device.cpp new file mode 100644 index 00000000..fa256100 --- /dev/null +++ b/device/pci_device.cpp @@ -0,0 +1,431 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include // for ::open +#include // for ::close +#include // for ioctl +#include // for mmap, munmap +#include // for PCI_SLOT, PCI_FUNC + +#include "pci_device.hpp" +#include "architecture_implementation.h" +#include "ioctl.h" +#include "device/tt_arch_types.h" +#include "device/driver_atomics.h" + +#include "common/assert.hpp" +#include "common/logger.hpp" + +int find_device(const uint16_t device_id) { + // returns device id if found, otherwise -1 + const char device_name_pattern [] = "/dev/tenstorrent/%u"; + char device_name[sizeof(device_name_pattern) + std::numeric_limits::digits10]; + std::snprintf(device_name, sizeof(device_name), device_name_pattern, (unsigned int)device_id); + int device_fd = ::open(device_name, O_RDWR | O_CLOEXEC); + // LOG2 ("find_device() open call returns device_fd: %d for device_name: %s (device_id: %d)\n", device_fd, device_name, device_id); + return device_fd; +} + +tt::ARCH detect_arch(uint16_t pcie_device_id, int pcie_revision_id) { + if (pcie_device_id == 0xfaca){ + return tt::ARCH::GRAYSKULL; + } else if (pcie_device_id == 0x401e && pcie_revision_id == 0x01){ + return tt::ARCH::WORMHOLE_B0; + } else if (pcie_device_id == 0x401e){ + TT_THROW("Wormhole is not supported. 
Please use Wormhole B0 instead."); + return tt::ARCH::WORMHOLE; + } else if (pcie_device_id == 0xb140){ + return tt::ARCH::BLACKHOLE; + } else { + TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); + } +} + +// -------------------------------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------- + +TTDevice::TTDevice(int device_id, int logical_device_id){ + this->device_id = device_id; + this->logical_id = logical_device_id; + setup_device(); +} + +TTDevice::~TTDevice(){ + close_device(); +} + +void TTDevice::setup_device() { + device_fd = find_device(device_id); + get_pcie_info(); + if (device_fd == -1) { + throw std::runtime_error(std::string("Failed opening a handle for device ") + std::to_string(device_id)); + } + + arch = detect_arch(pcie_device_id, pcie_revision_id); + architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch)); + + // Get PCIe device info through IOTCL -> tt-kmd + tenstorrent_get_device_info device_info; + memset(&device_info, 0, sizeof(device_info)); + device_info.in.output_size_bytes = sizeof(device_info.out); + if (ioctl(device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { + throw std::runtime_error(std::string("Get device info failed on device ") + std::to_string(device_id) + "."); + } + + struct { + tenstorrent_query_mappings query_mappings; + tenstorrent_mapping mapping_array[8]; + } mappings; + + memset(&mappings, 0, sizeof(mappings)); + mappings.query_mappings.in.output_mapping_count = 8; + + if (ioctl(device_fd, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { + throw std::runtime_error(std::string("Query mappings failed on device ") + std::to_string(device_id) + "."); 
+ } + + // Mapping resource to BAR + // Resource 0 -> BAR0 + // Resource 1 -> BAR2 + // Resource 2 -> BAR4 + tenstorrent_mapping bar0_uc_mapping; + tenstorrent_mapping bar0_wc_mapping; + tenstorrent_mapping bar2_uc_mapping; + tenstorrent_mapping bar2_wc_mapping; + tenstorrent_mapping bar4_uc_mapping; + tenstorrent_mapping bar4_wc_mapping; + + memset(&bar0_uc_mapping, 0, sizeof(bar0_uc_mapping)); + memset(&bar0_wc_mapping, 0, sizeof(bar0_wc_mapping)); + memset(&bar2_uc_mapping, 0, sizeof(bar2_uc_mapping)); + memset(&bar2_wc_mapping, 0, sizeof(bar2_wc_mapping)); + memset(&bar4_uc_mapping, 0, sizeof(bar4_uc_mapping)); + memset(&bar4_wc_mapping, 0, sizeof(bar4_wc_mapping)); + + for (unsigned int i = 0; i < mappings.query_mappings.in.output_mapping_count; i++) { + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_UC) { + bar0_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { + bar0_wc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_UC) { + bar2_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_WC) { + bar2_wc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_UC) { + bar4_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_WC) { + bar4_wc_mapping = mappings.mapping_array[i]; + } + + log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + mappings.mapping_array[i].mapping_id, + (void *)mappings.mapping_array[i].mapping_base, + mappings.mapping_array[i].mapping_size); + } + + if (bar0_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE0_UC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR0 UC mapping."); + } + + auto wc_mapping_size = 
arch == tt::ARCH::BLACKHOLE ? BH_BAR0_WC_MAPPING_SIZE : GS_BAR0_WC_MAPPING_SIZE; + + // Attempt WC mapping first so we can fall back to all-UC if it fails. + if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { + bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); + bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_wc_mapping.mapping_base); + if (bar0_wc == MAP_FAILED) { + bar0_wc_size = 0; + bar0_wc = nullptr; + } + } + + if (bar0_wc) { + // The bottom part of the BAR is mapped WC. Map the top UC. + bar0_uc_size = bar0_uc_mapping.mapping_size - wc_mapping_size; + bar0_uc_offset = wc_mapping_size; + } else { + // No WC mapping, map the entire BAR UC. + bar0_uc_size = bar0_uc_mapping.mapping_size; + bar0_uc_offset = 0; + } + + bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_uc_mapping.mapping_base + bar0_uc_offset); + + if (bar0_uc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR0 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + if (!bar0_wc) { + bar0_wc = bar0_uc; + } + + if (arch == tt::ARCH::WORMHOLE_B0) { + if (bar4_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_UC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR4 UC mapping."); + } + + system_reg_mapping_size = bar4_uc_mapping.mapping_size; + + system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_uc_mapping.mapping_base); + + if (system_reg_mapping == MAP_FAILED) { + throw std::runtime_error(std::string("BAR4 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + system_reg_start_offset = (512 - 16) * 1024*1024; + system_reg_offset_adjust = (512 - 32) * 1024*1024; + } else if(arch == tt::ARCH::BLACKHOLE) { + if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { + throw 
std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR2 UC mapping."); + } + + // Using UnCachable memory mode. This is used for accessing registers on Blackhole. + bar2_uc_size = bar2_uc_mapping.mapping_size; + bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar2_uc_mapping.mapping_base); + + if (bar2_uc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR2 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + if (bar4_wc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_WC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR4 WC mapping."); + } + + // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. + // WC doesn't guarantee write ordering but has better performance. + bar4_wc_size = bar4_wc_mapping.mapping_size; + bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_wc_mapping.mapping_base); + + if (bar4_wc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR4 WC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + } + + // GS+WH: ARC_SCRATCH[6], BH: NOC NODE_ID + read_checking_offset = arch == tt::ARCH::BLACKHOLE ? BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; +} + +void TTDevice::close_device() { + if (arch == tt::ARCH::BLACKHOLE && bar2_uc != nullptr && bar2_uc != MAP_FAILED) { + // Disable ATU index 0 + // TODO: Implement disabling for all indexes, once more host channels are enabled. 
+ uint64_t iatu_index = 0; + uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast<std::uint32_t *>(static_cast<uint8_t *>(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); + } + + if (device_fd != -1) { + ::close(device_fd); + } + + if (bar0_wc != nullptr && bar0_wc != MAP_FAILED && bar0_wc != bar0_uc) { + munmap(bar0_wc, bar0_wc_size); + } + + if (bar0_uc != nullptr && bar0_uc != MAP_FAILED) { + munmap(bar0_uc, bar0_uc_size); + } + + if (bar2_uc != nullptr && bar2_uc != MAP_FAILED) { + munmap(bar2_uc, bar2_uc_size); + } + + if (bar4_wc != nullptr && bar4_wc != MAP_FAILED) { + munmap(bar4_wc, bar4_wc_size); + } + + if (system_reg_mapping != nullptr && system_reg_mapping != MAP_FAILED) { + munmap(system_reg_mapping, system_reg_mapping_size); + } + + device_fd = -1; + bar0_uc = nullptr; + bar0_wc = nullptr; + bar2_uc = nullptr; + bar4_wc = nullptr; + system_reg_mapping = nullptr; +} + +void TTDevice::get_pcie_info() { + // Get PCIe device info through IOCTL -> tt-kmd and return pci_device_id and revision_id + std::uint16_t pcie_domain; + std::uint8_t pcie_bus; + std::uint8_t pcie_device; + std::uint8_t pcie_function; + + tenstorrent_get_device_info device_info; + memset(&device_info, 0, sizeof(device_info)); + device_info.in.output_size_bytes = sizeof(device_info.out); + if (ioctl(this->device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { + TT_THROW("Get PCIe device info failed on device: ", this->device_id); + } + pcie_domain = device_info.out.pci_domain; + pcie_bus = device_info.out.bus_dev_fn >> 8; + pcie_device = PCI_SLOT(device_info.out.bus_dev_fn); + pcie_function = PCI_FUNC(device_info.out.bus_dev_fn); + + // Get the PCIe revision ID from sysfs + static const char sys_pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/%s"; + char buf[sizeof(sys_pattern) + 10]; + + // revision pattern = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/revision" + std::snprintf(buf, sizeof(buf), sys_pattern, 
pcie_domain, pcie_bus, pcie_device, pcie_function, "revision"); + + std::ifstream revision_file(buf); + std::string revision_string; + if (std::getline(revision_file, revision_string)) { + this->pcie_device_id = device_info.out.device_id; + this->pcie_revision_id = std::stoi(revision_string, nullptr, 0); + } else { + TT_THROW("Revision ID /sys/ read failed for device: ", this->device_id); + } + + // Get NUMA node from sysfs + // numa node pattern = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node" + std::snprintf(buf, sizeof(buf), sys_pattern, pcie_domain, pcie_bus, pcie_device, pcie_function, "numa_node"); + + std::ifstream num_node_file(buf); + std::string numa_node_string; + if (std::getline(num_node_file, numa_node_string)) { + this->numa_node = std::stoi(numa_node_string, nullptr, 0); + } else { + TT_THROW("Numa node /sys/ read failed for device: ", this->device_id); + } +} + +// Open a unique device_id per host memory channel (workaround for ttkmd < 1.21 support for more than 1 pin per fd) +void TTDevice::open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels) { + for (int ch = 0; ch < num_host_mem_channels; ch++) { + log_debug(LogSiliconDriver, "Opening device_fd_per_host_ch device index: {} ch: {} (num_host_mem_channels: {})", device_id, ch, num_host_mem_channels); + int device_fd_for_host_mem = find_device(device_id); + if (device_fd_for_host_mem == -1) { + throw std::runtime_error(std::string("Failed opening a host memory device handle for device ") + std::to_string(device_id)); + } + device_fd_per_host_ch.push_back(device_fd_for_host_mem); + } +} + +tt::ARCH TTDevice::get_arch() const { + return arch; +} + +template +T* TTDevice::get_register_address(std::uint32_t register_offset) { + void *reg_mapping; + if (system_reg_mapping != nullptr && register_offset >= system_reg_start_offset) { + register_offset -= system_reg_offset_adjust; + reg_mapping = system_reg_mapping; + } else if (bar0_wc != bar0_uc && register_offset < bar0_wc_size) { + 
reg_mapping = bar0_wc; + } else { + register_offset -= bar0_uc_offset; + reg_mapping = bar0_uc; + } + return reinterpret_cast(static_cast(reg_mapping) + register_offset); +} + +void TTDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { + void *dest = nullptr; + if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { + byte_addr -= BAR0_BH_SIZE; + dest = reinterpret_cast(bar4_wc) + byte_addr; + }else { + dest = get_register_address(byte_addr); + } + + const void *src = reinterpret_cast(buffer_addr); + memcpy(dest, src, num_bytes); +// #ifndef DISABLE_ISSUE_3487_FIX +// // memcpy_to_device(dest, src, num_bytes); +// #else +// // ~4x faster than pci_read above, but works for all sizes and alignments +// memcpy(dest, src, num_bytes); +// #endif +} + +void TTDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { + void *src = nullptr; + if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { //arch == tt::ARCH::BLACKHOLE && + byte_addr -= BAR0_BH_SIZE; + src = reinterpret_cast(bar4_wc) + byte_addr; + } else { + src = get_register_address(byte_addr); + } + + void *dest = reinterpret_cast(buffer_addr); + memcpy(dest, src, num_bytes); +// #ifndef DISABLE_ISSUE_3487_FIX +// // memcpy_from_device(dest, src, num_bytes); +// #else +// // ~4x faster than pci_read above, but works for all sizes and alignments +// memcpy(dest, src, num_bytes); +// #endif +} + +// This is only needed for the BH workaround in iatu_configure_peer_region since no arc +void TTDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { + while (word_len-- != 0) { + *dest++ = *src++; + } +} + +void TTDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { + volatile uint32_t *dest = get_register_address(byte_addr); + const uint32_t *src = reinterpret_cast(data); + + write_regs(dest, src, word_len); +} + +void TTDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { + const 
volatile uint32_t *src = get_register_address(byte_addr); + uint32_t *dest = reinterpret_cast(data); + + while (word_len-- != 0) { + uint32_t temp = *src++; + memcpy(dest++, &temp, sizeof(temp)); + } +} + +void TTDevice::write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size){ + log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); + + volatile uint64_t *dest_qw = get_register_address(byte_addr); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); +#if defined(__ARM_ARCH) || defined(__riscv) + // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. + // Insert an explicit full memory barrier for ARM. + // Do the same for RISC-V. + tt_driver_atomics::mfence(); +#endif + *dest_qw = value_lower; + if (tlb_cfg_reg_size > 8) { + uint32_t* p_value_upper = reinterpret_cast(&value_upper); + *dest_extra_dw = p_value_upper[0]; + } + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + +// LOG2(" TLB "); +// print_buffer (&value_lower, sizeof(value_lower), true); +// if (tlb_cfg_reg_size > 8) { +// uint32_t* p_value_upper = reinterpret_cast(&value_upper); +// print_buffer (p_value_upper, sizeof(uint32_t), true); +// } +} diff --git a/device/pci_device.hpp b/device/pci_device.hpp new file mode 100644 index 00000000..fd918b77 --- /dev/null +++ b/device/pci_device.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include "device/tt_arch_types.h" +#include "architecture_implementation.h" + +static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC + +static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; +static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; + +// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h +static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; + +// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 +const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; + +typedef std::uint32_t DWORD; + +class TTDevice { +public: + TTDevice(int device_id, int logical_device_id); + ~TTDevice(); + TTDevice(const TTDevice&) = delete; // copy + void operator = (const TTDevice&) = delete; // copy assignment + + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); + void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); + void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); + void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + + void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels); + bool reset_board(); + tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); } + + int device_id; + int logical_id; + int device_fd = -1; + + // PCIe device info + std::uint32_t numa_node; + std::uint16_t pcie_device_id; + int pcie_revision_id; + + // BAR and regs mapping setup + 
std::vector device_fd_per_host_ch; + void *bar0_uc = nullptr; + std::size_t bar0_uc_size = 0; + std::size_t bar0_uc_offset = 0; + + void *bar0_wc = nullptr; + std::size_t bar0_wc_size = 0; + + void *bar2_uc = nullptr; + std::size_t bar2_uc_size; + + void *bar4_wc = nullptr; + std::uint64_t bar4_wc_size; + + void *system_reg_mapping = nullptr; + std::size_t system_reg_mapping_size; + + // These two are currently not used. + void *system_reg_wc_mapping = nullptr; + std::size_t system_reg_wc_mapping_size; + + std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + + // int sysfs_config_fd = -1; // not used + std::uint32_t read_checking_offset; + + tt::ARCH get_arch() const; + +private: + void get_pcie_info(); + void setup_device(); + void close_device(); + void drop(); + + bool reset_by_sysfs(); + bool reset_by_ioctl(); + + template + T* get_register_address(std::uint32_t register_offset); + + tt::ARCH arch; + std::unique_ptr architecture_implementation; +}; diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index c57bc1da..ec407f65 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -39,8 +39,6 @@ class tt_SimulationDevice: public tt_device { // void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, 
tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -57,16 +55,13 @@ class tt_SimulationDevice: public tt_device { // Misc. Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - // virtual bool noc_translation_en(); // virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); // virtual int get_number_of_chips_in_cluster(); // virtual std::unordered_set get_all_chips_in_cluster(); // virtual tt_ClusterDescriptor* get_cluster_description(); static std::vector detect_available_device_ids(); - // static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - // virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); // virtual void *channel_0_address(std::uint32_t offset, std::uint32_t device_id) const; virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; @@ -74,7 +69,6 @@ class tt_SimulationDevice: public tt_device { virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - // virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); // virtual tt_version get_ethernet_fw_version() const; diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 2ef5ec64..90f53855 100644 --- a/device/tt_cluster_descriptor.cpp +++ 
b/device/tt_cluster_descriptor.cpp @@ -52,19 +52,6 @@ std::vector> tt_ClusterDescri return directly_connected_channels; } -bool tt_ClusterDescriptor::channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const { - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { - return false; - } - - if (this->ethernet_connections.at(first).find(first_channel) == this->ethernet_connections.at(first).end()) { - return false; - } - - const auto &[connected_chip, connected_channel] = this->ethernet_connections.at(first).at(first_channel); - return connected_chip == second && connected_channel == second_channel; -} - // const eth_coord_t tt_ClusterDescriptor::get_chip_xy(const chip_id_t &chip_id) const { // // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy // location) return eth_coord_t(chip_id, 0, 0, 0); @@ -367,14 +354,6 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -std::set get_sequential_chip_id_set(int num_chips) { - std::set chip_ids; - for (int i = 0; i < num_chips; ++i) { - chip_ids.insert(static_cast(i)); - } - return chip_ids; -} - void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { @@ -594,22 +573,10 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::specify_enabled_devices(const std::vector &chip_ids) { - this->enabled_active_chips.clear(); - for (auto chip_id : chip_ids) { - this->enabled_active_chips.insert(chip_id); - } -} - void tt_ClusterDescriptor::enable_all_devices() { 
this->enabled_active_chips = this->all_chips; } -bool tt_ClusterDescriptor::chips_have_ethernet_connectivity() const { - return ethernet_connections.size() > 0; -} - - std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { auto eth_connections = std::unordered_map > >(); diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 1a923a8b..a68e1d8c 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -76,7 +76,6 @@ class tt_ClusterDescriptor { */ std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - bool channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const; bool is_chip_mmio_capable(const chip_id_t &chip_id) const; chip_id_t get_closest_mmio_capable_chip(const chip_id_t &chip); chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); @@ -87,7 +86,6 @@ class tt_ClusterDescriptor { // const eth_coord_t get_chip_xy(const chip_id_t &chip_id) const; // const chip_id_t get_chip_id_at_location(const eth_coord_t &chip_location) const; - bool chips_have_ethernet_connectivity() const; std::unordered_map get_harvesting_info() const; std::unordered_map get_noc_translation_table_en() const; std::unordered_map get_chip_locations() const; @@ -103,9 +101,6 @@ class tt_ClusterDescriptor { bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - void specify_enabled_devices(const std::vector &chip_ids); void enable_all_devices(); }; - -std::set get_sequential_chip_id_set(int num_chips); diff --git a/device/tt_device.h b/device/tt_device.h index f3064cd5..2fb3766d 100644 --- a/device/tt_device.h +++ b/device/tt_device.h 
@@ -20,7 +20,8 @@ #include "device/tlb.h" #include "device/tt_io.hpp" -using TLB_OFFSETS = tt::umd::tlb_offsets; +#include "pci_device.hpp" + using TLB_DATA = tt::umd::tlb_data; @@ -28,7 +29,6 @@ namespace boost::interprocess{ class named_mutex; } -class PCIDevice; class tt_ClusterDescriptor; enum tt_DevicePowerState { @@ -37,13 +37,6 @@ enum tt_DevicePowerState { LONG_IDLE }; -enum tt_MutexType { - LARGE_READ_TLB, - LARGE_WRITE_TLB, - SMALL_READ_WRITE_TLB, - ARC_MSG -}; - enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, @@ -215,10 +208,9 @@ struct tt_device_params { }; /** - * @brief Parent class for tt_SiliconDevice (Silicon Driver) and tt_VersimDevice (Versim Backend API). - * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for - * Silicon and Versim. - * Valid usage consists of declaring a tt_device object and initializing it to either a Silicon or Versim backend. + * @brief Parent class for tt_SiliconDevice (Silicon Driver). + * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. + * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. */ class tt_device @@ -294,7 +286,7 @@ class tt_device throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * @brief Start the Silicon on Versim Device + * @brief * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. 
* \param device_params tt_device_params object specifying initialization configuration */ @@ -353,10 +345,8 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } @@ -364,43 +354,16 @@ class tt_device throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } /** - * @brief Write uint32_t vector to specified device, core and address (defined for Silicon and Versim). + * @brief Write uint32_t vector to specified device, core and address (defined for Silicon). 
* \param vec Vector to write * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - /** - * @brief Unroll/replicate uint32_t data (as specified by ptr + len pair) and write it to specified device, core and address (defined for Silicon). 
- * \param mem_ptr src data address - * \param len src data size (specified for uint32_t) - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param fallback_tlb Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - // Only implement this for Silicon Backend - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** - * @brief Unroll/replicate a uint32_t vector and write it to specified device, core and address (defined for Silicon and Versim). - * \param vec Vector to write - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** * @brief Read uint32_t data from a specified device, core and address to host memory (defined for Silicon). * \param mem_ptr dest data address on host (expected to be preallocated, depending on transfer size) @@ -415,7 +378,7 @@ class tt_device } /** - * @brief Read a uint32_t vector from a specified device, core and address to host memory (defined for Silicon and Versim). 
+ * @brief Read a uint32_t vector from a specified device, core and address to host memory (defined for Silicon). * \param vec host side vector to populate with data read from device (does not need to be preallocated) * \param core chip-x-y struct specifying device and core * \param addr Address to read from @@ -491,13 +454,7 @@ class tt_device virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } - /** - * @brief Get Hardware Translation Table state - * \returns true if translation tables are enabled (WH only) - */ - virtual bool noc_translation_en() { - throw std::runtime_error("---- tt_device:noc_translation_en is not implemented\n"); - } + /** * @brief Issue message to device, meant to be picked up by ARC Firmare * \param logical_device_id Chip to target @@ -566,14 +523,6 @@ class tt_device return std::map(); } - /** - * @brief Get the PCIe speed for a specific device based on link width and link speed - * \returns Bandwidth in Gbps - */ - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id) { - return 8 * 16; // default to x8 at 16 GT/s - } - virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_numa_node_for_pcie_device is not implemented\n"); } @@ -585,30 +534,6 @@ class tt_device virtual tt_version get_ethernet_fw_version() const { throw std::runtime_error("---- tt_device::get_ethernet_fw_version is not implemented \n"); } - - /** - * @brief Get the total hugepage (host memory) size allocated for a device. - * This memory is not entirely accessible by device. 
To query the number of channels - * or memory per channel that is accessbile, see get_host_channel_size or get_num_host_channels - * \param src_device_id Device for which allocated host memory is being queried - * \returns Total memory allocated on host for a specific device - * - */ - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1) { - throw std::runtime_error("---- tt_device::dma_allocation_size is not implemented\n"); - return 0; - } - - /** - * Get the address for the MMIO mapped region on Channel (as seen from host memory) - * \param offset Address in DRAM - * \param target chip-x-y struct specifying device and core of target DRAM - * \returns Host interpretation of MMIO mapped channel 0 address - */ - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - throw std::runtime_error("---- tt_device::channel_address is not implemented\n"); - return nullptr; - } /** * @brief Query number of DRAM channels on a specific device * \param device_id Logical device id to query @@ -676,67 +601,6 @@ class tt_device std::unordered_map soc_descriptor_per_chip = {}; }; -class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; -} - -/** - * @brief Versim Backend Class, derived from the tt_device class - * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
-*/ -class tt_VersimDevice: public tt_device -{ - public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void 
read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_VersimDevice(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); - virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); - virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); - virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: - bool stop(); - tt_device_l1_address_params l1_address_params; - tt_device_dram_address_params dram_address_params; - versim::VersimSimulator* versim; - std::shared_ptr ndesc; - void* p_ca_soc_manager; -}; - #include "device/architecture_implementation.h" /** @@ -781,14 +645,10 @@ class tt_SiliconDevice: public tt_device virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = 
false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -809,7 +669,7 @@ class tt_SiliconDevice: public tt_device /** * @brief This API allows you to write directly to device memory that is 
addressable by a static TLB */ - std::function get_fast_pcie_static_tlb_write_callable(int device_id); + std::function get_fast_pcie_static_tlb_write_callable(int device_id); /** * @brief Provide fast write access to a statically-mapped TLB. @@ -824,40 +684,30 @@ class tt_SiliconDevice: public tt_device */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); - /** - * @brief Returns the DMA buf size - */ - uint32_t get_m_dma_buf_size() const; // Misc. Functions to Query/Set Device State virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); static int detect_number_of_chips(); static std::vector detect_available_device_ids(); - static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); 
static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); - static std::unordered_map get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; @@ -871,24 +721,19 @@ class tt_SiliconDevice: public tt_device void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &cores); + void broadcast_pcie_tensix_risc_reset(TTDevice *device, const TensixSoftResetOptions &cores); void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); - void init_pcie_iatus_no_p2p(); + void init_pcie_iatus(); // No more p2p support. 
bool init_hugepage(chip_id_t device_id); - bool init_dmabuf(chip_id_t device_id); void check_pcie_device_initialized(int device_id); - bool init_dma_turbo_buf(struct PCIdevice* pci_device); - bool uninit_dma_turbo_buf(struct PCIdevice* pci_device); - static std::map get_physical_device_id_to_bus_id_map(std::vector physical_device_ids); void set_pcie_power_state(tt_DevicePowerState state); int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); - uint32_t get_power_state_arc_msg(struct PCIdevice* pci_device, tt_DevicePowerState state); + uint32_t get_power_state_arc_msg(TTDevice *pci_device, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); @@ -900,13 +745,11 @@ class tt_SiliconDevice: public tt_device int get_clock(int logical_device_id); // Communication Functions - void read_dma_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); - void write_dma_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); + void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const 
std::string& fallback_tlb); - void write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - void rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t address, uint32_t unroll_count); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); @@ -922,7 +765,7 @@ class tt_SiliconDevice: public tt_device int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - struct PCIdevice* get_pci_device(int pci_intf_id) const; + TTDevice *get_pci_device(int pci_intf_id) const; std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); @@ -930,9 +773,7 @@ class tt_SiliconDevice: public tt_device // Test functions void verify_eth_fw(); void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int test_pcie_tlb_setup 
(struct PCIdevice* pci_device); int test_setup_interface (); - int test_broadcast (int logical_device_id); // State variables tt_device_dram_address_params dram_address_params; @@ -944,7 +785,7 @@ class tt_SiliconDevice: public tt_device std::set target_remote_chips = {}; tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id); tt::ARCH arch_name; - std::map m_pci_device_map; // Map of enabled pci devices + std::map> m_pci_device_map; // Map of enabled pci devices int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // Level of printouts. Controlled by env var TT_PCI_LOG_LEVEL @@ -962,17 +803,10 @@ class tt_SiliconDevice: public tt_device static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - int active_core_epoch = EPOCH_ETH_CORES_START_ID; - bool erisc_q_ptrs_initialized = false; - std::vector erisc_q_ptrs_epoch[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; - bool erisc_q_wrptr_updated[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; std::vector< std::vector > remote_transfer_ethernet_cores; bool flush_non_mmio = false; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; - // Size of the PCIE DMA buffer - // The setting should not exceed MAX_DMA_BYTES - std::uint32_t m_dma_buf_size; std::unordered_map noc_translation_enabled_for_chip = {}; std::map> hardware_resource_mutex_map = {}; std::unordered_map> harvested_coord_translation = {}; @@ -991,9 +825,6 @@ class tt_SiliconDevice: public tt_device std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; - std::uint64_t buf_physical_addr = 0; - void * buf_mapping = nullptr; - int driver_id; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; diff --git 
a/device/tt_emulation_device.cpp b/device/tt_emulation_device.cpp deleted file mode 100644 index 3e64c15e..00000000 --- a/device/tt_emulation_device.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include - -#include "common/logger.hpp" -#include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" -#include "tt_emu_zemi3_wrapper.h" - - -tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - - log_info(tt::LogEmulationDriver, "Created Emulation Device "); -} - -tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); -} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); -} - -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - - return data; -} - - -void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); -} - -void tt_emulation_device::deassert_risc_reset() { - 
tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); -} - -void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); -} - -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); -} - -void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); -} - - - -void tt_emulation_device::close_device() { - log_info(tt::LogEmulationDriver, "Closing Emulation Device "); - tt_zebu_wrapper_inst->zebu_finish(); -} - -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ -) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); -} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
- // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } -} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - - write(core, addr, byte_data); -} - -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - // Placeholder - 
implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - - - -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); -} - -void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; -} -tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } - -std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); - return {}; -} - -std::set tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; -} - -void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { - dram_address_params = dram_address_params_; -} -int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } 
-std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } -int tt_emulation_device::detect_number_of_chips() { return 1; } - -bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -std::map tt_emulation_device::get_clocks() { - return std::map(); -} - -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; -} - - - diff --git a/device/tt_emulation_device.h b/device/tt_emulation_device.h deleted file mode 100644 index 259841c4..00000000 --- a/device/tt_emulation_device.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include -#include -#include "tt_soc_descriptor.h" -#include "tt_xy_pair.h" -#include "tt_device.h" - -// use forward declaration here so we do not need to include tt_zebu_wrapper.h -class tt_zebu_wrapper; - -class tt_emulation_device : public tt_device { -public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool 
ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - 
tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; - - - - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - -}; - diff --git a/device/tt_emulation_stub.cpp b/device/tt_emulation_stub.cpp deleted file mode 100644 index 33fc3c90..00000000 --- a/device/tt_emulation_stub.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include - -#include "common/logger.hpp" -#include "tt_emulation_device.h" - -tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); -} - - -tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} - -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} - - -void tt_emulation_device::start_device(const tt_device_params& device_params) {} - -void tt_emulation_device::deassert_risc_reset() {} - -void tt_emulation_device::assert_risc_reset() {} - -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_emulation_device::close_device() {} - -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, 
const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - - -// ------------------------- -// Not sure how to implement these functions below, leaving them blank/default for now -void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; -} -tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } - -std::set tt_emulation_device::get_target_mmio_device_ids() {return {};} - -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} - -void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set 
tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } -int tt_emulation_device::detect_number_of_chips() { return 1; } - -bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -std::map tt_emulation_device::get_clocks() {return std::map();} - -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} - - - diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 0cfdf027..cb7b0d0b 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -27,39 +27,36 @@ #include #include #include +#include +#include +#include #include #include -#include -#include +// #include +// #include #include -#include -#include +// #include +// #include #include #include #include -#include +// #include + +#include "yaml-cpp/yaml.h" +#include "common/logger.hpp" +#include "device/cpuset_lib.hpp" +#include "device/driver_atomics.h" #include "device/architecture.h" #include "device/architecture_implementation.h" #include "device/tlb.h" #include "device/tt_arch_types.h" #include "tt_device.h" -#include "kmdif.h" #include "ioctl.h" //#include "epoch_q.h" -#include -#include "yaml-cpp/yaml.h" -#include -#include - -#include -#include "device/cpuset_lib.hpp" -#include "common/logger.hpp" -#include "device/driver_atomics.h" - #define WHT "\e[0;37m" #define BLK "\e[0;30m" #define RED "\e[0;31m" @@ -84,17 +81,6 @@ void clr_printf(const char *clr, const char *fmt, ...) 
{ int g_DEBUG_LEVEL; // /src/t6ifc/t6py/packages/tenstorrent/jlink/jtag_comm.cpp bool g_READ_CHECKING_ENABLED = true; -bool g_USE_MSI_FOR_DMA = false; // Whether to wait for MSI after DMA transfer, or poll a variable -uint32_t g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size -uint32_t g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size - -// Address in CSM where the DMA request structure resides -uint32_t c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0; -// Address where the trigger for transfer resides -uint32_t c_DMA_TRIGGER_ADDRESS = 0; -// To trigger arc interrupt -uint32_t c_ARC_MISC_CNTL_ADDRESS = 0; - // Print all buffers smaller than this number of bytes uint32_t g_NUM_BYTES_TO_PRINT = 8; @@ -102,24 +88,7 @@ uint32_t g_NUM_BYTES_TO_PRINT = 8; const bool g_SINGLE_PIN_PAGE_PER_FD_WORKAROND = true; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; -volatile bool msi_interrupt_received = false; - -const char device_name_pattern[] = "/dev/tenstorrent/%u"; - -const std::string tlb_large_read_mutex_name_prefix = "mem_tlb_large_read_mutex_pci_interface_id_"; -const std::string tlb_large_write_mutex_name_prefix = "mem_tlb_large_write_mutex_pci_interface_id_"; -const std::string tlb_small_read_write_mutex_name_prefix = "mem_tlb_small_read_write_mutex_pci_interface_id_"; -const std::string arc_msg_mutex_name_prefix = "arc_msg_mutex_pci_interface_id_"; - -static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); -static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC - -static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; -static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; - -const uint32_t DMA_BUF_REGION_SIZE = 4 << 20; const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB -const uint32_t DMA_MAP_MASK = DMA_BUF_REGION_SIZE - 1; const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1; static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; @@ -128,209 +97,9 @@ static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; const char* hugepage_dir_env = std::getenv("TT_BACKEND_HUGEPAGE_DIR"); std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages-1G"; -// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 -const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; - // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; -// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h -const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; - -// Foward declarations -PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */); -int ttkmd_close(struct PCIdevice &device); - -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size); -void pcie_init_dma_transfer_turbo (PCIdevice* dev); - -void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); - -// Stash all the fields of TTDevice in TTDeviceBase to make moving simpler. 
-struct TTDeviceBase -{ - unsigned int index; - - int device_fd = -1; - std::vector device_fd_per_host_ch; - void *bar0_uc = nullptr; - std::size_t bar0_uc_size = 0; - std::size_t bar0_uc_offset = 0; - - void *bar0_wc = nullptr; - std::size_t bar0_wc_size = 0; - - void *bar2_uc = nullptr; - std::size_t bar2_uc_size; - - void *bar4_wc = nullptr; - std::uint64_t bar4_wc_size; - - void *system_reg_mapping = nullptr; - std::size_t system_reg_mapping_size; - - void *system_reg_wc_mapping = nullptr; - std::size_t system_reg_wc_mapping_size; - - std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. - - int sysfs_config_fd = -1; - std::uint16_t pci_domain; - std::uint8_t pci_bus; - std::uint8_t pci_device; - std::uint8_t pci_function; - - unsigned int next_dma_buf = 0; - - DMAbuffer dma_completion_flag_buffer; // When DMA completes, it writes to this buffer - DMAbuffer dma_transfer_buffer; // Buffer for large DMA transfers - - std::uint32_t max_dma_buf_size_log2; - - tenstorrent_get_device_info_out device_info; - - std::vector dma_buffer_mappings; - - std::uint32_t read_checking_offset; -}; - -struct TTDevice : TTDeviceBase -{ - static TTDevice open(unsigned int device_id); - void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels); - ~TTDevice() { reset(); } - - TTDevice(const TTDevice&) = delete; - void operator = (const TTDevice&) = delete; - - TTDevice(TTDevice &&that) : TTDeviceBase(std::move(that)), arch(that.arch), architecture_implementation(std::move(that.architecture_implementation)) { that.drop(); } - TTDevice &operator = (TTDevice &&that) { - reset(); - - *static_cast(this) = std::move(that); - arch = that.arch; - architecture_implementation = std::move(that.architecture_implementation); - that.drop(); - - return *this; - } - - void suspend_before_device_reset() { - reset(); - } - - void 
resume_after_device_reset() { - do_open(); - } - - tt::ARCH get_arch() const { return arch; } - tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); } - -private: - TTDevice() = default; - - void reset() { - if (arch == tt::ARCH::BLACKHOLE && bar2_uc != nullptr && bar2_uc != MAP_FAILED) { - // Disable ATU index 0 - // TODO: Implement disabling for all indexes, once more host channels are enabled. - uint64_t iatu_index = 0; - uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - } - - if (device_fd != -1) { - close(device_fd); - } - - if (bar0_wc != nullptr && bar0_wc != MAP_FAILED && bar0_wc != bar0_uc) { - munmap(bar0_wc, bar0_wc_size); - } - - if (bar0_uc != nullptr && bar0_uc != MAP_FAILED) { - munmap(bar0_uc, bar0_uc_size); - } - - if (bar2_uc != nullptr && bar2_uc != MAP_FAILED) { - munmap(bar2_uc, bar2_uc_size); - } - - if (bar4_wc != nullptr && bar4_wc != MAP_FAILED) { - munmap(bar4_wc, bar4_wc_size); - } - - if (system_reg_mapping != nullptr && system_reg_mapping != MAP_FAILED) { - munmap(system_reg_mapping, system_reg_mapping_size); - } - - for (auto &&buf : dma_buffer_mappings) { - munmap(buf.pBuf, buf.size); - } - - if (sysfs_config_fd != -1) { - close(sysfs_config_fd); - } - - drop(); - } - - void drop() { - device_fd = -1; - bar0_uc = nullptr; - bar0_wc = nullptr; - bar2_uc = nullptr; - bar4_wc = nullptr; - system_reg_mapping = nullptr; - dma_buffer_mappings.clear(); - sysfs_config_fd = -1; - } - - void do_open(); - - tt::ARCH arch; - std::unique_ptr architecture_implementation; -}; - -TTDevice TTDevice::open(unsigned int device_id) { - TTDevice ttdev; - static int unique_id = 0; - ttdev.index = device_id; - ttdev.do_open(); - - return ttdev; -} - -bool is_grayskull(const uint16_t device_id) { - return device_id == 0xfaca; -} - 
-bool is_wormhole(const uint16_t device_id) { - return device_id == 0x401e; -} - -bool is_blackhole(const uint16_t device_id) { - return device_id == 0xb140; -} - -bool is_grayskull(const tenstorrent_get_device_info_out &device_info) { - return is_grayskull(device_info.device_id); -} - -bool is_wormhole(const tenstorrent_get_device_info_out &device_info) { - return is_wormhole(device_info.device_id); -} - -bool is_wormhole_b0(const uint16_t device_id, const uint16_t revision_id) { - return (is_wormhole(device_id) && (revision_id == 0x01)); -} - -bool is_blackhole(const tenstorrent_get_device_info_out &device_info) { - return is_blackhole(device_info.device_id); -} - - template void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; @@ -407,246 +176,6 @@ uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_dev } -int find_device(const uint16_t device_id) { - // returns device id if found, otherwise -1 - char device_name[sizeof(device_name_pattern) + std::numeric_limits::digits10]; - std::snprintf(device_name, sizeof(device_name), device_name_pattern, (unsigned int)device_id); - int device_fd = ::open(device_name, O_RDWR | O_CLOEXEC); - LOG2 ("find_device() open call returns device_fd: %d for device_name: %s (device_id: %d)\n", device_fd, device_name, device_id); - return device_fd; -} - -// Open a unique device_id per host memory channel (workaround for ttkmd < 1.21 support for more than 1 pin per fd) -void TTDevice::open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels) { - for (int ch = 0; ch < num_host_mem_channels; ch++) { - log_debug(LogSiliconDriver, "Opening device_fd_per_host_ch device index: {} ch: {} (num_host_mem_channels: {})", index, ch, num_host_mem_channels); - int device_fd_for_host_mem = find_device(index); - if (device_fd_for_host_mem == -1) { - throw std::runtime_error(std::string("Failed opening a host memory device handle for device ") + std::to_string(index)); 
- } - device_fd_per_host_ch.push_back(device_fd_for_host_mem); - } -} - -int get_revision_id(TTDevice *dev); - -tt::ARCH detect_arch(TTDevice *dev) { - if (is_grayskull(dev->device_info.device_id)) { - return tt::ARCH::GRAYSKULL; - } else if (is_wormhole_b0(dev->device_info.device_id, get_revision_id(dev))) { - return tt::ARCH::WORMHOLE_B0; - } else if (is_wormhole(dev->device_info.device_id)) { - return tt::ARCH::WORMHOLE; - } else if (is_blackhole(dev->device_info.device_id)) { - return tt::ARCH::BLACKHOLE; - } else { - throw std::runtime_error(std::string("Unknown device id.")); - } -} - -tt::ARCH detect_arch(PCIdevice *pci_device) { - return pci_device->hdev->get_arch(); -} - -tt::ARCH detect_arch(uint16_t device_id) { - tt::ARCH arch_name = tt::ARCH::Invalid; - if (find_device(device_id) == -1) { - WARN("---- tt_SiliconDevice::detect_arch did not find silcon device_id: %d\n", device_id); - return arch_name; - } - struct PCIdevice pci_device = ttkmd_open((DWORD)device_id, false); - - arch_name = detect_arch(&pci_device); - - ttkmd_close(pci_device); - return arch_name; -} - -void TTDevice::do_open() { - device_fd = find_device(index); - if (device_fd == -1) { - throw std::runtime_error(std::string("Failed opening a handle for device ") + std::to_string(index)); - } - - tenstorrent_get_device_info device_info; - memset(&device_info, 0, sizeof(device_info)); - device_info.in.output_size_bytes = sizeof(device_info.out); - - if (ioctl(device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { - throw std::runtime_error(std::string("Get device info failed on device ") + std::to_string(index) + "."); - } - - this->device_info = device_info.out; - - max_dma_buf_size_log2 = device_info.out.max_dma_buf_size_log2; - - struct { - tenstorrent_query_mappings query_mappings; - tenstorrent_mapping mapping_array[8]; - } mappings; - - memset(&mappings, 0, sizeof(mappings)); - mappings.query_mappings.in.output_mapping_count = 8; - - if (ioctl(device_fd, 
TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { - throw std::runtime_error(std::string("Query mappings failed on device ") + std::to_string(index) + "."); - } - - // Mapping resource to BAR - // Resource 0 -> BAR0 - // Resource 1 -> BAR2 - // Resource 2 -> BAR4 - tenstorrent_mapping bar0_uc_mapping; - tenstorrent_mapping bar0_wc_mapping; - tenstorrent_mapping bar2_uc_mapping; - tenstorrent_mapping bar2_wc_mapping; - tenstorrent_mapping bar4_uc_mapping; - tenstorrent_mapping bar4_wc_mapping; - - memset(&bar0_uc_mapping, 0, sizeof(bar0_uc_mapping)); - memset(&bar0_wc_mapping, 0, sizeof(bar0_wc_mapping)); - memset(&bar2_uc_mapping, 0, sizeof(bar2_uc_mapping)); - memset(&bar2_wc_mapping, 0, sizeof(bar2_wc_mapping)); - memset(&bar4_uc_mapping, 0, sizeof(bar4_uc_mapping)); - memset(&bar4_wc_mapping, 0, sizeof(bar4_wc_mapping)); - - for (unsigned int i = 0; i < mappings.query_mappings.in.output_mapping_count; i++) { - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_UC) { - bar0_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { - bar0_wc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_UC) { - bar2_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_WC) { - bar2_wc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_UC) { - bar4_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_WC) { - bar4_wc_mapping = mappings.mapping_array[i]; - } - - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", - mappings.mapping_array[i].mapping_id, - (void *)mappings.mapping_array[i].mapping_base, - mappings.mapping_array[i].mapping_size); - } - - if (bar0_uc_mapping.mapping_id != 
TENSTORRENT_MAPPING_RESOURCE0_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR0 UC mapping."); - } - - auto wc_mapping_size = is_blackhole(device_info.out) ? BH_BAR0_WC_MAPPING_SIZE : GS_BAR0_WC_MAPPING_SIZE; - - // Attempt WC mapping first so we can fall back to all-UC if it fails. - if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { - bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_wc_mapping.mapping_base); - if (bar0_wc == MAP_FAILED) { - bar0_wc_size = 0; - bar0_wc = nullptr; - } - } - - if (bar0_wc) { - // The bottom part of the BAR is mapped WC. Map the top UC. - bar0_uc_size = bar0_uc_mapping.mapping_size - wc_mapping_size; - bar0_uc_offset = wc_mapping_size; - } else { - // No WC mapping, map the entire BAR UC. - bar0_uc_size = bar0_uc_mapping.mapping_size; - bar0_uc_offset = 0; - } - - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_uc_mapping.mapping_base + bar0_uc_offset); - - if (bar0_uc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR0 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - if (!bar0_wc) { - bar0_wc = bar0_uc; - } - - if (is_wormhole(device_info.out)) { - if (bar4_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR4 UC mapping."); - } - - this->system_reg_mapping_size = bar4_uc_mapping.mapping_size; - - this->system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_uc_mapping.mapping_base); - - if (this->system_reg_mapping == MAP_FAILED) { - throw std::runtime_error(std::string("BAR4 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - this->system_reg_start_offset = (512 - 16) * 1024*1024; - 
this->system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(is_blackhole(device_info.out)) { - if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR2 UC mapping."); - } - - // Using UnCachable memory mode. This is used for accessing registers on Blackhole. - this->bar2_uc_size = bar2_uc_mapping.mapping_size; - this->bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar2_uc_mapping.mapping_base); - - if (this->bar2_uc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR2 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - if (bar4_wc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_WC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR4 WC mapping."); - } - - // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. - // WC doesn't guarantee write ordering but has better performance. - this->bar4_wc_size = bar4_wc_mapping.mapping_size; - this->bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_wc_mapping.mapping_base); - - if (this->bar4_wc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR4 WC memory mapping failed for device ") + std::to_string(index) + "."); - } - } - pci_domain = device_info.out.pci_domain; - pci_bus = device_info.out.bus_dev_fn >> 8; - pci_device = PCI_SLOT(device_info.out.bus_dev_fn); - pci_function = PCI_FUNC(device_info.out.bus_dev_fn); - - arch = detect_arch(this); - architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch)); - - // GS+WH: ARC_SCRATCH[6], BH: NOC NODE_ID - this->read_checking_offset = is_blackhole(device_info.out) ? 
BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; -} - -void set_debug_level(int dl) { - g_DEBUG_LEVEL = dl; -} - -std::uint64_t pci_dma_buffer_get_physical_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pDma, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pDma); -} - -std::uint64_t pci_dma_buffer_get_user_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pBuf, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pBuf); -} - -DWORD ttkmd_init() { return 0; } // 0 on success -DWORD ttkmd_uninit() { return 0; } // 0 on success - bool is_char_dev(const dirent *ent, const char *parent_dir) { if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) { char name[2 * NAME_MAX + 2]; @@ -708,267 +237,93 @@ std::vector ttkmd_scan() { return found_devices; } -int get_config_space_fd(TTDevice *dev) { - if (dev->sysfs_config_fd == -1) { - static const char pattern[] = "/sys/bus/pci/devices/0000:%02x:%02x.%u/config"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - dev->sysfs_config_fd = open(buf, O_RDWR); - - if (dev->sysfs_config_fd == -1) { - dev->sysfs_config_fd = open(buf, O_RDONLY); - } - } - - return dev->sysfs_config_fd; -} - -int get_revision_id(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/revision"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream revision_file(buf); - std::string revision_string; - if (std::getline(revision_file, revision_string)) { - return std::stoi(revision_string, nullptr, 0); - } else { - throw std::runtime_error("Revision ID read failed for device"); - } -} - -int get_link_width(TTDevice *dev) { - - static const char pattern[] = 
"/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_width"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkwidth_file(buf); - std::string linkwidth_string; - if (std::getline(linkwidth_file, linkwidth_string)) { - return std::stoi(linkwidth_string, nullptr, 0); - } else { - throw std::runtime_error("Link width read failed for device"); - } -} - -int get_link_speed(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_speed"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkspeed_file(buf); - std::string linkspeed_string; - int linkspeed; - if (std::getline(linkspeed_file, linkspeed_string) && sscanf(linkspeed_string.c_str(), "%d", &linkspeed) == 1) { - return linkspeed; - } else { - throw std::runtime_error("Link speed read failed for device"); - } -} - -int get_numa_node(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream num_node_file(buf); - std::string numa_node_string; - if (std::getline(num_node_file, numa_node_string)) { - return std::stoi(numa_node_string, nullptr, 0); - } else { - return -1; - } -} - -std::uint64_t read_bar0_base(TTDevice *dev) { - const std::uint64_t bar_address_mask = ~(std::uint64_t)0xF; - unsigned int bar0_config_offset = 0x10; - - std::uint64_t bar01; - if (pread(get_config_space_fd(dev), &bar01, sizeof(bar01), bar0_config_offset) != sizeof(bar01)) { - return 0; - } - - return bar01 & 
bar_address_mask; -} - -DMAbuffer allocate_dma_buffer(TTDevice *ttdev, unsigned int buffer_index, std::size_t size) { - tenstorrent_allocate_dma_buf allocate_dma_buf; - - if (size > std::numeric_limits::max()) { - throw std::runtime_error(std::string("Requested DMA buffer size (" + std::to_string(allocate_dma_buf.in.requested_size) - + ") bytes exceeds interface size limit for device " + std::to_string(ttdev->index) + ", with error: " + std::strerror(errno))); - } +// bool is_hardware_hung(const TTDevice *dev) { +// volatile const void *addr = reinterpret_cast(dev->bar0_uc) + (dev->get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - dev->bar0_uc_offset; +// std::uint32_t scratch_data = *reinterpret_cast(addr); - memset(&allocate_dma_buf, 0, sizeof(allocate_dma_buf)); - allocate_dma_buf.in.requested_size = std::max(size, getpagesize()); - allocate_dma_buf.in.buf_index = buffer_index; - - if (ioctl(ttdev->device_fd, TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF, &allocate_dma_buf) == -1) { - throw std::runtime_error(std::string("DMA buffer allocation failed (") + std::to_string(allocate_dma_buf.in.requested_size) - + " bytes) for device " + std::to_string(ttdev->index) + "."); - } - - void *mapping = mmap(NULL, allocate_dma_buf.out.size, PROT_READ | PROT_WRITE, MAP_SHARED, ttdev->device_fd, allocate_dma_buf.out.mapping_offset); - - log_trace(tt::LogSiliconDriver, "DMA buffer succeeded with size {} offset {} phy_addr {}", allocate_dma_buf.out.size, allocate_dma_buf.out.mapping_offset, allocate_dma_buf.out.physical_address); - - if (mapping == MAP_FAILED) { - throw std::runtime_error(std::string("DMA buffer memory mapping failed for device ") + std::to_string(ttdev->index) + "."); - } - - DMAbuffer dmabuf; - dmabuf.pBuf = mapping; - dmabuf.pDma = allocate_dma_buf.out.physical_address; - dmabuf.size = allocate_dma_buf.out.size; - - ttdev->dma_buffer_mappings.push_back(dmabuf); - - return dmabuf; -} - -PCIdevice ttkmd_open(DWORD device_id, bool sharable /* 
= false */) -{ - (void)sharable; // presently ignored - - auto ttdev = std::make_unique(TTDevice::open(device_id)); - - PCIdevice device; - device.id = device_id; - device.hdev = ttdev.get(); - device.vendor_id = ttdev->device_info.vendor_id; - device.device_id = ttdev->device_info.device_id; - device.subsystem_vendor_id = ttdev->device_info.subsystem_vendor_id; - device.subsystem_id = ttdev->device_info.subsystem_id; - device.dwBus = ttdev->pci_bus; - device.dwSlot = ttdev->pci_device; - device.dwFunction = ttdev->pci_function; - device.BAR_addr = read_bar0_base(ttdev.get()); - device.BAR_size_bytes = ttdev->bar0_uc_size; - device.revision_id = get_revision_id(ttdev.get()); - ttdev.release(); - - return device; -} - -int ttkmd_close(struct PCIdevice &device) { - delete static_cast(device.hdev); - - return 0; -} - -template -volatile T* register_address(const TTDevice *dev, std::uint32_t register_offset) { - void *reg_mapping; - if (dev->system_reg_mapping != nullptr && register_offset >= dev->system_reg_start_offset) { - register_offset -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && register_offset < dev->bar0_wc_size) { - reg_mapping = dev->bar0_wc; - } else { - register_offset -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - return reinterpret_cast(static_cast(reg_mapping) + register_offset); -} +// return (scratch_data == 0xffffffffu); +// } -bool is_hardware_hung(const TTDevice *dev) { - volatile const void *addr = reinterpret_cast(dev->bar0_uc) + (dev->get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - dev->bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); +// bool reset_by_sysfs(TTDevice *dev) { - return (scratch_data == 0xffffffffu); -} +// const char *virtual_env = getenv("VIRTUAL_ENV"); +// if (virtual_env == nullptr) +// return false; -bool reset_by_sysfs(TTDevice *dev) { +// std::string reset_helper_path = virtual_env; 
+// reset_helper_path += "/bin/reset-helper"; - const char *virtual_env = getenv("VIRTUAL_ENV"); - if (virtual_env == nullptr) - return false; +// std::string busid = std::to_string(dev->pci_bus); - std::string reset_helper_path = virtual_env; - reset_helper_path += "/bin/reset-helper"; +// dev->suspend_before_device_reset(); - std::string busid = std::to_string(dev->pci_bus); +// char *argv[3]; +// argv[0] = const_cast(reset_helper_path.c_str()); +// argv[1] = const_cast(busid.c_str()); +// argv[2] = nullptr; - dev->suspend_before_device_reset(); +// pid_t reset_helper_pid; +// if (posix_spawn(&reset_helper_pid, reset_helper_path.c_str(), nullptr, nullptr, argv, environ) != 0) +// return false; - char *argv[3]; - argv[0] = const_cast(reset_helper_path.c_str()); - argv[1] = const_cast(busid.c_str()); - argv[2] = nullptr; +// siginfo_t reset_helper_status; +// if (waitid(P_PID, reset_helper_pid, &reset_helper_status, WEXITED) != 0) +// return false; - pid_t reset_helper_pid; - if (posix_spawn(&reset_helper_pid, reset_helper_path.c_str(), nullptr, nullptr, argv, environ) != 0) - return false; +// if (reset_helper_status.si_status != 0) +// return false; - siginfo_t reset_helper_status; - if (waitid(P_PID, reset_helper_pid, &reset_helper_status, WEXITED) != 0) - return false; - - if (reset_helper_status.si_status != 0) - return false; +// dev->resume_after_device_reset(); - dev->resume_after_device_reset(); - - return true; -} +// return true; +// } -bool reset_by_ioctl(TTDevice *dev) { - struct tenstorrent_reset_device reset_device; - memset(&reset_device, 0, sizeof(reset_device)); +// bool reset_by_ioctl(TTDevice *dev) { +// struct tenstorrent_reset_device reset_device; +// memset(&reset_device, 0, sizeof(reset_device)); - reset_device.in.output_size_bytes = sizeof(reset_device.out); - reset_device.in.flags = 0; +// reset_device.in.output_size_bytes = sizeof(reset_device.out); +// reset_device.in.flags = 0; - if (ioctl(dev->device_fd, TENSTORRENT_IOCTL_RESET_DEVICE, 
&reset_device) == -1) { - return false; - } +// if (ioctl(dev->device_fd, TENSTORRENT_IOCTL_RESET_DEVICE, &reset_device) == -1) { +// return false; +// } - return (reset_device.out.result == 0); -} +// return (reset_device.out.result == 0); +// } -bool auto_reset_board(TTDevice *dev) { - return ((reset_by_ioctl(dev) || reset_by_sysfs(dev)) && !is_hardware_hung(dev)); -} +// bool auto_reset_board(TTDevice *dev) { +// return ((reset_by_ioctl(dev) || reset_by_sysfs(dev)) && !is_hardware_hung(dev)); +// } -void detect_ffffffff_read(TTDevice *dev, std::uint32_t data_read = 0xffffffffu) { - if (g_READ_CHECKING_ENABLED && data_read == 0xffffffffu && is_hardware_hung(dev)) { - std::uint32_t scratch_data = *register_address(dev, dev->read_checking_offset); +// void detect_ffffffff_read(TTDevice *dev, std::uint32_t data_read = 0xffffffffu) { +// if (g_READ_CHECKING_ENABLED && data_read == 0xffffffffu && is_hardware_hung(dev)) { +// std::uint32_t scratch_data = *register_address(dev, dev->read_checking_offset); - if (auto_reset_board(dev)) { - throw std::runtime_error("Read 0xffffffff from PCIE: auto-reset succeeded."); - } else { - throw std::runtime_error("Read 0xffffffff from PCIE: you should reset the board."); - } - } -} +// if (auto_reset_board(dev)) { +// throw std::runtime_error("Read 0xffffffff from PCIE: auto-reset succeeded."); +// } else { +// throw std::runtime_error("Read 0xffffffff from PCIE: you should reset the board."); +// } +// } +// } -inline void record_access (const char* where, uint32_t addr, uint32_t size, bool turbo, bool write, bool block, bool endline) { - LOG2 ("%s PCI_ACCESS %s 0x%8x %8d bytes %s %s%s", where, write ? "WR" : "RD", addr, size, turbo ? "TU" : " ", block ? "BLK" : " ", endline ? "\n" : "" ); -} +// inline void record_access (const char* where, uint32_t addr, uint32_t size, bool turbo, bool write, bool block, bool endline) { +// LOG2 ("%s PCI_ACCESS %s 0x%8x %8d bytes %s %s%s", where, write ? "WR" : "RD", addr, size, turbo ? 
"TU" : " ", block ? "BLK" : " ", endline ? "\n" : "" ); +// } -inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool endline = true) { - // Prints each byte in a buffer - if (g_DEBUG_LEVEL > 1) { - uint8_t *b = (uint8_t *)(buffer_addr); - for (uint32_t i = 0; i < len_bytes; i++) { - LOG2 (" [0x%x] = 0x%x (%u) ", i, b[i], b[i]); - } - if (endline) { - LOG2 ("\n"); - } - } -} +// inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool endline = true) { +// // Prints each byte in a buffer +// if (g_DEBUG_LEVEL > 1) { +// uint8_t *b = (uint8_t *)(buffer_addr); +// for (uint32_t i = 0; i < len_bytes; i++) { +// LOG2 (" [0x%x] = 0x%x (%u) ", i, b[i], b[i]); +// } +// if (endline) { +// LOG2 ("\n"); +// } +// } +// } // Custom device memcpy. This is only safe for memory-like regions on the device (Tensix L1, DRAM, ARC CSM). // Both routines assume that misaligned accesses are permitted on host memory. @@ -978,390 +333,89 @@ inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool // 2. syseng#3487 WH GDDR5 controller has a bug when 1-byte writes are temporarily adjacent // to 2-byte writes. We avoid ever performing a 1-byte write to the device. This only affects to device. -void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) { - typedef std::uint32_t copy_t; - - // Start by aligning the destination (device) pointer. If needed, do RMW to fix up the - // first partial word. - volatile copy_t *dp; - - std::uintptr_t dest_addr = reinterpret_cast(dest); - unsigned int dest_misalignment = dest_addr % sizeof(copy_t); - - if (dest_misalignment != 0) { - // Read-modify-write for the first dest element. 
- dp = reinterpret_cast(dest_addr - dest_misalignment); - - copy_t tmp = *dp; - - auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); - num_bytes -= leading_len; - src = static_cast(src) + leading_len; - - *dp++ = tmp; - - } else { - dp = static_cast(dest); - } - - // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); - std::size_t num_words = num_bytes / sizeof(copy_t); - - for (std::size_t i = 0; i < num_words; i++) - *dp++ = *sp++; - - // Finally copy any sub-word trailer, again RMW on the destination. - auto trailing_len = num_bytes % sizeof(copy_t); - if (trailing_len != 0) { - copy_t tmp = *dp; - - std::memcpy(&tmp, sp, trailing_len); - - *dp++ = tmp; - } -} - -void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { - typedef std::uint32_t copy_t; - - // Start by aligning the source (device) pointer. - const volatile copy_t *sp; - - std::uintptr_t src_addr = reinterpret_cast(src); - unsigned int src_misalignment = src_addr % sizeof(copy_t); - - if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); - - copy_t tmp = *sp++; - - auto leading_len = std::min(sizeof(tmp) - src_misalignment, num_bytes); - std::memcpy(dest, reinterpret_cast(&tmp) + src_misalignment, leading_len); - num_bytes -= leading_len; - dest = static_cast(dest) + leading_len; - - } else { - sp = static_cast(src); - } - - // Copy the source-aligned middle. - copy_t *dp = static_cast(dest); - std::size_t num_words = num_bytes / sizeof(copy_t); - - for (std::size_t i = 0; i < num_words; i++) - *dp++ = *sp++; - - // Finally copy any sub-word trailer. 
- auto trailing_len = num_bytes % sizeof(copy_t); - if (trailing_len != 0) { - copy_t tmp = *sp; - std::memcpy(dp, &tmp, trailing_len); - } -} - -void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES > 0) { - record_access ("read_block_a", byte_addr, num_bytes, true, false, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, false); - memcpy (buffer_addr, (void*)host_user_addr, transfered_bytes); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } - - record_access("read_block_b", byte_addr, num_bytes, false, false, true, false); // addr, size, turbo, write, block, endline - - void *reg_mapping; - if (dev->bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { - byte_addr -= BAR0_BH_SIZE; - reg_mapping = dev->bar4_wc; - } - else if (dev->system_reg_mapping != nullptr && byte_addr >= dev->system_reg_start_offset) { - byte_addr -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && byte_addr < dev->bar0_wc_size) { - reg_mapping = dev->bar0_wc; - } else { - byte_addr -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - const void *src = reinterpret_cast(reg_mapping) + byte_addr; - void *dest = reinterpret_cast(buffer_addr); - -#ifndef DISABLE_ISSUE_3487_FIX - memcpy_from_device(dest, src, num_bytes); -#else -#ifdef FAST_MEMCPY - - if ((num_bytes % 32 == 0) && ((intptr_t(dest) & 31) == 0) && 
((intptr_t(src) & 31) == 0)) - memcpy_from_device(dest, src, num_bytes); - { - // Faster memcpy version.. about 8x currently compared to pci_read above - fastMemcpy(dest, src, num_bytes); - } - else -#else - // ~4x faster than pci_read above, but works for all sizes and alignments - memcpy(dest, src, num_bytes); -#endif -#endif - - if (num_bytes >= sizeof(std::uint32_t)) { - detect_ffffffff_read(dev, *reinterpret_cast(dest)); - } - print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); -} - -void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES > 0) { - record_access ("write_block_a", byte_addr, num_bytes, true, true, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - memcpy ( (void*)host_user_addr, buffer_addr, transfered_bytes); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, true); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } - - record_access("write_block_b", byte_addr, num_bytes, false, true, true, false); // addr, size, turbo, write, block, endline - - void *reg_mapping; - if (dev->bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { - byte_addr -= BAR0_BH_SIZE; - reg_mapping = dev->bar4_wc; - } - else if (dev->system_reg_mapping != nullptr && byte_addr >= dev->system_reg_start_offset) { - byte_addr -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && byte_addr < dev->bar0_wc_size) { - 
reg_mapping = dev->bar0_wc; - } else { - byte_addr -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - void *dest = reinterpret_cast(reg_mapping) + byte_addr; - const void *src = reinterpret_cast(buffer_addr); -#ifndef DISABLE_ISSUE_3487_FIX - memcpy_to_device(dest, src, num_bytes); -#else -#ifdef FAST_MEMCPY - memcpy_to_device(dest, src, num_bytes); - if ((num_bytes % 32 == 0) && ((intptr_t(dest) & 31) == 0) && ((intptr_t(src) & 31) == 0)) - - { - // Faster memcpy version.. about 8x currently compared to pci_read above - fastMemcpy(dest, src, num_bytes); - } - else -#else - // ~4x faster than pci_read above, but works for all sizes and alignments - memcpy(dest, src, num_bytes); -#endif -#endif - print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); -} - -void read_checking_enable(bool enable = true) { - g_READ_CHECKING_ENABLED = enable; -} - -// Read/write to the configuration space of the device -// pData is a pointer to a buffer (see memory module) -DWORD read_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { +// void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) { +// typedef std::uint32_t copy_t; - if (pread(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } +// // Start by aligning the destination (device) pointer. If needed, do RMW to fix up the +// // first partial word. +// volatile copy_t *dp; - return 0; -} +// std::uintptr_t dest_addr = reinterpret_cast(dest); +// unsigned int dest_misalignment = dest_addr % sizeof(copy_t); -DWORD write_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { +// if (dest_misalignment != 0) { +// // Read-modify-write for the first dest element. 
+// dp = reinterpret_cast(dest_addr - dest_misalignment); - if (pwrite(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } +// copy_t tmp = *dp; - return 0; -} +// auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size) { +// std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); +// num_bytes -= leading_len; +// src = static_cast(src) + leading_len; - uint32_t page_size = getpagesize(); - uint32_t page_aligned_size = (size + page_size - 1) & ~(page_size - 1); +// *dp++ = tmp; - DMAbuffer ret_val = allocate_dma_buffer(dev, dev->next_dma_buf++, page_aligned_size); - LOG1 ("Allocated DMA buffer at 0x%lx 0x%lx size: %u\n", ret_val.pBuf, ret_val.pDma, size); - return ret_val; -} - -void pcie_init_dma_transfer_turbo (PCIdevice* dev) { - // From SHA 8cf7ff1bc7b3886a: - if (detect_arch(dev) == tt::ARCH::WORMHOLE_B0) { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c8; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } else { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c0; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } - c_DMA_TRIGGER_ADDRESS = 0x1ff30074; // chip.AXI.get_path_info("ARC_RESET.SCRATCH[5]") - c_ARC_MISC_CNTL_ADDRESS = 0x1ff30100; // chip.AXI.get_path_info("ARC_RESET.ARC_MISC_CNTL") -} - -void set_use_dma(bool msi, uint32_t dma_block_size_read_threshold_bytes, uint32_t dma_block_size_write_threshold_bytes) { - g_USE_MSI_FOR_DMA = msi; - g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = dma_block_size_read_threshold_bytes; - g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = dma_block_size_write_threshold_bytes; -} - -void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { - while (word_len-- != 0) { - *dest++ = *src++; - } -} - -void write_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, const void *data) 
{ - record_access("write_regs", byte_addr, word_len * sizeof(uint32_t), false, true, false, false); - - volatile uint32_t *dest = register_address(dev, byte_addr); - const uint32_t *src = reinterpret_cast(data); - - write_regs(dest, src, word_len); - - LOG2(" REG "); - print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); -} +// } else { +// dp = static_cast(dest); +// } -void write_tlb_reg(TTDevice *dev, uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size) { - record_access("write_tlb_reg", byte_addr, tlb_cfg_reg_size, false, true, false, false); - - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); - - volatile uint64_t *dest_qw = register_address(dev, byte_addr); - volatile uint32_t *dest_extra_dw = register_address(dev, byte_addr+8); -#if defined(__ARM_ARCH) || defined(__riscv) - // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. - tt_driver_atomics::mfence(); -#endif - *dest_qw = value_lower; - if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); - *dest_extra_dw = p_value_upper[0]; - } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. - - LOG2(" TLB "); - print_buffer (&value_lower, sizeof(value_lower), true); - if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); - print_buffer (p_value_upper, sizeof(uint32_t), true); - } -} +// // Copy the destination-aligned middle. 
+// const copy_t *sp = static_cast(src); +// std::size_t num_words = num_bytes / sizeof(copy_t); -void read_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, void *data) { - record_access("read_regs", byte_addr, word_len * sizeof(uint32_t), false, false, false, false); +// for (std::size_t i = 0; i < num_words; i++) +// *dp++ = *sp++; - const volatile uint32_t *src = register_address(dev, byte_addr); - uint32_t *dest = reinterpret_cast(data); +// // Finally copy any sub-word trailer, again RMW on the destination. +// auto trailing_len = num_bytes % sizeof(copy_t); +// if (trailing_len != 0) { +// copy_t tmp = *dp; - while (word_len-- != 0) { - uint32_t temp = *src++; - memcpy(dest++, &temp, sizeof(temp)); - } - LOG2(" REG "); - print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); -} +// std::memcpy(&tmp, sp, trailing_len); -void handle_dma_timeout(TTDevice *dev, uint32_t size_bytes, bool write) { - detect_ffffffff_read(dev); - throw std::runtime_error(std::string("DMA transfer timeout: ") - + std::to_string(size_bytes) - + (write ? " byte write." : " byte read.")); -} -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write) { - // c_timer t (""); +// *dp++ = tmp; +// } +// } - // t.now_in ("1. DMA setup"); +// void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { +// typedef std::uint32_t copy_t; - if (c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET == 0) { - throw std::runtime_error ("pcie_init_dma_transfer_turbo must be called before pcie_dma_transfer_turbo"); - } +// // Start by aligning the source (device) pointer. +// const volatile copy_t *sp; - arc_pcie_ctrl_dma_request_t req = { - .chip_addr = chip_addr, - .host_phys_addr = host_phys_addr, - .completion_flag_phys_addr = static_cast(pci_dma_buffer_get_physical_addr(dev->dma_completion_flag_buffer)), - .size_bytes = size_bytes, - .write = (write ? 1U : 0U), - .pcie_msi_on_done = g_USE_MSI_FOR_DMA ? 
1U : 0U, - .pcie_write_on_done = g_USE_MSI_FOR_DMA ? 0U : 1U, - .trigger = 1U, - .repeat = 1 - }; +// std::uintptr_t src_addr = reinterpret_cast(src); +// unsigned int src_misalignment = src_addr % sizeof(copy_t); - volatile uint32_t *complete_flag = (uint32_t *)pci_dma_buffer_get_user_addr(dev->dma_completion_flag_buffer); - *complete_flag = 0; +// if (src_misalignment != 0) { +// sp = reinterpret_cast(src_addr - src_misalignment); - // Configure the DMA engine - msi_interrupt_received = false; - write_regs (dev, c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET, sizeof(req) / sizeof(uint32_t), &req); +// copy_t tmp = *sp++; - // Trigger ARC interrupt 0 on core 0 - int arc_misc_cntl_value = 0; +// auto leading_len = std::min(sizeof(tmp) - src_misalignment, num_bytes); +// std::memcpy(dest, reinterpret_cast(&tmp) + src_misalignment, leading_len); +// num_bytes -= leading_len; +// dest = static_cast(dest) + leading_len; - // NOTE: Ideally, we should read the state of this register before writing to it, but that - // casues a lot of delay (reads have huge latencies) - arc_misc_cntl_value |= (1 << 16); // Cause IRQ0 on core 0 - write_regs (dev, c_ARC_MISC_CNTL_ADDRESS, 1, &arc_misc_cntl_value); +// } else { +// sp = static_cast(src); +// } - if (!g_USE_MSI_FOR_DMA) { - // t.now_in ("2. DMA poll"); - int wait_loops = 0; - while (true) { - // The complete flag is set ty by ARC (see src/hardware/soc/tb/arc_fw/lib/pcie_dma.c) - if (*complete_flag == 0xfaca) break; - wait_loops++; - } - // LOG2 ("Waited %d iterations\n", wait_loops); - } else { - // t.now_in ("2. DMA wait for MSI"); - while (msi_interrupt_received == false) - ; - } +// // Copy the source-aligned middle. 
+// copy_t *dp = static_cast(dest); +// std::size_t num_words = num_bytes / sizeof(copy_t); - return 0; // TODO: status -} +// for (std::size_t i = 0; i < num_words; i++) +// *dp++ = *sp++; -void print_device_info (struct PCIdevice &d) { - LOG1("PCIEIntfId 0x%x\n", d.id); - LOG1("VID:DID 0x%x:0x%x\n", d.vendor_id, d.device_id); - LOG1("SubVID:SubID 0x%x:0x%x\n", d.subsystem_vendor_id, d.subsystem_id); - LOG1("BSF %x:%x:%x\n", d.dwBus, d.dwSlot, d.dwFunction); - LOG1("BAR 0x%llx size: %dMB\n", d.BAR_addr, d.BAR_size_bytes / 1024 / 1024); -} +// // Finally copy any sub-word trailer. +// auto trailing_len = num_bytes % sizeof(copy_t); +// if (trailing_len != 0) { +// copy_t tmp = *sp; +// std::memcpy(dp, &tmp, trailing_len); +// } +// } // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- @@ -1424,9 +478,9 @@ namespace { }; } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, +dynamic_tlb set_dynamic_tlb(TTDevice *dev, unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { - auto architecture_implementation = dev->hdev->get_architecture_implementation(); + auto architecture_implementation = dev->get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } @@ -1436,8 +490,8 @@ dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair s tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); - auto translated_start_coords = harvested_coord_translation.at(dev -> logical_id).at(start); - auto translated_end_coords = harvested_coord_translation.at(dev -> logical_id).at(end); + auto translated_start_coords = harvested_coord_translation.at(dev->logical_id).at(start); + auto translated_end_coords = harvested_coord_translation.at(dev->logical_id).at(end); uint32_t tlb_address = address / tlb_config.size; uint32_t local_offset = address % tlb_config.size; uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); @@ -1454,24 +508,23 @@ dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair s // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (dev->hdev->get_arch() == tt::ARCH::BLACKHOLE) ? false : true, + .static_vc = (dev->get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, }.apply_offset(tlb_config.offset); LOG1("set_dynamic_tlb() with tlb_index: %d tlb_index_offset: %d dynamic_tlb_size: %dMB tlb_base: 0x%x tlb_cfg_reg: 0x%x\n", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); // write_regs(dev -> hdev, tlb_cfg_reg, 2, &tlb_data); - write_tlb_reg(dev->hdev, tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); + dev->write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); return { tlb_base + local_offset, tlb_config.size - local_offset }; } -dynamic_tlb set_dynamic_tlb(PCIdevice *dev, unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = TLB_DATA::Relaxed) { +dynamic_tlb set_dynamic_tlb(TTDevice *dev, unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = TLB_DATA::Relaxed) { return set_dynamic_tlb(dev, tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb set_dynamic_tlb_broadcast(PCIdevice *dev, unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = TLB_DATA::Relaxed) { +dynamic_tlb set_dynamic_tlb_broadcast(TTDevice *dev, unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = TLB_DATA::Relaxed) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid - return set_dynamic_tlb (dev, tlb_index, start, end, - address, true, harvested_coord_translation, ordering); + return set_dynamic_tlb(dev, tlb_index, start, end, address, true, harvested_coord_translation, ordering); } bool tt_SiliconDevice::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, 
uint64_t tlb_size, std::uint32_t chip) { @@ -1528,25 +581,16 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo void tt_SiliconDevice::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { m_pci_log_level = 0; - m_dma_buf_size = 0; LOG1("---- tt_SiliconDevice::tt_SiliconDevice\n"); - static int unique_driver_id = 0; - driver_id = unique_driver_id++; // Set the log level for debugging const char* pci_log_level = std::getenv("TT_PCI_LOG_LEVEL"); if (pci_log_level) { m_pci_log_level = atoi (pci_log_level); } - set_debug_level(m_pci_log_level); + g_DEBUG_LEVEL = m_pci_log_level; LOG1 ("TT_PCI_LOG_LEVEL=%d\n", m_pci_log_level); - const char* dma_buf_size = std::getenv("TT_PCI_DMA_BUF_SIZE"); - if (dma_buf_size) { - m_dma_buf_size = atoi (dma_buf_size); - } - LOG1 ("TT_PCI_DMA_BUF_SIZE=%d\n", m_dma_buf_size); - // Don't buffer stdout. setbuf(stdout, NULL); @@ -1556,18 +600,17 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now."); for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - m_pci_device_map.insert({logical_device_id, new struct PCIdevice}); - struct PCIdevice* pci_device = m_pci_device_map.at(logical_device_id); - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - *pci_device = ttkmd_open ((DWORD) pci_interface_id, false); - pci_device->logical_id = logical_device_id; + if 
(!m_pci_device_map.count(logical_device_id)) { + log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); + m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + } + auto dev = m_pci_device_map.at(logical_device_id).get(); - m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pci_device->device_id, pci_device->revision_id); - if (arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { + m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, dev->pcie_device_id, dev->pcie_revision_id); + if (dev->get_arch() == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); m_num_host_mem_channels = 1; @@ -1577,7 +620,7 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->device_id, pci_device->revision_id); if (g_SINGLE_PIN_PAGE_PER_FD_WORKAROND) { - pci_device->hdev->open_hugepage_per_host_mem_ch(m_num_host_mem_channels); + dev->open_hugepage_per_host_mem_ch(m_num_host_mem_channels); } // Initialize these. Used to be in header file. @@ -1589,11 +632,8 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target initialize_interprocess_mutexes(pci_interface_id, clean_system_resources); - if (!skip_driver_allocs) - print_device_info (*pci_device); - // MT: Initial BH - hugepages will fail init - // For using silicon driver without workload to query mission mode params, no need for hugepage/dmabuf. + // For using silicon driver without workload to query mission mode params, no need for hugepage. 
if (!skip_driver_allocs){ bool hugepages_initialized = init_hugepage(logical_device_id); // Large writes to remote chips require hugepages to be initialized. @@ -1601,13 +641,11 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target if(target_remote_chips.size()) { log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); } - uint16_t channel = 0; // Single channel sufficient for this? - if (not hugepage_mapping.at(logical_device_id).at(channel)) { - init_dmabuf(logical_device_id); + if (not hugepage_mapping.at(logical_device_id).at(0)) { + log_warning(LogSiliconDriver, "No hugepage mapping at device {}", logical_device_id); } } harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map - archs_in_cluster.push_back(detect_arch(logical_to_physical_device_id_map.at(logical_device_id))); } for(const chip_id_t& chip : target_devices_in_cluster) { @@ -1618,9 +656,6 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } } -bool tt_SiliconDevice::noc_translation_en() { - return translation_tables_en; -} bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } @@ -1820,17 +855,6 @@ void tt_SiliconDevice::populate_cores() { } } -std::unordered_map tt_SiliconDevice::get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows) { - std::unordered_map harvesting_masks = {}; - for(const auto& chip : harvested_rows) { - uint32_t harvesting_mask_per_chip = 0; - harvesting_masks.insert({chip.first, 0}); - for(const auto& row : chip.second) { - harvesting_masks.at(chip.first) |= (1 << row); - } - } - return harvesting_masks; -} std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal 
for GS and WH log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); @@ -1901,26 +925,27 @@ void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std void tt_SiliconDevice::check_pcie_device_initialized(int device_id) { - struct PCIdevice* pci_device = get_pci_device(device_id); + TTDevice *pci_device = get_pci_device(device_id); + tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { - if (!is_grayskull(pci_device->device_id)) { - throw std::runtime_error("Attempted to run grayskull configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::GRAYSKULL) { + throw std::runtime_error("Attempted to run grayskull configured tt_device on " + get_arch_str(device_arch)); } } else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { - if (!is_wormhole(pci_device->device_id)) { - throw std::runtime_error("Attempted to run wormhole configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::WORMHOLE && device_arch != tt::ARCH::WORMHOLE_B0) { + throw std::runtime_error("Attempted to run wormhole configured tt_device on " + get_arch_str(device_arch)); } } else if (arch_name == tt::ARCH::BLACKHOLE) { - if (!is_blackhole(pci_device->device_id)) { - throw std::runtime_error("Attempted to run blackhole configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::BLACKHOLE) { + throw std::runtime_error("Attempted to run blackhole configured tt_device on " + get_arch_str(device_arch)); } } else { throw std::runtime_error("Unsupported architecture: " + get_arch_str(arch_name)); } - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + auto architecture_implementation = pci_device->get_architecture_implementation(); // MT Initial BH - Add check for 
blackhole once access to ARC registers is setup through TLBs if (arch_name != tt::ARCH::BLACKHOLE) { @@ -2040,60 +1065,24 @@ void tt_SiliconDevice::initialize_pcie_devices() { check_pcie_device_initialized(device_it.first); } - // If requires multi-channel or doesn't support mmio-p2p, init iatus without p2p. - if (m_num_host_mem_channels <= 1 && arch_name == tt::ARCH::GRAYSKULL) { - init_pcie_iatus(); - } else { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), - "More channels are not yet supported for Blackhole"); - init_pcie_iatus_no_p2p(); - } + // TODO: Implement support for multiple host channels on BLACKHOLE. + log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); + init_pcie_iatus(); init_membars(); - - // https://yyz-gitlab.local.tenstorrent.com/ihamer/ll-sw/issues/25 - // Note: using pcie dma while device is idle is safe, mixing p2p is unsafe, see issue above - // TODO: disable pcie dma if p2p traffic is present, ie. 
chip-to-chip or chip-to-host - - for (auto &device_it : m_pci_device_map){ - struct PCIdevice* pci_device = device_it.second; - auto device_id = pci_device->device_id; - // MT Initial BH - Don't use PCIe DMA - bool enable_pcie_dma; - if (arch_name == tt::ARCH::BLACKHOLE) { - enable_pcie_dma = false; - } else { - enable_pcie_dma = m_dma_buf_size>0; - } - // Use DMA only for transfers that cross the size thresholds (empirically determined) - if (enable_pcie_dma) { - try { - log_trace(LogSiliconDriver, "Enable PCIE DMA with bufsize {}", m_dma_buf_size); - set_use_dma (false, 128, 0); // use dma for reads only - init_dma_turbo_buf(pci_device); - } catch (const std::exception &e) { - log_trace(LogSiliconDriver, "Disable PCIE DMA, fallback to MMIO transfers due to exepction {}", e.what()); - set_use_dma (false, 0, 0); - uninit_dma_turbo_buf(pci_device); - } - } else { - log_trace(LogSiliconDriver, "Disable PCIE DMA"); - } - } } -void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &soft_resets) { +void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(TTDevice *device, const TensixSoftResetOptions &soft_resets) { LOG1("---- tt_SiliconDevice::broadcast_tensix_risc_reset\n"); auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; LOG1("== For all tensix set soft-reset for %s risc cores.\n", TensixSoftResetOptionsToString(valid).c_str()); - auto architecture_implementation = device->hdev->get_architecture_implementation(); + auto architecture_implementation = device->get_architecture_implementation(); auto [soft_reset_reg, _] = set_dynamic_tlb_broadcast(device, architecture_implementation->get_reg_tlb(), architecture_implementation->get_tensix_soft_reset_addr(), harvested_coord_translation, tt_xy_pair(0, 0), tt_xy_pair(architecture_implementation->get_grid_size_x() - 1, architecture_implementation->get_grid_size_y() - 1 - num_rows_harvested.at(device -> logical_id)), TLB_DATA::Posted); - write_regs(device->hdev, 
soft_reset_reg, 1, &valid); + device->write_regs(soft_reset_reg, 1, &valid); tt_driver_atomics::sfence(); } @@ -2178,24 +1167,11 @@ std::vector tt_SiliconDevice::detect_available_device_ids() { return detected_device_ids; } -static bool check_dram_core_exists(const std::vector> &all_dram_cores, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel : all_dram_cores) { - for (auto dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { - struct PCIdevice* pci_device = get_pci_device(device_id); - TTDevice* dev = pci_device->hdev; +std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { + TTDevice* dev = get_pci_device(device_id); - const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - write_block(dev, byte_addr, num_bytes, buffer_addr, dma_buf_size); + const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { + dev->write_block(byte_addr, num_bytes, buffer_addr); }; return callable; @@ -2210,8 +1186,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *pci_device = get_pci_device(target.chip); - auto *dev = pci_device->hdev; + auto *dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -2231,9 +1206,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { } void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(target.chip); - TTDevice *dev = pci_device->hdev; - + 
TTDevice *dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); // LOG1("---- tt_SiliconDevice::write_device_memory to chip:%lu %lu-%lu at 0x%x size_in_bytes: %d small_access: %d\n", @@ -2251,19 +1224,19 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 - write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->write_block((tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - write_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->write_block(tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + auto [mapped_address, tlb_size] = set_dynamic_tlb(dev, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + dev->write_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2276,8 +1249,7 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t 
size_in_bytes, const std::string& fallback_tlb) { // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. LOG1("---- tt_SiliconDevice::read_device_memory to chip:%lu %lu-%lu at 0x%x size_in_bytes: %d\n", target.chip, target.x, target.y, address, size_in_bytes); - struct PCIdevice* pci_device = get_pci_device(target.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); @@ -2294,20 +1266,20 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 - read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->read_block((tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->read_block(tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } LOG1 (" read_block called with tlb_offset: %d, tlb_size: %d\n", tlb_offset, tlb_size); } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + auto [mapped_address, tlb_size] = set_dynamic_tlb(dev, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = 
std::min((uint64_t)size_in_bytes, tlb_size); - read_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + dev->read_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2317,7 +1289,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std } } -void tt_SiliconDevice::read_dma_buffer( +void tt_SiliconDevice::read_buffer( void* mem_ptr, std::uint32_t address, std::uint16_t channel, @@ -2330,20 +1302,18 @@ void tt_SiliconDevice::read_dma_buffer( if(hugepage_mapping.at(src_device_id).at(channel)) { user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } else if (buf_mapping) { - user_scratchspace = static_cast(buf_mapping) + (address & DMA_MAP_MASK); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepages are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); err_msg += " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)"; throw std::runtime_error(err_msg); } - LOG1("---- tt_SiliconDevice::read_dma_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); + LOG1("---- tt_SiliconDevice::read_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); memcpy(mem_ptr, user_scratchspace, size_in_bytes); } -void tt_SiliconDevice::write_dma_buffer( +void tt_SiliconDevice::write_buffer( const void *mem_ptr, std::uint32_t size, std::uint32_t address, @@ -2352,43 +1322,34 @@ void tt_SiliconDevice::write_dma_buffer( void * user_scratchspace = nullptr; if(hugepage_mapping.at(src_device_id).at(channel)) { - log_assert(size <= HUGEPAGE_REGION_SIZE, "write_dma_buffer data has larger 
size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); + log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_mapping.at(src_device_id).at(channel), (address & HUGEPAGE_MAP_MASK), channel, size); user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } - else if(buf_mapping) { - log_assert(size <= DMA_BUF_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, DMA_BUF_REGION_SIZE); - log_debug(LogSiliconDriver, "Using DMA Buffer at address {} offset {} size {}", - buf_mapping, - address, - size); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - user_scratchspace = reinterpret_cast(buf_mapping); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepage are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); throw std::runtime_error(err_msg); } memcpy(user_scratchspace, mem_ptr, size); } -uint32_t tt_SiliconDevice::get_power_state_arc_msg(struct PCIdevice* pci_device, tt_DevicePowerState state) { +uint32_t tt_SiliconDevice::get_power_state_arc_msg(TTDevice* pci_device, tt_DevicePowerState state) { uint32_t msg = 0xaa00; switch (state) { case BUSY: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_busy(); + msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_busy(); break; } case LONG_IDLE: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_long_idle(); + msg |= 
pci_device->get_architecture_implementation()->get_arc_message_arc_go_long_idle(); break; } case SHORT_IDLE: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); + msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } default: throw std::runtime_error("Unrecognized power state."); @@ -2400,7 +1361,7 @@ void tt_SiliconDevice::set_pcie_power_state(tt_DevicePowerState state) { for (auto &device_it : m_pci_device_map){ int d = device_it.first; - struct PCIdevice* pci_device = device_it.second; + auto pci_device = device_it.second.get(); uint32_t msg = get_power_state_arc_msg(pci_device, state); std::stringstream ss; ss << state; @@ -2427,8 +1388,8 @@ int tt_SiliconDevice::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + TTDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); if (exit_code != 0) { throw std::runtime_error("Failed to get aiclk value with exit code " + std::to_string(exit_code)); } @@ -2444,72 +1405,24 @@ std::map tt_SiliconDevice::get_clocks() { return clock_freq_map; } -//! Simple test of communication to device/target. true if it passes. 
-// bool tt_SiliconDevice::test_write_read(tt_cxy_pair target) { -// WARN("---- tt_SiliconDevice::test_write_read not implemented\n"); -// return true; -// } - -// bool tt_SiliconDevice::test_write_speed (struct PCIdevice* pci_device) { -// TTDevice *dev = pci_device->hdev; - -// if (dev->bar0_uc == dev->bar0_wc) { -// WARN("---- tt_SiliconDevice::test_write_speed WC not configured\n"); -// } - -// std::byte fill_value{0x42}; -// std::vector write_buf(architecture_implementation->get_static_tlb_size(), fill_value); - -// auto before = std::chrono::high_resolution_clock::now(); -// for (std::uint32_t y = 1; y < architecture_implementation->get_grid_size_y(); y++) -// { -// for (std::uint32_t x = 1; x < architecture_implementation->get_grid_size_x(); x++) -// { -// auto tlb_index = map_core_to_tlb(tt_xy_pair(x, y)); -// if (tlb_index < 0) { continue; } - -// auto offset = tlb_index * architecture_implementation->get_static_tlb_size(); - -// memcpy(static_cast(dev->bar0_wc) + offset, write_buf.data(), write_buf.size()); -// } -// } -// auto after = std::chrono::high_resolution_clock::now(); - -// std::chrono::duration interval = after - before; - -// unsigned int write_bw = 120 * std::milli::den / interval.count(); - -// LOG1("---- tt_SiliconDevice::test_write_speed Wrote 120MB @ %u MB/s\n", write_bw); - -// return (write_bw >= 512); // L1 write BW scales with AICLK, for low AICLK it will be very slow. -// } - tt_SiliconDevice::~tt_SiliconDevice () { LOG1 ("---- tt_SiliconDevice::~tt_SiliconDevice\n"); - for(int i = 0; i < archs_in_cluster.size(); i++) { - if(archs_in_cluster[i] == tt::ARCH::WORMHOLE) { - log_warning(LogSiliconDriver, "Virtual device {} for this run is Wormhole A0. This architecture is now deprecated. 
Please use Wormhole B0 for testing.", i); - } - } cleanup_shared_host_state(); for (auto &device_it : m_pci_device_map){ chip_id_t device_id = device_it.first; + // TTDevice *dev = device_it.second.get(); for (int ch = 0; ch < m_num_host_mem_channels; ch ++) { if (hugepage_mapping.at(device_id).at(ch)) { munmap(hugepage_mapping.at(device_id).at(ch), hugepage_mapping_size.at(device_id).at(ch)); } } - - struct PCIdevice* pci_device = device_it.second; - - ttkmd_close (*pci_device); - delete pci_device; - pci_device = NULL; + + device_it.second.reset(); } m_pci_device_map.clear(); ndesc.reset(); @@ -2531,15 +1444,11 @@ std::optional> tt_SiliconDevice::get_tlb_data_fro return tlb_data; } -uint32_t tt_SiliconDevice::get_m_dma_buf_size() const { - return m_dma_buf_size; -} - void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); - struct PCIdevice* pci_device = get_pci_device(logical_device_id); + TTDevice *pci_device = get_pci_device(logical_device_id); set_dynamic_tlb(pci_device, tlb_index, core, address, harvested_coord_translation, ordering); - auto tlb_size = std::get<1>(pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value()); + auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } @@ -2547,167 +1456,39 @@ void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair cor void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { log_assert(ordering == 
TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb."); log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); - dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; -} -// This function checks that all TLBs are properly setup. It should return 0 if all is good (i.e. if init_pcie_tlb is called prior) -// int tt_SiliconDevice::test_pcie_tlb_setup (struct PCIdevice* pci_device) { - // LOG1("---- tt_SiliconDevice::test_pcie_tlb_setup\n"); - // uint64_t tlb_data; - // int ret_val; - // // Check static TLBs (only active Tensix cores for GS ... Active tensix cores + ethernet cores for WH) - // for (uint32_t y = 0; y < architecture_implementation->get_grid_size_y() - num_rows_harvested; y++) { - // for (uint32_t x = 0; x < architecture_implementation->get_grid_size_x(); x++) { - // int tlb_index = get_static_tlb_index(tt_xy_pair(x, y)); - // auto translated_coords = harvested_coord_translation.at(pci_device -> id).at(tt_xy_pair(x, y)); - // if (tlb_index < 0) { continue; } - - // auto tlb_data_attempt = architecture_implementation->get_tlb_data(tlb_index, TLB_DATA { - // .x_end = translated_coords.x, - // .y_end = translated_coords.y, - // }); - // if (!tlb_data_attempt.has_value()) { - // throw std::runtime_error("Error setting up (" + std::to_string(x) + ", " + std::to_string(y) + ") in pcie_tlb_test."); - // } - // uint64_t expected_tlb_data = tlb_data_attempt.value(); - - // uint32_t tlb_setup_addr = architecture_implementation->get_static_tlb_cfg_addr() + 8 * tlb_index; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data); - - // } - // } - 
- // // Check 16MB TLBs 1-16 for peer-to-peer communication with DRAM channel 0 - // uint64_t peer_dram_offset = architecture_implementation->get_dram_channel_0_peer2peer_region_start(); - // for (uint32_t tlb_id = 1; tlb_id < 17; tlb_id++) { - // auto tlb_data_expected = architecture_implementation->get_tlb_data(architecture_implementation->get_tlb_base_index_16m() + tlb_id, TLB_DATA { - // .local_offset = peer_dram_offset / architecture_implementation->get_dynamic_tlb_16m_size(), - // .x_end = architecture_implementation->get_dram_channel_0_x(), - // .y_end = architecture_implementation->get_dram_channel_0_y(), - // .ordering = TLB_DATA::Posted, - // .static_vc = true, - // }); - // uint64_t tlb_data_observed; - // uint32_t tlb_setup_addr = architecture_implementation->get_dynamic_tlb_16m_cfg_addr() + 8 * tlb_id; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data_observed); - // ret_val = (tlb_data_expected == tlb_data_observed) ? 0 : 1; - // if (ret_val != 0) return ret_val; - // peer_dram_offset += architecture_implementation->get_dynamic_tlb_16m_size(); - // } - // return ret_val; -//} - -// Set up IATU for peer2peer -// Consider changing this function -void tt_SiliconDevice::init_pcie_iatus() { - - int starting_device_id = m_pci_device_map.begin()->first; - int ending_device_id = m_pci_device_map.rbegin()->first; - int num_enabled_devices = m_pci_device_map.size(); - - LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d starting_device_id: %d ending_device_id: %d\n", num_enabled_devices, starting_device_id, ending_device_id); - log_assert(m_num_host_mem_channels <= 1, "Maximum of 1x 1GB Host memory channels supported."); - - // Requirement for ring topology in GS, but since WH can share below code, check it again here for mmio mapped devices, - // otherwise us/ds device calculations will not be correct. Don't expect to see this for Wormhole today. 
- log_assert((starting_device_id + num_enabled_devices - 1) == ending_device_id, "The set of workload mmio-mapped target_device_id's must be sequential, without gaps."); - - for (auto &src_device_it : m_pci_device_map){ - int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; - - uint32_t current_peer_region = 0; - const int num_peer_ids = 3; // 0=HOST, 1=UPSTREAM Device, 2=DOWNSTREAM Device, 3=Unused - for (int peer_id = 0; peer_id < num_peer_ids; peer_id++) { - - //TODO: migrate this to huge pages when that support is in - if (peer_id == 0){ - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Host. current_peer_region: %d\n", src_pci_id, peer_id, current_peer_region); - // Device to Host (peer_id==0) - const uint16_t host_memory_channel = 0; // Only single channel supported. - if (hugepage_mapping.at(src_pci_id).at(host_memory_channel)) { - iatu_configure_peer_region(src_pci_id, current_peer_region, hugepage_physical_address.at(src_pci_id).at(host_memory_channel), HUGEPAGE_REGION_SIZE); - host_channel_size.insert({(int)src_pci_device->logical_id, {HUGEPAGE_REGION_SIZE}}); - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, current_peer_region, buf_physical_addr, DMA_BUF_REGION_SIZE); - } - } else if (peer_id == 1 || peer_id == 2){ - // Device to Device (peer_id==1 : Upstream, peer_id==2 : Downstream) - // For determining upstream/downstream peers in ring topology - this matches is_target_device_downstream() in net2pipe - int upstream_peer_device_id = src_pci_id > starting_device_id ? src_pci_id - 1 : ending_device_id; - int downstream_peer_device_id = src_pci_id < (ending_device_id) ? src_pci_id + 1 : starting_device_id; - - int peer_device_id = peer_id == 1 ? 
upstream_peer_device_id : downstream_peer_device_id; - - struct PCIdevice* peer_pci_device = m_pci_device_map.at(peer_device_id); - uint64_t peer_BAR_addr = peer_pci_device->BAR_addr; - uint32_t peer_pci_interface_id = peer_pci_device->id; - uint32_t TLB1_16MB_OFFSET = 0; // Was 192MB offset to DRAM, now added by net2pipe since ATU maps to base of 512MB PCI Bar. - uint32_t PEER_REGION_SIZE = 1024 * 1024 * 1024; // Was 256MB. Want 512MB. Updated to 1024MB to match net2pipe more easily. - // FIXME - How to reduce PEER_REGION_SIZE=256 again, and make this still work? Need to make the ATU mappings non-contiguous 256MB chunks (every 1GB?) to match net2pipe? - - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Device (upstream_peer_device_id: %d downstream_peer_device_id: %d) gives peer_device_id: %d (peer_pci_interface_id: %d) current_peer_region: %d\n", - src_pci_id, peer_id, upstream_peer_device_id, downstream_peer_device_id, peer_device_id, peer_pci_interface_id, current_peer_region ); - - iatu_configure_peer_region (src_pci_id, current_peer_region, peer_BAR_addr + TLB1_16MB_OFFSET, PEER_REGION_SIZE); - } - current_peer_region ++; - } - } + log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } // TT<->TT P2P support removed in favor of increased Host memory. 
-void tt_SiliconDevice::init_pcie_iatus_no_p2p() { - +void tt_SiliconDevice::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); - LOG1("---- tt_SiliconDevice::init_pcie_iatus_no_p2p() num_enabled_devices: %d\n", num_enabled_devices); + LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d\n", num_enabled_devices); log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.", g_MAX_HOST_MEM_CHANNELS); for (auto &src_device_it : m_pci_device_map){ int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; + TTDevice* src_pci_device = src_device_it.second.get(); // Device to Host (multiple channels) for (int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) { - // TODO - Try to remove DMA buffer support. if (hugepage_mapping.at(src_pci_id).at(channel_id)) { std::uint32_t region_size = HUGEPAGE_REGION_SIZE; if(channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, src_pci_id); iatu_configure_peer_region(src_pci_id, channel_id, hugepage_physical_address.at(src_pci_id).at(channel_id), region_size); if(host_channel_size.find(src_pci_device->logical_id) == host_channel_size.end()) { - host_channel_size.insert({(int)src_pci_device->logical_id, {}}); + host_channel_size.insert({src_pci_device->logical_id, {}}); } host_channel_size.at(src_pci_device -> logical_id).push_back(region_size); - } else if(buf_mapping) { - log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to DMA buffer.", channel_id); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, channel_id, buf_physical_addr, DMA_BUF_REGION_SIZE); + } else { + std::string err_msg = "init_pcie_iatus: Hugepages are not allocated for src_pci_id: " + 
std::to_string(src_pci_id) + " ch: " + std::to_string(channel_id); + throw std::runtime_error(err_msg); } } } } -uint32_t tt_SiliconDevice::dma_allocation_size(chip_id_t src_device_id) -{ - - // Fall back to first device if no src_device_id is provided. Assumes all devices have the same size, which is true. - chip_id_t device_index = src_device_id == -1 ? m_pci_device_map.begin()->first : src_device_id; - - if (hugepage_mapping.at(device_index).at(0)) { - return HUGEPAGE_REGION_SIZE; - } else if (buf_mapping) { - return DMA_BUF_REGION_SIZE; - } else { - log_fatal("Nothing has been allocated yet"); - return 0; - } -} - - - - // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize) { @@ -2795,52 +1576,6 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi return fd; } -bool tt_SiliconDevice::init_dmabuf(chip_id_t device_id) { - if (buf_mapping == nullptr) { - - TTDevice *dev = m_pci_device_map.begin()->second->hdev; - - DMAbuffer buf = pci_allocate_dma_buffer(dev, DMA_BUF_REGION_SIZE); - buf_mapping = static_cast(reinterpret_cast(pci_dma_buffer_get_user_addr(buf))); - buf_physical_addr= pci_dma_buffer_get_physical_addr(buf); - } - return true; -} - -bool tt_SiliconDevice::init_dma_turbo_buf (struct PCIdevice* pci_device) { - // Allocate buffers for DMA transfer data and flag - pci_device->hdev->dma_completion_flag_buffer = pci_allocate_dma_buffer(pci_device->hdev, sizeof(uint64_t)); - pci_device->hdev->dma_transfer_buffer = pci_allocate_dma_buffer(pci_device->hdev, m_dma_buf_size); - pcie_init_dma_transfer_turbo(pci_device); - return true; -} - -bool tt_SiliconDevice::uninit_dma_turbo_buf (struct PCIdevice* pci_device) { - struct DMAbuffer &flag_buffer = pci_device->hdev->dma_completion_flag_buffer; - struct DMAbuffer &xfer_buffer = pci_device->hdev->dma_transfer_buffer; - if (flag_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); 
it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == flag_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(flag_buffer.pBuf, flag_buffer.size); - } - if (xfer_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == xfer_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(xfer_buffer.pBuf, xfer_buffer.size); - } - return true; -} - // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""){ if (std::filesystem::exists(filename)){ @@ -2858,7 +1593,8 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { const std::size_t mapping_size = (std::size_t) HUGEPAGE_REGION_SIZE; // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization) - auto physical_device_id = m_pci_device_map.at(device_id)->id; + auto dev = m_pci_device_map.at(device_id).get(); + auto physical_device_id = dev->device_id; std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { @@ -2884,7 +1620,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { close(hugepage_fd); if (mapping == MAP_FAILED) { - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(m_pci_device_map.at(device_id)->device_id, m_pci_device_map.at(device_id)->revision_id); + uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(physical_device_id, dev->pcie_revision_id); WARN("---- ttSiliconDevice::init_hugepage: physical_device_id: %d ch: %d mapping hugepage failed. (errno: %s).\n", physical_device_id, ch, strerror(errno)); WARN("---- Possible hint: /proc/cmdline should have hugepages=N, nr_hugepages=N - (N = NUM_MMIO_TT_DEVICES * (is_grayskull ? 1 : 4). 
NUM_MMIO_DEVICES = %d\n", num_tt_mmio_devices_for_arch); print_file_contents("/proc/cmdline");\ @@ -2907,7 +1643,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { pin_pages.in.virtual_address = reinterpret_cast(mapping); pin_pages.in.size = mapping_size; - auto &fd = g_SINGLE_PIN_PAGE_PER_FD_WORKAROND ? m_pci_device_map.at(device_id)->hdev->device_fd_per_host_ch[ch] : m_pci_device_map.at(device_id)->hdev->device_fd; + auto &fd = g_SINGLE_PIN_PAGE_PER_FD_WORKAROND ? dev->device_fd_per_host_ch[ch] : dev->device_fd; if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { WARN("---- ttSiliconDevice::init_hugepage: physical_device_id: %d ch: %d TENSTORRENT_IOCTL_PIN_PAGES failed (errno: %s). Common Issue: Requires TTMKD >= 1.11, see following file contents...\n", physical_device_id, ch, strerror(errno)); @@ -2933,23 +1669,23 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { int tt_SiliconDevice::test_setup_interface () { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - TTDevice *dev = m_pci_device_map.begin()->second->hdev; + TTDevice *dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = set_dynamic_tlb(dev, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; uint32_t regval = 0; - read_regs(dev, mapped_reg, 1, ®val); + dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 
0 : 1; return ret_val; } else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - TTDevice *dev = m_pci_device_map.begin()->second->hdev; + TTDevice *dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = set_dynamic_tlb(dev, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; uint32_t regval = 0; - read_regs(dev, mapped_reg, 1, ®val); + dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; return ret_val; } @@ -2971,80 +1707,24 @@ int tt_SiliconDevice::test_setup_interface () { } } -// Code used to test non existent broadcast TLB -// Keep for now, in case we need to test broadcast TLB again. -// int tt_SiliconDevice::test_broadcast (int logical_device_id) { -// LOG1("---- tt_SiliconDevice::test_broadcast\n"); - -// int ret_val = 0; -// struct PCIdevice* pci_device = get_pci_device(logical_device_id); - -// assert (test_pcie_tlb_setup(pci_device) == 0); - -// std::vector fill_array (1024, 0); -// uint32_t broadcast_bar_offset = architecture_implementation->get_broadcast_tlb_index() * architecture_implementation->get_static_tlb_size(); -// LOG2 ("broadcast_bar_offset = 0x%x\n", broadcast_bar_offset); - -// uint64_t fill_array_ptr = (uint64_t)(&fill_array[0]); - -// // a. 
Fill with increasing numbers -// // -// for (size_t i = 0; i < fill_array.size(); i++) { -// fill_array[i] = i; -// } -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array[i] == i) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// // b. Test with zeroes -// // -// std::vector fill_array_zeroes (1024, 0); -// uint64_t fill_array_zeroes_ptr = (uint64_t)(&fill_array_zeroes[0]); -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_zeroes_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array_zeroes[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array_zeroes[i] == 0) ? 
0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// return ret_val; -// } - void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - TTDevice* dev = get_pci_device(logical_device_id)->hdev; + TTDevice *dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - write_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? } else { - write_regs (dev, addr, 1, &data); + dev->write_regs(addr, 1, &data); } } uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { - TTDevice* dev = get_pci_device(logical_device_id)->hdev; + TTDevice* dev = get_pci_device(logical_device_id); uint32_t data; if (addr < dev->bar0_uc_offset) { - read_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + dev->read_block(addr, sizeof(data), reinterpret_cast(&data)); } else { - read_regs (dev, addr, 1, &data); + dev->read_regs(addr, 1, &data); } return data; } @@ -3058,12 +1738,12 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo } log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - struct PCIdevice* pci_device = get_pci_device(logical_device_id); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + TTDevice *pci_device = get_pci_device(logical_device_id); + auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. 
std::string msg_type = "ARC_MSG"; - const scoped_lock lock(*get_mutex(msg_type, pci_device->id)); + const scoped_lock lock(*get_mutex(msg_type, pci_device->device_id)); uint32_t fw_arg = arg0 | (arg1<<16); int exit_code = 0; @@ -3108,7 +1788,7 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo } } - detect_ffffffff_read(pci_device->hdev); + // detect_ffffffff_read(pci_device); return exit_code; } @@ -3118,8 +1798,8 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; if(peer_region_id == 3) region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - struct PCIdevice* pci_device = get_pci_device(logical_device_id); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + TTDevice *pci_device = get_pci_device(logical_device_id); + auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. // TODO: Remove when ARC is implemented on BH. 
@@ -3139,15 +1819,15 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); + 
pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); } else { bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); @@ -3193,8 +1873,8 @@ uint32_t tt_SiliconDevice::get_harvested_rows (int logical_device_id) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); + TTDevice *pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); @@ -3224,48 +1904,20 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i } } -void *tt_SiliconDevice::channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - log_assert(ndesc->is_chip_mmio_capable(target.chip), "Cannot call channel_address for non-MMIO device"); - struct PCIdevice* pci_device = get_pci_device(target.chip); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); - std::uint64_t bar0_offset; - - // Temporary hack for blackhole bringup. - if (arch_name == tt::ARCH::BLACKHOLE) { - // We use BAR4 segment for mapping for Blackhole. 
- log_assert(tlbs_init, "TLBs were not initialized."); - std::int32_t tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); - auto [tlb_offset, tlb_size] = pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value(); - - log_assert(pci_device->hdev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE, "BAR4 not initialized, or TLBs not initialized properly."); - return static_cast(pci_device->hdev->bar4_wc) + tlb_offset + offset; - } else { - // This hard-codes that we use 16MB TLB #1 onwards for the mapping. - bar0_offset = offset - architecture_implementation->get_dram_channel_0_peer2peer_region_start() - + architecture_implementation->get_dynamic_tlb_16m_base() + architecture_implementation->get_dynamic_tlb_16m_size(); - } - - return static_cast(pci_device->hdev->bar0_wc) + bar0_offset; -} - void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { - if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) { return static_cast(hugepage_mapping.at(src_device_id).at(channel)) + offset; - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - return static_cast(buf_mapping) + offset; } else { return nullptr; } } // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. 
-inline struct PCIdevice* tt_SiliconDevice::get_pci_device(int device_id) const { +inline TTDevice* tt_SiliconDevice::get_pci_device(int device_id) const { if (!m_pci_device_map.count(device_id)){ throw std::runtime_error(std::string("device_id: " + std::to_string(device_id) + " attempted to be accessed, but is not enabled.")); } - return m_pci_device_map.at(device_id); + return m_pci_device_map.at(device_id).get(); } std::shared_ptr tt_SiliconDevice::get_mutex(const std::string& tlb_name, int pci_interface_id) { @@ -3273,46 +1925,6 @@ std::shared_ptr tt_SiliconDevice::get_mutex(co return hardware_resource_mutex_map.at(mutex_name); } - -std::unordered_map tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids){ - - std::unordered_map logical_to_physical_mmio_device_id_map; - - LOG1("get_logical_to_physical_mmio_device_id_map() -- num_physical_devices: %d\n", physical_device_ids.size()); - - for (int logical_device_idx=0; logical_device_idx < physical_device_ids.size(); logical_device_idx++){ - logical_to_physical_mmio_device_id_map.insert({logical_device_idx, physical_device_ids.at(logical_device_idx)}); - } - - return logical_to_physical_mmio_device_id_map; - -} - - -// Get PCI bus_id info for looking up TT devices in hwloc to find associated CPU package. 
-std::map tt_SiliconDevice::get_physical_device_id_to_bus_id_map(std::vector physical_device_ids){ - - std::map physical_device_id_to_bus_id_map; - - for (auto &pci_interface_id : physical_device_ids){ - - auto ttdev = std::make_unique(TTDevice::open(pci_interface_id)); - - std::ostringstream pci_bsf; - pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_bus << ":"; - pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_device << "."; - pci_bsf << std::hex << (int) ttdev->pci_function; - - std::string pci_bsf_str = pci_bsf.str(); - LOG2("get_physical_device_id_to_bus_id_map() -- pci_interface_id: %d BSF: %s\n", pci_interface_id, pci_bsf_str.c_str()); - physical_device_id_to_bus_id_map.insert({pci_interface_id, pci_bsf_str}); - - } - - return physical_device_id_to_bus_id_map; - -} - uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) { uint64_t result = chip_y; uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1; @@ -3345,7 +1957,6 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_ * * Relevant functions: * - write_to_non_mmio_device - * - rolled_write_to_non_mmio_device * - read_from_non_mmio_device * * The non-MMIO read/write functions (excluding the `*_epoch_cmd` variants) are responsible for the @@ -3442,8 +2053,7 @@ void tt_SiliconDevice::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); int& active_core_for_txn = non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; @@ -3579,282 +2189,6 @@ void tt_SiliconDevice::write_to_non_mmio_device( } } - -// Specialized function for small epoch commands: -// 1) uses separate eth cores than other non-mmio transfers hence does not require mutex -// 2) does not have the code paths for transfers larger than 32kB (1024 cmds) -// 3) only reads erisc_q_ptrs_epoch once, or when the queues are full -// 4) only updates wptr on eth command queues for the last epoch command or when the queue is full or when switching eth cores based on eth-ordered-writes policy, or when -// eth-ordered-writes are not supported but current write must be ordered (flush prev wrptr). -// 5) When eth-ordered-write not supported, allow flush to be used as ordering mechanism when ordering is requested via arg. When eth-ordered-write is supported, always use it -// and ensure ordering to same remote chip destinations by always using same remote xfer eth core for a given destination based on noc xy. Must ensure wrptr is flushed on -// switch of eth cores, and copy of rdptr/wrptr maintained on host for each eth xfer core. -void tt_SiliconDevice::write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!non_mmio_transfer_cores_customized, "{} cannot be used if ethernet cores for host->cluster transfers are customized. 
The default Ethernet Core configuration must be used.", __FUNCTION__); - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - const auto &mmio_capable_chip = ndesc->get_closest_mmio_capable_chip(core.chip); - const auto target_chip = ndesc->get_chip_locations().at(core.chip); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - - // read all eth queue ptrs for the first time, and initialize wrptr_updated bool for strict ordering. - if (!erisc_q_ptrs_initialized) { - for (int core_epoch = EPOCH_ETH_CORES_START_ID; core_epoch < EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS + EPOCH_ETH_CORES_START_ID; core_epoch++) { - erisc_q_ptrs_epoch[core_epoch].reserve(eth_interface_params.remote_update_ptr_size_bytes*2/sizeof(uint32_t)); - read_device_memory(erisc_q_ptrs_epoch[core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - erisc_q_wrptr_updated[core_epoch] = false; - erisc_q_ptrs_initialized = true; - } - } - - std::vector erisc_command(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - routing_cmd_t *new_cmd = (routing_cmd_t *)&erisc_command[0]; - std::vector data_block; - - // Two mechanisms for ordering depending on eth fw version. - if (use_ethernet_ordered_writes) { - // Feature in this function to ensure ordering via eth-ordered-writes by using same eth core for all epoch writes to same dest noc xy. 
- auto &soc_desc = get_soc_descriptor(mmio_capable_chip); - int core_id = core.x * soc_desc.grid_size.y + core.y; - int new_active_core_epoch = (core_id % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID; - - // Switch eth cores, and if wrptr was not flushed to device for previous eth core, do it now. - if (new_active_core_epoch != active_core_epoch) { - if (!erisc_q_wrptr_updated[active_core_epoch]) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } - active_core_epoch = new_active_core_epoch; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - } - } else if (ordered_with_prev_remote_write) { - // Flush used as ordering mechanism when eth ordered writes are unsupported. If previous write requires flush, - // handle it here before setting flush_non_mmio for the current write. - if (!erisc_q_wrptr_updated[active_core_epoch]) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } - wait_for_non_mmio_flush(); - } - - flush_non_mmio = true; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - bool use_dram = size_in_bytes > 256 * DATA_WORD_SIZE ? true : false; - uint32_t max_block_size = use_dram ? 
host_address_params.eth_routing_block_size : eth_interface_params.max_block_size; - uint32_t block_size; - - // Ethernet ordered writes must originate from same erisc core, so prevent updating active core here. - while (is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - if (!use_ethernet_ordered_writes){ - active_core_epoch++; - log_assert(active_core_epoch - EPOCH_ETH_CORES_START_ID >= 0, "Invalid ERISC core for sending epoch commands"); - active_core_epoch = ((active_core_epoch - EPOCH_ETH_CORES_START_ID) % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - } - read_device_memory(erisc_q_ptrs_epoch[active_core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - } - - uint32_t req_wr_ptr = erisc_q_ptrs_epoch[active_core_epoch][0] & eth_interface_params.cmd_buf_size_mask; - if (address & 0x1F) { // address not 32-byte aligned - // can send it in one transfer, no need to break it up - log_assert(size_in_bytes == DATA_WORD_SIZE, "Non-mmio cmd queue update is too big"); - block_size = DATA_WORD_SIZE; - } else { - // can send it in one transfer, no need to break it up - log_assert(size_in_bytes <= max_block_size, "Non-mmio cmd queue update is too big. size_in_bytes: {} exceeds max_block_size: {}", size_in_bytes, max_block_size); - block_size = size_in_bytes; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - if (use_ethernet_ordered_writes) { - req_flags |= eth_interface_params.cmd_ordered; - } - - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_epoch * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. - - // send the data - if (req_flags & eth_interface_params.cmd_data_block) { - // Copy data to sysmem or device DRAM for Block mode - if (use_dram) { - req_flags |= eth_interface_params.cmd_data_block_dram; - resp_flags |= eth_interface_params.cmd_data_block_dram; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_to_sysmem(data_block, host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); - } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); - } - tt_driver_atomics::sfence(); - } - - // send the write request - log_assert((req_flags & eth_interface_params.cmd_data_block) ? (address & 0x1F) == 0 : true, "Block mode address must be 32-byte aligned."); - - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = req_flags & eth_interface_params.cmd_data_block ? 
block_size : *mem_ptr; - new_cmd->flags = req_flags; - if (use_dram) { - new_cmd->src_addr_tag = host_dram_block_addr; - } - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - - // update the wptr only if the eth queue is full or for the last command - erisc_q_ptrs_epoch[active_core_epoch][0] = (erisc_q_ptrs_epoch[active_core_epoch][0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - if (last_send_epoch_cmd || is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } else { - erisc_q_wrptr_updated[active_core_epoch] = false; - } -} - -/* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above - */ -void tt_SiliconDevice::rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, uint32_t unroll_count) { - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); - - - std::vector erisc_command; - std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); - - std::vector data_block = std::vector(size_in_bytes / DATA_WORD_SIZE); - - routing_cmd_t *new_cmd; - - flush_non_mmio = true; - uint32_t transfer_size = size_in_bytes * unroll_count; - uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - // - // MUTEX ACQUIRE (NON-MMIO) - // do not locate any ethernet core reads/writes before this acquire - // - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - - if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); - } - - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - int& active_core_for_txn = non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - - uint32_t offset = 0; - - bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr.resize(1); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - - uint32_t unroll_offset = 0; - - while (offset < transfer_size) { - while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); - } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); - - log_assert(((address + offset) & 0x1F) == 0, "Base address + offset in incorrect range!"); - - uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - - uint32_t req_flags = eth_interface_params.cmd_data_block_dram | eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * host_address_params.eth_routing_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
- - memcpy(data_block.data(), mem_ptr, size_in_bytes); - uint32_t byte_increment = data_block.size() * DATA_WORD_SIZE; - uint32_t host_mem_offset = 0; - uint32_t i = 0; - for (i = 0; (i + unroll_offset) < unroll_count; i++) { - if ((host_mem_offset + byte_increment) > host_address_params.eth_routing_block_size) { - break; - } - data_block[0] = i + unroll_offset; - write_to_sysmem(data_block, host_dram_block_addr + host_mem_offset, host_dram_channel, mmio_capable_chip_logical); - host_mem_offset += byte_increment; - } - unroll_offset += i; - tt_driver_atomics::sfence(); - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = host_mem_offset; - new_cmd->flags = req_flags; - new_cmd->src_addr_tag = host_dram_block_addr; - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - std::vector erisc_q_wptr; - erisc_q_wptr.resize(1); - erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - offset += host_mem_offset; - - // If there is more data to send and this command will make the q full, switch to next Q. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
- // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - - if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { - active_core_for_txn++; - uint32_t update_mask_for_chip = (remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1); - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - } - } -} - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above @@ -3889,8 +2223,7 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); @@ -4186,15 +2519,14 @@ std::unordered_map>>& tt_SiliconDevice:: void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. 
- struct PCIdevice* pci_device = get_pci_device(chip); + TTDevice *pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - TTDevice *dev = pci_device->hdev; const uint8_t* buffer_addr = static_cast(mem_ptr); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); while(size_in_bytes > 0) { auto [mapped_address, tlb_size] = set_dynamic_tlb_broadcast(pci_device, tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + pci_device->write_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; addr += transfer_size; @@ -4419,18 +2751,18 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ } void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(mem_ptr, size, addr, channel, src_device_id); + write_buffer(mem_ptr, size, addr, channel, src_device_id); } void tt_SiliconDevice::write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); + write_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); } void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { - read_dma_buffer(mem_ptr, addr, channel, size, src_device_id); + read_buffer(mem_ptr, addr, channel, size, src_device_id); } void tt_SiliconDevice::read_from_sysmem(std::vector &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { size_buffer_to_capacity(vec, size); - 
read_dma_buffer(vec.data(), addr, channel, size, src_device_id); + read_buffer(vec.data(), addr, channel, size, src_device_id); } void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { @@ -4450,7 +2782,7 @@ void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordere cores_synced.insert(core); } else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + log_info(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); } } } @@ -4462,7 +2794,7 @@ void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordere void tt_SiliconDevice::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->id)); + const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->device_id)); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } @@ -4544,7 +2876,7 @@ void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fall } } -void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); if(target_is_mmio_capable) { if (fallback_tlb 
== "REG_TLB") { @@ -4552,74 +2884,29 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx } else { write_device_memory(mem_ptr, size, core, addr, fallback_tlb); } - } - else if (!send_epoch_cmd) { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - // as long as epoch commands are sent single-threaded, no need to acquire mutex - log_assert(!(size % 4), "Epoch commands must be 4 byte aligned!"); - write_to_non_mmio_device_send_epoch_cmd((uint32_t*)mem_ptr, size, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); } } - -void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - // Overloaded device writer that accepts a vector - write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - - -void tt_SiliconDevice::write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { - write_device_memory(mem_ptr, size_in_bytes, core, addr, fallback_tlb); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - write_to_non_mmio_device_send_epoch_cmd(mem_ptr, 
size_in_bytes, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); - } -} - -void tt_SiliconDevice::write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { // Overloaded device writer that accepts a vector - write_epoch_cmd_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_SiliconDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - log_assert(!(size_in_bytes % 4), "{} only supports 4-byte aligned data", __FUNCTION__); - bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); - - if (target_is_mmio_capable) { - for (int i=0; i 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); - rolled_write_to_non_mmio_device(mem_ptr, size_in_bytes, core, addr, unroll_count); - } -} - -void tt_SiliconDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - rolled_write_to_device(vec.data(), vec.size() * sizeof(uint32_t), unroll_count, core, addr, fallback_tlb); + write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb); } void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(core.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const 
scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - read_regs(dev, mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); + pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); if(aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. @@ -4629,11 +2916,10 @@ void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(core.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); @@ -4643,7 +2929,7 @@ void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pa // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } - write_regs(dev, mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); + pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } void tt_SiliconDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const 
std::string& fallback_tlb) { @@ -4694,7 +2980,7 @@ void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair & int tt_SiliconDevice::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); + TTDevice *pci_device = get_pci_device(mmio_capable_chip_logical); return remote_arc_msg(chip, get_power_state_arc_msg(pci_device, device_state), true, 0, 0, 1, NULL, NULL); } @@ -4718,7 +3004,7 @@ void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int t void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { if(arch_name == tt::ARCH::GRAYSKULL) { for (auto &device_it : m_pci_device_map) { - broadcast_pcie_tensix_risc_reset(device_it.second, soft_resets); + broadcast_pcie_tensix_risc_reset(device_it.second.get(), soft_resets); } } else { @@ -4792,14 +3078,14 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() { if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second->hdev->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); + arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); } if(ndesc != nullptr) { for(const chip_id_t& chip : target_devices_in_cluster) { if(!ndesc -> is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + auto pci_device = 
get_pci_device(mmio_capable_chip_logical); + remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); } } enable_ethernet_queue(30); @@ -4905,22 +3191,8 @@ std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, s return host_channel_size.at(device_id).at(channel); } -std::uint32_t tt_SiliconDevice::get_pcie_speed(std::uint32_t device_id) { - int link_width = 0; - int link_speed = 0; - if (ndesc->is_chip_mmio_capable(device_id)) { - PCIdevice *pci_device = get_pci_device(device_id); - link_width = get_link_width(pci_device->hdev); - link_speed = get_link_speed(pci_device->hdev); - log_debug(LogSiliconDriver, "Device {} PCIe link width: x{}, speed: {} Gb/s", device_id, link_width, link_speed); - } else { - log_debug(LogSiliconDriver, "Device {} is NOT a PCIe device, width: x{}, speed: {} Gb/s", device_id, link_width, link_speed); - } - return (link_width * link_speed); -} - std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { - return get_numa_node(get_pci_device(device_id)->hdev); + return get_pci_device(device_id)->numa_node; } std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device() const { diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp index 1649bf70..9f275668 100644 --- a/device/tt_silicon_driver_common.hpp +++ b/device/tt_silicon_driver_common.hpp @@ -9,19 +9,6 @@ #include #include - -typedef struct { - uint32_t chip_addr; - uint32_t host_phys_addr; - uint32_t completion_flag_phys_addr; - uint32_t size_bytes : 28; - uint32_t write : 1; - uint32_t pcie_msi_on_done : 1; - uint32_t pcie_write_on_done : 1; - uint32_t trigger : 1; - uint32_t repeat; -} arc_pcie_ctrl_dma_request_t; // 5 * 4 = 20B - enum class TensixSoftResetOptions: std::uint32_t { NONE = 0, BRISC = ((std::uint32_t) 1 << 11), diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 
4320b3ef..60958372 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -199,98 +199,20 @@ int tt_SocDescriptor::get_num_dram_channels() const { return num_channels; } -std::vector tt_SocDescriptor::get_dram_chan_map() { - std::vector chan_map; - for (unsigned int i = 0; i < dram_cores.size(); i++) { - chan_map.push_back(i); - } - return chan_map; -}; - bool tt_SocDescriptor::is_worker_core(const tt_xy_pair &core) const { return ( routing_x_to_worker_x.find(core.x) != routing_x_to_worker_x.end() && routing_y_to_worker_y.find(core.y) != routing_y_to_worker_y.end()); } -tt_xy_pair tt_SocDescriptor::get_worker_core(const tt_xy_pair &core) const { - tt_xy_pair worker_xy = { - static_cast(routing_x_to_worker_x.at(core.x)), static_cast(routing_y_to_worker_y.at(core.y))}; - return worker_xy; -} - -tt_xy_pair tt_SocDescriptor::get_routing_core(const tt_xy_pair& core) const { - tt_xy_pair routing_xy = { - static_cast(worker_log_to_routing_x.at(core.x)), static_cast(worker_log_to_routing_y.at(core.y))}; - return routing_xy; -} - tt_xy_pair tt_SocDescriptor::get_core_for_dram_channel(int dram_chan, int subchannel) const { return this->dram_cores.at(dram_chan).at(subchannel); }; -tt_xy_pair tt_SocDescriptor::get_pcie_core(int pcie_id) const { - return this->pcie_cores.at(pcie_id); -}; - bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end(); } -bool tt_SocDescriptor::is_dram_core(const tt_xy_pair &core) const { - static std::unordered_set cores = {}; - if (cores.empty()) { - for (const std::vector &dram_chan : this->dram_cores) { - for (const tt_xy_pair &subchannel : dram_chan) { - cores.insert(subchannel); - } - } - } - return cores.find(core) != cores.end(); -} - -int tt_SocDescriptor::get_channel_of_ethernet_core(const tt_xy_pair &core) const { - return this->ethernet_core_channel_map.at(core); -} - -int tt_SocDescriptor::get_num_dram_subchans() 
const { - int num_chan = 0; - for (const std::vector &core : this->dram_cores) { - num_chan += core.size(); - } - return num_chan; -} - -int tt_SocDescriptor::get_num_dram_blocks_per_channel() const { - int num_blocks = 0; - if (arch == tt::ARCH::GRAYSKULL) { - num_blocks = 1; - } else if (arch == tt::ARCH::WORMHOLE) { - num_blocks = 2; - } else if (arch == tt::ARCH::WORMHOLE_B0) { - num_blocks = 2; - } else if (arch == tt::ARCH::BLACKHOLE) { - num_blocks = 2; - } - return num_blocks; -} - -// Note: same as t_SiliconDevice::get_pcie_base_addr_from_device -uint64_t tt_SocDescriptor::get_noc2host_offset(uint16_t host_channel) const { - - const std::uint64_t PEER_REGION_SIZE = (1024 * 1024 * 1024); - - if (arch == tt::ARCH::GRAYSKULL) { - return (host_channel * PEER_REGION_SIZE); - }else if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) { - return (host_channel * PEER_REGION_SIZE) + 0x800000000; - } else if (arch == tt::ARCH::BLACKHOLE) { - return (host_channel * PEER_REGION_SIZE) + (1ULL << 60); - } else { - throw std::runtime_error("Unsupported architecture"); - } -} - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::JAWBRIDGE) { out << "jawbridge"; diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 2be98749..87ea1799 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -23,8 +23,6 @@ namespace YAML { class Node; } -static constexpr std::size_t DEFAULT_DRAM_SIZE_PER_CORE = 8 * 1024 * 1024; - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); static inline std::string get_arch_str(const tt::ARCH arch_name){ @@ -132,18 +130,9 @@ class tt_SocDescriptor { uint64_t dram_bank_size; int get_num_dram_channels() const; - std::vector get_dram_chan_map(); bool is_worker_core(const tt_xy_pair &core) const; - tt_xy_pair get_worker_core(const tt_xy_pair& core) const; - tt_xy_pair get_routing_core(const tt_xy_pair& core) const; tt_xy_pair 
get_core_for_dram_channel(int dram_chan, int subchannel) const; - tt_xy_pair get_pcie_core(int pcie_id = 0) const; - bool is_dram_core(const tt_xy_pair& core) const; bool is_ethernet_core(const tt_xy_pair& core) const; - int get_channel_of_ethernet_core(const tt_xy_pair &core) const; - int get_num_dram_subchans() const; - int get_num_dram_blocks_per_channel() const; - uint64_t get_noc2host_offset(uint16_t host_channel) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; diff --git a/device/tt_umd.hpp b/device/tt_umd.hpp new file mode 100644 index 00000000..0fc95860 --- /dev/null +++ b/device/tt_umd.hpp @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" +#include "tt_silicon_driver_common.hpp" +#include "device/tt_cluster_descriptor_types.h" +#include "device/tlb.h" +#include "device/tt_io.hpp" + +#include "pci_device.hpp" + +void write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); +void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); +void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); +void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); +void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); +void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); +void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); +void read_from_sysmem(std::vector &vec, uint64_t 
addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + +void wait_for_non_mmio_flush(); +void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); +void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); +void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + +void assert_risc_reset(); +void assert_risc_reset_at_core(tt_cxy_pair core); +void deassert_risc_reset(); +void deassert_risc_reset_at_core(tt_cxy_pair core); + +std::map get_clocks(); +std::set get_target_remote_device_ids(); +std::uint32_t get_num_host_channels(std::uint32_t device_id); +std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); +void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel); // should prob be a get? +std::uint64_t get_pcie_base_addr_from_device(); +std::unordered_map get_harvesting_masks_for_soc_descriptors(); +std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); +std::vector detect_available_device_ids(); // should move all calls of this completely into umd + +// Fast-dispatch workaround :( +std::function get_fast_pcie_static_tlb_write_callable(int device_id); +tt::Writer get_static_tlb_writer(tt_cxy_pair target); + diff --git a/device/tt_versim_device.cpp b/device/tt_versim_device.cpp deleted file mode 100644 index e7ac7506..00000000 --- a/device/tt_versim_device.cpp +++ /dev/null @@ -1,323 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - - -#include "tt_device.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include -#include -#include -#include - -#include "yaml-cpp/yaml.h" - -// TODO: Remove dependency on command_assembler + soc -#include "command_assembler/soc.h" -#include "device/tt_cluster_descriptor.h" -namespace CA = CommandAssembler; - - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; - } - soc.SetNodeProperties(node.noc_coord, node); - } -} - -//////// -// Device Versim -//////// - -#include "device.h" -#include "sim_interactive.h" -#include - -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } -} - -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - 
-tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); -} - -void tt_VersimDevice::close_device() { - stop(); -} - -void tt_VersimDevice::start( - std::vector plusargs, - std::vector dump_cores, - bool no_checkers, - bool /*init_device*/, - bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; - - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } - - std::vector vcd_cores; - - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. mainly bypasses arch_configs etc from llir. 
We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End - - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); - - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); - - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); - - p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - - std::cout << "Versim Device: Done start " << std::endl; -} - -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} - -// bool tt_VersimDevice::run() { -// std::cout << "Versim Device: Run " << std::endl; - -// // Run Versim main_loop -// versim::startup_versim_main_loop(*versim); - -// return true; -// } - -void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); -} - -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) { - // This function deasserts reset on the full 
versim device (don't need core level granularity for versim) - deassert_risc_reset(); -} - -void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); - - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); - - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); -} - -void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); -} - -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - CommandAssembler::memory CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); -} - -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, 
(uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} -void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips -} - -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - - CommandAssembler::xy_pair CA_target(core.x, core.y); - - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); 
- vec = result; -} - -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - - CommandAssembler::xy_pair CA_target(core.x, core.y); - - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); -} - -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; -} - -std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; -} - -std::set tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; -} - - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } -int tt_VersimDevice::detect_number_of_chips() { return 1; } - -bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } -bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -// Meant to breakout running functions for simulator -bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << 
std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); - } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; -} - -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} - -void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; -} - -void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { - dram_address_params = dram_address_params_; -} - -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); -} - -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now -} - -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { - // Host buffers not allocated for Versim Devices - return 0; -} - -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { - // Host buffers not allocated for Versim Devices - return 0; -} \ No newline at end of file diff --git a/device/tt_versim_stub.cpp b/device/tt_versim_stub.cpp deleted file mode 100644 index 27c69f80..00000000 --- a/device/tt_versim_stub.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - -#include "tt_device.h" - -#include -#include -#include -#include - -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); -} - -tt_VersimDevice::~tt_VersimDevice () {} - -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); - return soc_descriptor_per_chip; -} - -int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } -int tt_VersimDevice::detect_number_of_chips() { return 0; } - -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} -void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, 
const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} -void tt_VersimDevice::wait_for_non_mmio_flush() {} - -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} - -void tt_VersimDevice::start( - std::vector plusargs, - std::vector dump_cores, - bool no_checkers, - bool /*init_device*/, - bool /*skip_driver_allocs*/ -) {} - -void tt_VersimDevice::deassert_risc_reset() {} -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::assert_risc_reset() {} -void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; -// void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} - -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} - -bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; -} - -bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } -bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} - -bool tt_VersimDevice::stop() { return true; } - -void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} -void tt_VersimDevice::set_device_dram_address_params(const 
tt_device_dram_address_params& dram_address_params_) {} - -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} - -std::map tt_VersimDevice::get_clocks() {return std::map();} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} - diff --git a/device/wormhole/impl_device.hpp b/device/wormhole/impl_device.hpp deleted file mode 100644 index 227cac48..00000000 --- a/device/wormhole/impl_device.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. 
The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp similarity index 98% rename from 
device/wormhole_implementation.cpp rename to device/wormhole/wormhole_implementation.cpp index 9295e2de..96722311 100644 --- a/device/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/wormhole_implementation.h" +#include "wormhole_implementation.h" namespace tt::umd { diff --git a/device/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h similarity index 100% rename from device/wormhole_implementation.h rename to device/wormhole/wormhole_implementation.h diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index d6c938aa..23816841 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -12,7 +12,7 @@ #include #include -#include "device/blackhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" #include "device/tt_cluster_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp deleted file mode 100644 index e54fa8f0..00000000 --- a/tests/emulation/test_emulation_device.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "gtest/gtest.h" -#include "device/tt_soc_descriptor.h" -#include "device/tt_device.h" -#include "device/tt_emulation_device.h" - -TEST(EmulationDeviceGS, BasicEmuTest) { - tt_emulation_device device = tt_emulation_device("../../tests/soc_descs/grayskull_10x12.yaml"); - tt_device_params default_params; - - std::size_t phys_x = 1; - std::size_t phys_y = 1; - tt_xy_pair core = tt_xy_pair(phys_x, phys_y); - - uint32_t size = 16; - uint64_t l1_addr = 0x1000; - std::vector wdata(size); - std::vector rdata(size); - - try { - device.start_device(default_params); - - for (auto &byte : wdata) { - byte = rand(); - } - device.write_to_device(wdata, tt_cxy_pair(0, core), l1_addr, "l1"); - 
device.read_from_device(rdata, tt_cxy_pair(0, core), l1_addr, size, "l1"); - ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - - device.deassert_risc_reset(); - device.write_to_device(wdata, tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - device.assert_risc_reset(); - device.write_to_device(wdata, tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - - - } catch (const std::exception &e) { - std::cout << "Error: " << e.what() << std::endl; - } - device.close_device(); -} diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index f6bd28e8..ecf99862 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -76,13 +76,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .rolled_write = 0.2, .read = 0.4, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 0.40, .read = 0.4}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 
0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), 
//WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .rolled_write = 0, .read = 0.1, .epoch_cmd_write = 0.8}, + transfer_type_weights_t{.write = 0.1, .read = 0.1}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index d8324f13..d890d8a9 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "tt_device.h" #include "device/tt_soc_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/test_utils/stimulus_generators.hpp 
b/tests/test_utils/stimulus_generators.hpp index 094f06cb..6d35afb8 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -36,7 +36,7 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; -enum RemoteTransferType : uint8_t { WRITE = 0, ROLLED_WRITE, READ, EPOCH_CMD_WRITE }; +enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < typename SAMPLE_T, @@ -102,14 +102,6 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; -struct rolled_write_transfer_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - int unroll_count; - std::string tlb_to_use; - // (payload, 2, destination, address, tlb_to_use); -}; struct read_transfer_sample_t { destination_t destination; address_t address; @@ -117,17 +109,8 @@ struct read_transfer_sample_t { std::string tlb_to_use; // (payload.data(), destination, address, size, tlb_to_use); }; -struct write_epoch_cmd_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - std::string tlb_to_use; - bool last_epoch_command; - bool ordered_with_prev_remote_write; - // (payload.data(), size, destination, address, tlb_to_use, last_epoch_command, ordered_with_prev_remote_write); -}; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = std::tuple>; template < template @@ -267,25 +250,6 @@ template < template class WRITE_SIZE_DISTR_T, - template - class WRITE_EPOCH_CMD_DEST_DISTR_T, - template - class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template - class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, - - template - class ROLLED_WRITE_DEST_DISTR_T, - template - class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template - class ROLLED_WRITE_SIZE_DISTR_T, - 
template - class ROLLED_WRITE_UNROLL_DISTR_T, - template class READ_DEST_DISTR_T, template @@ -299,8 +263,6 @@ class TestGenerator { using transfer_type_generator_t = DefaultTransferTypeGenerator; // ConstrainedTemplateTemplateGenerator; using write_command_generator_t = WriteCommandGenerator; - using write_epoch_cmd_command_generator_t = WriteEpochCmdCommandGenerator; - using rolled_write_command_generator_t = RolledWriteCommandGenerator; using read_command_generator_t = ReadCommandGenerator; public: @@ -308,14 +270,10 @@ class TestGenerator { int seed, transfer_type_generator_t const& transfer_type_distribution, write_command_generator_t const& write_command_generator, - rolled_write_command_generator_t const& rolled_write_command_generator, - write_epoch_cmd_command_generator_t const& write_epoch_cmd_command_generator, read_command_generator_t const& read_command_generator) : generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - rolled_write_command_generator(rolled_write_command_generator), - write_epoch_cmd_command_generator(write_epoch_cmd_command_generator), read_command_generator(read_command_generator) { } @@ -338,34 +296,6 @@ class TestGenerator { .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; - case RemoteTransferType::ROLLED_WRITE: { - destination_t const& destination = rolled_write_command_generator.destination_generator.generate(); - address_t const& address = rolled_write_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = rolled_write_command_generator.size_generator.generate(); - int unroll_count = rolled_write_command_generator.unroll_generator.generate(); - return {transfer_type, rolled_write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .unroll_count = unroll_count, - .tlb_to_use = "LARGE_WRITE_TLB"}}; - } break; - - case RemoteTransferType::EPOCH_CMD_WRITE: { - destination_t const& 
destination = write_epoch_cmd_command_generator.destination_generator.generate(); - address_t const& address = write_epoch_cmd_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = write_epoch_cmd_command_generator.size_generator.generate(); - bool last_epoch_cmd = write_epoch_cmd_command_generator.last_cmd_generator.generate(); - bool ordered_with_prev_remote_write = write_epoch_cmd_command_generator.ordered_generator.generate(); - return {transfer_type, write_epoch_cmd_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_WRITE_TLB", - .last_epoch_command = last_epoch_cmd, - .ordered_with_prev_remote_write = ordered_with_prev_remote_write}}; - } break; - case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); @@ -388,22 +318,17 @@ class TestGenerator { transfer_type_generator_t transfer_type_distribution; write_command_generator_t write_command_generator; - rolled_write_command_generator_t rolled_write_command_generator; - write_epoch_cmd_command_generator_t write_epoch_cmd_command_generator; read_command_generator_t read_command_generator; }; struct transfer_type_weights_t { double write; - double rolled_write; double read; - double epoch_cmd_write; }; static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto rolled_write_transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 
1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; template @@ -433,28 +358,12 @@ static void print_command(remote_transfer_sample_t const& command) { << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = - std::get(std::get<1>(command)); - std::cout << "Transfer type: ROLLED_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", unroll_count: " << command_args.unroll_count << std::endl; - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "Transfer type: EPOCH_CMD_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", last_cmd: " << (command_args.last_epoch_command ? " True" : "False") - << ", ordered_w_prev_remote_write: " << (command_args.ordered_with_prev_remote_write ? 
" True" : "False") << std::endl; - } break; default: throw std::runtime_error("Invalid transfer type"); }; } @@ -479,14 +388,7 @@ static inline void dispatch_remote_transfer_command( write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args. - tlb_to_use, false, false); - } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); + driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); @@ -494,12 +396,6 @@ static inline void dispatch_remote_transfer_command( resize_payload(payload,command_args.size_in_bytes); driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_epoch_cmd_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; 
default: throw std::runtime_error("Invalid transfer type"); }; @@ -524,16 +420,9 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", false, false);" << std::endl; + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->rolled_write_to_device(payload, " << command_args.unroll_count << ", destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; @@ -541,15 +430,6 @@ static void 
print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_epoch_cmd_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", " << (command_args.last_epoch_command ? "true":"false") - << "\", " << (command_args.ordered_with_prev_remote_write ? 
"true":"false") << ");" << std::endl; - // driver.write_epoch_cmd_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -572,18 +452,6 @@ template< template class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, template class WRITE_SIZE_DISTR_T, - - template class ROLLED_WRITE_DEST_DISTR_T, - template class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template class ROLLED_WRITE_SIZE_DISTR_T, - template class ROLLED_WRITE_UNROLL_COUNT_DISTR_T, - - template class WRITE_EPOCH_CMD_DEST_DISTR_T, - template class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, template class READ_DEST_DISTR_T, template class READ_ADDR_DISTR_T, @@ -598,8 +466,6 @@ void RunMixedTransfers( transfer_type_weights_t const& transfer_type_weights, WriteCommandGenerator const& write_command_generator, - RolledWriteCommandGenerator const& rolled_write_command_generator, - WriteEpochCmdCommandGenerator const& write_epoch_cmd_command_generator, ReadCommandGenerator const& read_command_generator, bool record_command_history = false, @@ -609,14 +475,12 @@ void RunMixedTransfers( auto test_generator = TestGenerator( seed, {seed, - {transfer_type_weights.write, transfer_type_weights.rolled_write, transfer_type_weights.read, transfer_type_weights.epoch_cmd_write}, + {transfer_type_weights.write, transfer_type_weights.read}, [](int transfer_type) -> RemoteTransferType { assert(transfer_type < 4); return static_cast(transfer_type); }}, write_command_generator, - rolled_write_command_generator, - write_epoch_cmd_command_generator, read_command_generator); if (record_command_history) { @@ -663,58 +527,6 @@ static ConstrainedTemplateTemplateGenerator 
destination_t { return core_index_to_location.at(dest); }); } - -static RolledWriteCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - transfer_size_t, - std::uniform_int_distribution, - std::uniform_int_distribution -> - build_dummy_rolled_write_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), rolled_write_transfer_size_aligner); - auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), [](int unroll_count) -> int { return unroll_count; }); - - return RolledWriteCommandGenerator( - dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator); -} - -static WriteEpochCmdCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - std::uniform_int_distribution, - std::bernoulli_distribution, - std::bernoulli_distribution -> build_dummy_write_epoch_cmd_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = 
ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); - auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); - auto ordered_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); - - return WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator); -} - static WriteCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, @@ -764,10 +576,6 @@ template< template class WRITE_SIZE_GENERATOR_T, template - class ROLLED_WRITE_SIZE_GENERATOR_T, - template - class WRITE_EPOCH_CMD_SIZE_GENERATOR_T, - template class READ_SIZE_GENERATOR_T, template class UNROLL_COUNT_GENERATOR_T @@ -780,9 +588,7 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights_t const& transfer_type_weights, ADDR_GENERATOR_T const& address_distribution, WRITE_SIZE_GENERATOR_T const& write_size_distribution, - ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution, - WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -802,12 +608,8 @@ void RunMixedTransfersUniformDistributions( auto addr_generator_32B_aligned = 
ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); auto write_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, write_size_distribution, transfer_size_aligner); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, rolled_write_size_distribution, rolled_write_transfer_size_aligner); auto read_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, read_size_distribution, transfer_size_aligner); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_epoch_cmd_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); auto ordered_generator = ConstrainedTemplateGenerator( @@ -823,9 +625,6 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights, WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), - RolledWriteCommandGenerator(dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator), - WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), record_command_history, diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index df686dfa..6551b3cc 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -13,7 +13,7 @@ #include "host_mem_address_map.h" #include "device/tt_cluster_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "tests/test_utils/generate_cluster_desc.hpp" void set_params_for_remote_txn(tt_SiliconDevice& device) { diff --git 
a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 36c02914..96fef09a 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25}, + transfer_type_weights_t{.write = 0.25, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, 
std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeNebulaX2TestFixture, 
MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 1.0, .read = 0.0}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -206,13 +196,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) { 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.15, .rolled_write = 0, .read = 0.15, .epoch_cmd_write = 0.7}, + transfer_type_weights_t{.write = 0.15, .read = 0.15}, std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 300000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -247,11 +235,9 @@ TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinS 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + 
transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -279,13 +265,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -300,13 +284,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - 
std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -321,13 +303,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -342,13 +322,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 1.0, .read = 0.0}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), 
//READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -387,11 +365,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -404,11 +380,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -421,11 +395,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0, .read = 1.}, build_dummy_write_command_generator(*device), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), false, // Set to true if you want to emit the command history code to command line @@ -438,11 +410,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, 
- transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0, .read = 1.}, build_dummy_write_command_generator(*device), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), false, // Set to true if you want to emit the command history code to command line