From dbd2bb6e8597efeeba4a9df0e3bc159a92486ff4 Mon Sep 17 00:00:00 2001
From: Bojan Rosko <brosko@tenstorrent.com>
Date: Tue, 5 Nov 2024 11:07:34 +0000
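Subject: [PATCH] Apply clang-format to the device directory

Remove the device/.clang-format override (DisableFormat: true,
SortIncludes: false) and reformat the sources under device/.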

---
 device/.clang-format                          |    2 -
 device/architecture_implementation.cpp        |   12 +-
 device/architecture_implementation.h          |    7 +-
 .../blackhole/blackhole_coordinate_manager.h  |    8 +-
 device/blackhole/blackhole_implementation.cpp |   14 +-
 device/blackhole/blackhole_implementation.h   |   96 +-
 device/coordinate_manager.cpp                 |   26 +-
 device/coordinate_manager.h                   |   19 +-
 device/cpuset_lib.cpp                         |  489 ++--
 device/cpuset_lib.hpp                         |  168 +-
 device/device_api_metal.h                     |    2 +-
 device/driver_atomics.h                       |   34 +-
 .../grayskull/grayskull_coordinate_manager.h  |    6 +-
 device/grayskull/grayskull_implementation.cpp |    7 +-
 device/grayskull/grayskull_implementation.h   |   43 +-
 device/ioctl.h                                |  142 +-
 device/mockup/tt_mockup_device.hpp            |   37 +-
 device/pcie/pci_device.cpp                    |  284 +-
 device/pcie/pci_device.hpp                    |   82 +-
 .../deprecated/tt_emulation_device.cpp        |  244 +-
 .../deprecated/tt_emulation_device.h          |  123 +-
 .../deprecated/tt_emulation_stub.cpp          |  105 +-
 .../deprecated/tt_versim_device.cpp           |  435 +--
 .../simulation/deprecated/tt_versim_device.h  |   98 +-
 .../simulation/deprecated/tt_versim_stub.cpp  |  119 +-
 device/simulation/tt_simulation_device.cpp    |  125 +-
 device/simulation/tt_simulation_device.h      |   33 +-
 device/simulation/tt_simulation_host.cpp      |   19 +-
 device/simulation/tt_simulation_host.hpp      |    3 +-
 device/tlb.h                                  |    8 +-
 device/tt_arch_types.h                        |    2 +-
 device/tt_cluster_descriptor.cpp              |  463 ++--
 device/tt_cluster_descriptor.h                |  173 +-
 device/tt_cluster_descriptor_types.h          |   18 +-
 device/tt_device.cpp                          |   20 +-
 device/tt_device.h                            |  520 ++--
 device/tt_io.hpp                              |   26 +-
 device/tt_silicon_driver.cpp                  | 2460 +++++++++++------
 device/tt_silicon_driver_common.cpp           |   28 +-
 device/tt_silicon_driver_common.hpp           |   55 +-
 device/tt_soc_descriptor.cpp                  |   76 +-
 device/tt_soc_descriptor.h                    |   65 +-
 device/tt_xy_pair.h                           |   12 +
 .../wormhole/wormhole_coordinate_manager.cpp  |    6 +-
 device/wormhole/wormhole_coordinate_manager.h |    8 +-
 device/wormhole/wormhole_implementation.cpp   |    7 +-
 device/wormhole/wormhole_implementation.h     |   40 +-
 device/xy_pair.h                              |    7 +-
 48 files changed, 4205 insertions(+), 2571 deletions(-)
 delete mode 100644 device/.clang-format

diff --git a/device/.clang-format b/device/.clang-format
deleted file mode 100644
index 9d159247..00000000
--- a/device/.clang-format
+++ /dev/null
@@ -1,2 +0,0 @@
-DisableFormat: true
-SortIncludes: false
diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp
index 7cd1dac8..186c6c14 100644
--- a/device/architecture_implementation.cpp
+++ b/device/architecture_implementation.cpp
@@ -12,10 +12,14 @@ namespace tt::umd {
 
 std::unique_ptr<architecture_implementation> architecture_implementation::create(tt::ARCH architecture) {
     switch (architecture) {
-        case tt::ARCH::BLACKHOLE: return std::make_unique<blackhole_implementation>();
-        case tt::ARCH::GRAYSKULL: return std::make_unique<grayskull_implementation>();
-        case tt::ARCH::WORMHOLE_B0: return std::make_unique<wormhole_implementation>();
-        default: return nullptr;
+        case tt::ARCH::BLACKHOLE:
+            return std::make_unique<blackhole_implementation>();
+        case tt::ARCH::GRAYSKULL:
+            return std::make_unique<grayskull_implementation>();
+        case tt::ARCH::WORMHOLE_B0:
+            return std::make_unique<wormhole_implementation>();
+        default:
+            return nullptr;
     }
 }
 
diff --git a/device/architecture_implementation.h b/device/architecture_implementation.h
index 41767081..5f966255 100644
--- a/device/architecture_implementation.h
+++ b/device/architecture_implementation.h
@@ -12,15 +12,15 @@
 #include <vector>
 
 #include "device/tlb.h"
-#include "device/xy_pair.h"
 #include "device/tt_arch_types.h"
+#include "device/xy_pair.h"
 
 struct tt_driver_host_address_params;
 
 namespace tt::umd {
 
 class architecture_implementation {
-   public:
+public:
     virtual ~architecture_implementation() = default;
 
     virtual tt::ARCH get_architecture() const = 0;
@@ -63,7 +63,8 @@ class architecture_implementation {
     virtual std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const = 0;
     virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0;
     virtual std::optional<std::tuple<std::uint64_t, std::uint64_t>> describe_tlb(std::int32_t tlb_index) const = 0;
-    virtual std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0;
+    virtual std::pair<std::uint64_t, std::uint64_t> get_tlb_data(
+        std::uint32_t tlb_index, const tlb_data& data) const = 0;
 
     virtual tt_driver_host_address_params get_host_address_params() const = 0;
 
diff --git a/device/blackhole/blackhole_coordinate_manager.h b/device/blackhole/blackhole_coordinate_manager.h
index 76f1ebc6..88d385ad 100644
--- a/device/blackhole/blackhole_coordinate_manager.h
+++ b/device/blackhole/blackhole_coordinate_manager.h
@@ -9,15 +9,15 @@
 #include "device/coordinate_manager.h"
 
 class BlackholeCoordinateManager : public CoordinateManager {
-
 public:
-    BlackholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask)
-        : CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
+    BlackholeCoordinateManager(
+        const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask) :
+        CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
 
     tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override;
 
     tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override;
 
-protected: 
+protected:
     std::set<std::size_t> get_x_coordinates_to_harvest(std::size_t harvesting_mask) override;
 };
\ No newline at end of file
diff --git a/device/blackhole/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp
index 91d80bc7..c3b39da8 100644
--- a/device/blackhole/blackhole_implementation.cpp
+++ b/device/blackhole/blackhole_implementation.cpp
@@ -4,9 +4,8 @@
 
 #include "blackhole_implementation.h"
 
-#include "src/firmware/riscv/blackhole/host_mem_address_map.h"
-
 #include "device/tt_device.h"
+#include "src/firmware/riscv/blackhole/host_mem_address_map.h"
 
 namespace tt::umd {
 
@@ -22,10 +21,9 @@ std::tuple<xy_pair, xy_pair> blackhole_implementation::multicast_workaround(xy_p
 }
 
 tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_index) const {
-
     // If TLB index is in range for 4GB tlbs (8 TLBs after 202 TLBs for 2MB)
     if (tlb_index >= blackhole::TLB_COUNT_2M && tlb_index < blackhole::TLB_COUNT_2M + blackhole::TLB_COUNT_4G) {
-        return tlb_configuration {
+        return tlb_configuration{
             .size = blackhole::DYNAMIC_TLB_4G_SIZE,
             .base = blackhole::DYNAMIC_TLB_4G_BASE,
             .cfg_addr = blackhole::DYNAMIC_TLB_4G_CFG_ADDR,
@@ -33,7 +31,7 @@ tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_i
             .offset = blackhole::TLB_4G_OFFSET,
         };
     }
-    
+
     return tlb_configuration{
         .size = blackhole::DYNAMIC_TLB_2M_SIZE,
         .base = blackhole::DYNAMIC_TLB_2M_BASE,
@@ -69,17 +67,17 @@ std::optional<std::tuple<std::uint64_t, std::uint64_t>> blackhole_implementation
 
 std::pair<std::uint64_t, std::uint64_t> blackhole_implementation::get_tlb_data(
     std::uint32_t tlb_index, const tlb_data& data) const {
-
     if (tlb_index < blackhole::TLB_COUNT_2M) {
         return data.apply_offset(blackhole::TLB_2M_OFFSET);
     } else {
         throw std::runtime_error("Invalid TLB index for Blackhole arch");
     }
-
 }
 
 tt_driver_host_address_params blackhole_implementation::get_host_address_params() const {
-    return {::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
+    return {
+        ::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE,
+        ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
 }
 
 }  // namespace tt::umd
diff --git a/device/blackhole/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h
index 9a7799b7..2bb7c678 100644
--- a/device/blackhole/blackhole_implementation.h
+++ b/device/blackhole/blackhole_implementation.h
@@ -7,10 +7,10 @@
 #pragma once
 
 #include <array>
+#include <stdexcept>
 
 #include "device/architecture_implementation.h"
 #include "device/tlb.h"
-#include <stdexcept>
 
 namespace tt::umd {
 
@@ -59,30 +59,8 @@ enum class arc_message_type {
 
 // DEVICE_DATA
 static constexpr std::array<xy_pair, 24> DRAM_LOCATIONS = {
-    {{0, 0},
-     {0, 1},
-     {0, 11},
-     {0, 2},
-     {0, 10},
-     {0, 3},
-     {0, 9},
-     {0, 4},
-     {0, 8},
-     {0, 5},
-     {0, 7},
-     {0, 6},
-     {9, 0},
-     {9, 1},
-     {9, 11},
-     {9, 2},
-     {9, 10},
-     {9, 3},
-     {9, 9},
-     {9, 4},
-     {9, 8},
-     {9, 5},
-     {9, 7},
-     {9, 6}}};
+    {{0, 0}, {0, 1}, {0, 11}, {0, 2}, {0, 10}, {0, 3}, {0, 9}, {0, 4}, {0, 8}, {0, 5}, {0, 7}, {0, 6},
+     {9, 0}, {9, 1}, {9, 11}, {9, 2}, {9, 10}, {9, 3}, {9, 9}, {9, 4}, {9, 8}, {9, 5}, {9, 7}, {9, 6}}};
 
 static constexpr std::array<xy_pair, 1> ARC_LOCATIONS = {{{8, 0}}};
 static constexpr std::array<xy_pair, 1> PCI_LOCATIONS = {{{11, 0}}};
@@ -113,14 +91,14 @@ static constexpr uint32_t BROADCAST_TLB_INDEX = 0;     // TODO: Copied from worm
 static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000;
 
 static constexpr uint32_t TLB_COUNT_2M = 202;
-static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0
+static constexpr uint32_t TLB_BASE_2M = 0;  // 0 in BAR0
 static constexpr uint32_t TLB_BASE_INDEX_2M = 0;
 static constexpr uint32_t TLB_2M_SIZE = 2 * 1024 * 1024;
 
 static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 12;
 
 static constexpr uint32_t TLB_COUNT_4G = 8;
-static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4
+static constexpr uint32_t TLB_BASE_4G = 0;  // 0 in BAR4
 static constexpr uint32_t TLB_BASE_INDEX_4G = TLB_COUNT_2M;
 static constexpr uint64_t TLB_4G_SIZE = 4ULL * 1024ULL * 1024ULL * 1024ULL;
 static constexpr uint64_t DYNAMIC_TLB_4G_SIZE = TLB_4G_SIZE;
@@ -168,59 +146,108 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97;
 }  // namespace blackhole
 
 class blackhole_implementation : public architecture_implementation {
-   public:
+public:
     tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; }
+
     uint32_t get_arc_message_arc_get_harvesting() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::ARC_GET_HARVESTING);
     }
+
     uint32_t get_arc_message_arc_go_busy() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::ARC_GO_BUSY);
     }
+
     uint32_t get_arc_message_arc_go_long_idle() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::ARC_GO_LONG_IDLE);
     }
+
     uint32_t get_arc_message_arc_go_short_idle() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::ARC_GO_SHORT_IDLE);
     }
+
     uint32_t get_arc_message_deassert_riscv_reset() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::DEASSERT_RISCV_RESET);
     }
+
     uint32_t get_arc_message_get_aiclk() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::GET_AICLK);
     }
+
     uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override {
         return static_cast<uint32_t>(blackhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER);
     }
+
     uint32_t get_arc_message_test() const override { return static_cast<uint32_t>(blackhole::arc_message_type::TEST); }
-    uint32_t get_arc_csm_mailbox_offset() const override { throw std::runtime_error("Not supported for Blackhole arch"); return 0; }
+
+    uint32_t get_arc_csm_mailbox_offset() const override {
+        throw std::runtime_error("Not supported for Blackhole arch");
+        return 0;
+    }
+
     uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return blackhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; }
+
     uint32_t get_arc_reset_scratch_offset() const override { return blackhole::ARC_RESET_SCRATCH_OFFSET; }
+
     uint32_t get_dram_channel_0_peer2peer_region_start() const override {
         return blackhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START;
     }
+
     uint32_t get_dram_channel_0_x() const override { return blackhole::DRAM_CHANNEL_0_X; }
+
     uint32_t get_dram_channel_0_y() const override { return blackhole::DRAM_CHANNEL_0_Y; }
+
     uint32_t get_broadcast_tlb_index() const override { return blackhole::BROADCAST_TLB_INDEX; }
+
     uint32_t get_dynamic_tlb_2m_base() const override { return blackhole::DYNAMIC_TLB_2M_BASE; }
+
     uint32_t get_dynamic_tlb_2m_size() const override { return blackhole::DYNAMIC_TLB_2M_SIZE; }
-    uint32_t get_dynamic_tlb_16m_base() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; }
-    uint32_t get_dynamic_tlb_16m_size() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; }
-    uint32_t get_dynamic_tlb_16m_cfg_addr() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; }
+
+    uint32_t get_dynamic_tlb_16m_base() const override {
+        throw std::runtime_error("No 16MB TLBs for Blackhole arch");
+        return 0;
+    }
+
+    uint32_t get_dynamic_tlb_16m_size() const override {
+        throw std::runtime_error("No 16MB TLBs for Blackhole arch");
+        return 0;
+    }
+
+    uint32_t get_dynamic_tlb_16m_cfg_addr() const override {
+        throw std::runtime_error("No 16MB TLBs for Blackhole arch");
+        return 0;
+    }
+
     uint32_t get_mem_large_read_tlb() const override { return blackhole::MEM_LARGE_READ_TLB; }
+
     uint32_t get_mem_large_write_tlb() const override { return blackhole::MEM_LARGE_WRITE_TLB; }
+
     uint32_t get_static_tlb_cfg_addr() const override { return blackhole::STATIC_TLB_CFG_ADDR; }
-    uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE;  }
+
+    uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; }
+
     uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; }
-    uint32_t get_tlb_base_index_16m() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0;  }
+
+    uint32_t get_tlb_base_index_16m() const override {
+        throw std::runtime_error("No 16MB TLBs for Blackhole arch");
+        return 0;
+    }
+
     uint32_t get_tensix_soft_reset_addr() const override { return blackhole::TENSIX_SOFT_RESET_ADDR; }
+
     uint32_t get_grid_size_x() const override { return blackhole::GRID_SIZE_X; }
+
     uint32_t get_grid_size_y() const override { return blackhole::GRID_SIZE_Y; }
+
     uint32_t get_tlb_cfg_reg_size_bytes() const override { return blackhole::TLB_CFG_REG_SIZE_BYTES; }
+
     uint32_t get_small_read_write_tlb() const override { return blackhole::MEM_SMALL_READ_WRITE_TLB; }
+
     const std::vector<uint32_t>& get_harvesting_noc_locations() const override {
         return blackhole::HARVESTING_NOC_LOCATIONS;
     }
+
     const std::vector<uint32_t>& get_t6_x_locations() const override { return blackhole::T6_X_LOCATIONS; }
+
     const std::vector<uint32_t>& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; }
 
     std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
@@ -229,7 +256,6 @@ class blackhole_implementation : public architecture_implementation {
     std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
 
     tt_driver_host_address_params get_host_address_params() const override;
-
 };
 
 }  // namespace tt::umd
diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp
index 438e002f..de6a7649 100644
--- a/device/coordinate_manager.cpp
+++ b/device/coordinate_manager.cpp
@@ -1,5 +1,7 @@
 #include "device/coordinate_manager.h"
+
 #include <memory>
+
 #include "coordinate_manager.h"
 #include "grayskull/grayskull_coordinate_manager.h"
 
@@ -66,13 +68,9 @@ void CoordinateManager::clear_harvesting_structures() {
     virtual_y_to_logical_y.clear();
 }
 
-std::set<std::size_t> CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) {
-    return {};
-}
+std::set<std::size_t> CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; }
 
-std::set<std::size_t> CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) {
-    return {};
-}
+std::set<std::size_t> CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; }
 
 void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) {
     clear_harvesting_structures();
@@ -99,14 +97,16 @@ void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) {
     logical_x_to_virtual_x.resize(grid_size_x - num_harvested_x);
     logical_y_to_virtual_y.resize(grid_size_y - num_harvested_y);
 
-    fill_logical_to_physical_mapping(x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested);
+    fill_logical_to_physical_mapping(
+        x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested);
     fill_logical_to_virtual_mapping(physical_x_unharvested, physical_y_unharvested);
 }
 
 void CoordinateManager::fill_logical_to_physical_mapping(
-    const std::set<size_t>& x_to_harvest, const std::set<size_t>& y_to_harvest,
-    const std::set<size_t>& physical_x_unharvested, const std::set<size_t>& physical_y_unharvested) {
-    
+    const std::set<size_t>& x_to_harvest,
+    const std::set<size_t>& y_to_harvest,
+    const std::set<size_t>& physical_x_unharvested,
+    const std::set<size_t>& physical_y_unharvested) {
     auto physical_y_it = physical_y_unharvested.begin();
     std::size_t logical_y = 0;
     for (size_t y = 0; y < worker_grid_size.y; y++) {
@@ -125,7 +125,7 @@ void CoordinateManager::fill_logical_to_physical_mapping(
 
     auto physical_x_it = physical_x_unharvested.begin();
     std::size_t logical_x = 0;
-    for(std::size_t x = 0; x < worker_grid_size.x; x++) {
+    for (std::size_t x = 0; x < worker_grid_size.x; x++) {
         if (x_to_harvest.find(x) == x_to_harvest.end()) {
             logical_x_to_physical_x[logical_x] = *physical_x_it;
             if (physical_x_to_logical_x.find(*physical_x_it) != physical_x_to_logical_x.end()) {
@@ -140,7 +140,8 @@ void CoordinateManager::fill_logical_to_physical_mapping(
     }
 }
 
-void CoordinateManager::fill_logical_to_virtual_mapping(const std::set<size_t>& physical_x_unharvested, const std::set<size_t>& physical_y_unharvested) {
+void CoordinateManager::fill_logical_to_virtual_mapping(
+    const std::set<size_t>& physical_x_unharvested, const std::set<size_t>& physical_y_unharvested) {
     auto physical_y_it = physical_y_unharvested.begin();
     for (std::size_t y = 0; y < logical_y_to_virtual_y.size(); y++) {
         logical_y_to_virtual_y[y] = *physical_y_it;
@@ -171,7 +172,6 @@ std::unique_ptr<CoordinateManager> CoordinateManager::get_coordinate_manager(
     const tt_xy_pair& worker_grid_size,
     const std::vector<tt_xy_pair>& workers,
     std::size_t harvesting_mask) {
-
     switch (arch) {
         case tt::ARCH::GRAYSKULL:
             return std::make_unique<GrayskullCoordinateManager>(worker_grid_size, workers, harvesting_mask);
diff --git a/device/coordinate_manager.h b/device/coordinate_manager.h
index a71764df..967e5237 100644
--- a/device/coordinate_manager.h
+++ b/device/coordinate_manager.h
@@ -7,17 +7,17 @@
 #pragma once
 
 #include <map>
-#include <vector>
 #include <set>
+#include <vector>
 
-#include "device/tt_xy_pair.h"
 #include "device/tt_arch_types.h"
+#include "device/tt_xy_pair.h"
 
 class CoordinateManager {
-
 public:
-    CoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask)
-        : worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {}
+    CoordinateManager(
+        const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask) :
+        worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {}
 
     virtual void perform_harvesting(std::size_t harvesting_mask);
 
@@ -49,14 +49,17 @@ class CoordinateManager {
 
 protected:
     virtual void clear_harvesting_structures();
-    
+
     virtual std::set<std::size_t> get_x_coordinates_to_harvest(std::size_t harvesting_mask);
     virtual std::set<std::size_t> get_y_coordinates_to_harvest(std::size_t harvesting_mask);
 
     virtual void fill_logical_to_physical_mapping(
-        const std::set<size_t>& x_to_harvest, const std::set<size_t>& y_to_harvest,
+        const std::set<size_t>& x_to_harvest,
+        const std::set<size_t>& y_to_harvest,
+        const std::set<size_t>& physical_x_unharvested,
+        const std::set<size_t>& physical_y_unharvested);
+    virtual void fill_logical_to_virtual_mapping(
         const std::set<size_t>& physical_x_unharvested, const std::set<size_t>& physical_y_unharvested);
-    virtual void fill_logical_to_virtual_mapping(const std::set<size_t>& physical_x_unharvested, const std::set<size_t>& physical_y_unharvested);
 
     std::map<std::size_t, std::size_t> physical_y_to_logical_y;
     std::map<std::size_t, std::size_t> physical_x_to_logical_x;
diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp
index 00f82a46..1bbbb291 100644
--- a/device/cpuset_lib.cpp
+++ b/device/cpuset_lib.cpp
@@ -2,17 +2,20 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include "cpuset_lib.hpp"
+
 #include <algorithm>
+#include <filesystem>
+#include <thread>
 
-#include "cpuset_lib.hpp"
 #include "common/logger.hpp"
-#include <thread>
 #include "device/tt_device.h"
-#include <filesystem>
 #include "fmt/core.h"
+
 namespace tt {
 
 namespace fs = std::filesystem;
+
 namespace cpuset {
 
 /////////////////////////////////////////////////////////////////////////
@@ -21,15 +24,18 @@ namespace cpuset {
 
 // Constructor for singleton class cpu id allocator
 tt_cpuset_allocator::tt_cpuset_allocator() {
-
-    m_pid           = getpid();
-    m_debug         = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false;
+    m_pid = getpid();
+    m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false;
 
     // Chicken bit to disable this entire feature for debug/comparison.
     bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? true : false;
 
     auto system_tid = std::this_thread::get_id();
-    log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", m_pid, system_tid);
+    log_debug(
+        LogSiliconDriver,
+        "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}",
+        m_pid,
+        system_tid);
 
     m_enable_cpuset_allocator = true;
 
@@ -38,86 +44,102 @@ tt_cpuset_allocator::tt_cpuset_allocator() {
     m_enable_cpuset_allocator &= init_get_number_of_packages();
     m_enable_cpuset_allocator &= init_find_tt_pci_devices_packages_numanodes();
 
-    if (!cpuset_allocator_enable_env){
+    if (!cpuset_allocator_enable_env) {
         m_enable_cpuset_allocator = false;
-    }else{
-
-        bool is_cpu_supported      = init_is_cpu_model_supported();
+    } else {
+        bool is_cpu_supported = init_is_cpu_model_supported();
 
-        if (is_cpu_supported){
+        if (is_cpu_supported) {
             m_enable_cpuset_allocator &= init_determine_cpuset_allocations();
-        }else{
+        } else {
             m_enable_cpuset_allocator = false;
         }
 
-        log_debug(LogSiliconDriver,"Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} thread_id: {} ", m_enable_cpuset_allocator, m_pid, system_tid);
+        log_debug(
+            LogSiliconDriver,
+            "Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} "
+            "thread_id: {} ",
+            m_enable_cpuset_allocator,
+            m_pid,
+            system_tid);
     }
 }
 
 // Step 1 : Initialize and perform m_topology detection
-bool tt_cpuset_allocator::init_topology_init_and_load(){
-    log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::topology_init_and_load()");
+bool tt_cpuset_allocator::init_topology_init_and_load() {
+    log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::topology_init_and_load()");
 
-    if (!m_enable_cpuset_allocator){
+    if (!m_enable_cpuset_allocator) {
         return false;
     }
 
-    if (hwloc_topology_init(&m_topology)){
+    if (hwloc_topology_init(&m_topology)) {
         log_warning(LogSiliconDriver, "Problem initializing topology");
         return false;
     }
 
-    hwloc_topology_set_type_filter(m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices.
+    hwloc_topology_set_type_filter(
+        m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL);  // Need to find PCI devices.
 
-    if (hwloc_topology_load(m_topology)){
+    if (hwloc_topology_load(m_topology)) {
         log_warning(LogSiliconDriver, "Problem loading topology");
         return false;
     }
 
-    return true; // Success
+    return true;  // Success
 }
 
-// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and numamode.
-bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){
-
-    if (!m_enable_cpuset_allocator){
+// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and
+// numanode.
+bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() {
+    if (!m_enable_cpuset_allocator) {
         return false;
     }
 
-    log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()");
+    log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()");
     m_num_tt_device_by_pci_device_id_map.clear();
 
     hwloc_obj_t pci_device_obj = NULL;
     const std::regex tt_device_re("tenstorrent!([0-9]+)");
 
-    while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))){
-
-        if (hwloc_obj_type_is_io(pci_device_obj->type) && (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) {
-
-            std::pair<uint16_t, uint16_t> device_id_revision = std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision);
+    while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))) {
+        if (hwloc_obj_type_is_io(pci_device_obj->type) &&
+            (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) {
+            std::pair<uint16_t, uint16_t> device_id_revision =
+                std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision);
             m_num_tt_device_by_pci_device_id_map[device_id_revision] += 1;
 
-            std::string pci_bus_id_str  = get_pci_bus_id(pci_device_obj);
+            std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj);
             std::string pci_device_dir = fmt::format("/sys/bus/pci/devices/{}/tenstorrent/", pci_bus_id_str);
             int physical_device_id = -1;
 
-            log_trace(LogSiliconDriver, "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", pci_bus_id_str, m_num_tt_device_by_pci_device_id_map[device_id_revision]);
+            log_trace(
+                LogSiliconDriver,
+                "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}",
+                pci_bus_id_str,
+                m_num_tt_device_by_pci_device_id_map[device_id_revision]);
 
             // First, get the physical_device_id of the device.
-            if (fs::exists(pci_device_dir)){
-                for (const auto &entry : fs::directory_iterator(pci_device_dir)){
+            if (fs::exists(pci_device_dir)) {
+                for (const auto &entry : fs::directory_iterator(pci_device_dir)) {
                     auto entry_str = entry.path().string();
 
-                    if (std::smatch device_match; std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)){
+                    if (std::smatch device_match;
+                        std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)) {
                         physical_device_id = stoi(device_match[1]);
                         m_all_tt_devices.push_back(physical_device_id);
-                        log_debug(LogSiliconDriver, "Found physical_device_id: {} from file: {}", physical_device_id, entry_str);
+                        log_debug(
+                            LogSiliconDriver,
+                            "Found physical_device_id: {} from file: {}",
+                            physical_device_id,
+                            entry_str);
                         break;
                     }
                 }
 
-                if (physical_device_id == -1){
-                    log_warning(LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir);
+                if (physical_device_id == -1) {
+                    log_warning(
+                        LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir);
                     return false;
                 }
 
@@ -125,19 +147,23 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){
 
                 // Next, get the PackageID of the device and update maps.
                 auto package_id = get_package_id_from_device(pci_device_obj, physical_device_id);
-                
-                // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this 
+
+                // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this
                 // package and structures storing the CPU characteristics per package.
                 if (m_package_id_to_devices_map.find(package_id) == m_package_id_to_devices_map.end()) {
                     m_package_id_to_devices_map.insert({package_id, {}});
                     m_package_id_to_num_l3_per_ccx_map.insert({package_id, 0});
                     m_package_id_to_num_ccx_per_ccd_map.insert({package_id, 0});
                 }
-                if (package_id != -1){
+                if (package_id != -1) {
                     m_package_id_to_devices_map.at(package_id).push_back(physical_device_id);
                     m_physical_device_id_to_package_id_map.insert({physical_device_id, package_id});
                 } else {
-                    log_warning(LogSiliconDriver, "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str);
+                    log_warning(
+                        LogSiliconDriver,
+                        "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})",
+                        physical_device_id,
+                        pci_bus_id_str);
                     return false;
                 }
 
@@ -145,378 +171,479 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){
                 auto numa_nodeset = get_numa_nodeset_from_device(pci_device_obj, physical_device_id);
                 m_physical_device_id_to_numa_nodeset_map.insert({physical_device_id, numa_nodeset});
 
-                if (numa_nodeset == 0x0){
-                    log_warning(LogSiliconDriver, "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str);
+                if (numa_nodeset == 0x0) {
+                    log_warning(
+                        LogSiliconDriver,
+                        "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})",
+                        physical_device_id,
+                        pci_bus_id_str);
                     return false;
                 }
 
-                m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector.
+                m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}});  // Empty vector.
                 m_num_cpu_cores_allocated_per_tt_device.insert({physical_device_id, 0});
             }
         }
     }
 
-    if (m_all_tt_devices.size() == 0){
-        log_warning(LogSiliconDriver, "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", TENSTORRENT_VENDOR_ID);
+    if (m_all_tt_devices.size() == 0) {
+        log_warning(
+            LogSiliconDriver,
+            "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}",
+            TENSTORRENT_VENDOR_ID);
         return false;
     }
 
-    log_debug(LogSiliconDriver,"Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", m_all_tt_devices.size());
-
+    log_debug(
+        LogSiliconDriver,
+        "Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices",
+        m_all_tt_devices.size());
 
     // Sort these 2 vectors of device_ids before we are done, since discovery can be in any order.
-    for (auto &p: m_package_id_to_devices_map){
+    for (auto &p : m_package_id_to_devices_map) {
         std::sort(p.second.begin(), p.second.end());
     }
 
     std::sort(m_all_tt_devices.begin(), m_all_tt_devices.end());
 
-    return true; // Success
+    return true;  // Success
 }
 
-
 // Step 3 : Detect the number of packages.
-bool tt_cpuset_allocator::init_get_number_of_packages(){
-
-    if (!m_enable_cpuset_allocator){
+bool tt_cpuset_allocator::init_get_number_of_packages() {
+    if (!m_enable_cpuset_allocator) {
         return false;
     }
 
     m_num_packages = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_PACKAGE);
-    log_debug(LogSiliconDriver,"Found {} CPU packages", m_num_packages);
-    return m_num_packages > 0; // Success
+    log_debug(LogSiliconDriver, "Found {} CPU packages", m_num_packages);
+    return m_num_packages > 0;  // Success
 }
 
 // Step 4 : Return true if all packages are models we want to support. Env-var can be used to ignore this check.
-bool tt_cpuset_allocator::init_is_cpu_model_supported(){
-
-    if (!m_enable_cpuset_allocator){
+bool tt_cpuset_allocator::init_is_cpu_model_supported() {
+    if (!m_enable_cpuset_allocator) {
         return false;
     }
 
-    if (m_num_packages == 0){
-        log_debug(LogSiliconDriver,"init_is_cpu_model_supported(): Found 0 packages, functions run out of order?");
+    if (m_num_packages == 0) {
+        log_debug(LogSiliconDriver, "init_is_cpu_model_supported(): Found 0 packages, functions run out of order?");
         return false;
     }
 
     bool use_any_cpu = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SUPPORT_ANY_CPU") ? true : false;
 
-    log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::check_if_cpu_model_supported()");
+    log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::check_if_cpu_model_supported()");
 
     // Supported CPU Models for enabling CPUSET Allocator.  Keep the list small to production machines to start.
-    std::vector<std::string> supported_cpu_models = {   "AMD EPYC 7352 24-Core Processor",
-                                                        "AMD EPYC 7532 32-Core Processor"};
+    std::vector<std::string> supported_cpu_models = {
+        "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"};
 
     // CPU Models that have L3 per CCX and 2 CCX per CCD
-    std::vector<std::string> opt_2ccx_per_ccd_cpu_models = {    "AMD EPYC 7352 24-Core Processor",
-                                                                "AMD EPYC 7532 32-Core Processor"};
-    for(const auto& package: m_package_id_to_devices_map) {
+    std::vector<std::string> opt_2ccx_per_ccd_cpu_models = {
+        "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"};
+    for (const auto &package : m_package_id_to_devices_map) {
         int package_id = package.first;
         auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id);
-        if (m_debug) print_hwloc_object(package_obj, 0, true, true);
+        if (m_debug) {
+            print_hwloc_object(package_obj, 0, true, true);
+        }
 
         std::string pkg_cpu_model = hwloc_obj_get_info_by_name(package_obj, "CPUModel");
 
         // First find out if this CPU is supported by CPUSET Allocator at all.
         bool has_supported_cpu = use_any_cpu ? true : false;
 
-        for (auto &supported_cpu_model : supported_cpu_models){
+        for (auto &supported_cpu_model : supported_cpu_models) {
             has_supported_cpu |= (pkg_cpu_model.find(supported_cpu_model) != std::string::npos);
         }
 
-        log_debug(LogSiliconDriver,"Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", package_id, has_supported_cpu, pkg_cpu_model);
+        log_debug(
+            LogSiliconDriver,
+            "Detected package-id: {} has_supported_cpu: {} for CpuModel: {}",
+            package_id,
+            has_supported_cpu,
+            pkg_cpu_model);
 
-        if (!has_supported_cpu){
+        if (!has_supported_cpu) {
             return false;
         }
 
         // Then, determine if the 2CCX-PER-CCD optimization can be enabled for this CPU Model in the package.
-        for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models){
-            if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos){
+        for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models) {
+            if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos) {
                 m_package_id_to_num_l3_per_ccx_map.at(package_id) = 1;
                 m_package_id_to_num_ccx_per_ccd_map.at(package_id) = 2;
             }
         }
     }
 
-    return true; // Successhwloc
+    return true;  // Success
 }
 
-
-// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given socket/package.
-bool tt_cpuset_allocator::init_determine_cpuset_allocations(){
-
-    if (!m_enable_cpuset_allocator){
+// Step 5: Get all target allocation objects (i.e. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given
+// socket/package.
+bool tt_cpuset_allocator::init_determine_cpuset_allocations() {
+    if (!m_enable_cpuset_allocator) {
         return false;
     }
 
-    log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::init_determine_cpuset_allocations()");
-    for (const auto& package : m_package_id_to_devices_map) {
+    log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::init_determine_cpuset_allocations()");
+    for (const auto &package : m_package_id_to_devices_map) {
         int package_id = package.first;
         auto num_tt_devices_for_cpu_package = package.second.size();
 
-        if (num_tt_devices_for_cpu_package == 0){
-            log_debug(LogSiliconDriver, "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", package_id);
+        if (num_tt_devices_for_cpu_package == 0) {
+            log_debug(
+                LogSiliconDriver,
+                "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.",
+                package_id);
             continue;
         }
 
-        log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ", package_id);
+        log_debug(
+            LogSiliconDriver,
+            "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ",
+            package_id);
 
         auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id);
-        if (m_debug) print_hwloc_object(package_obj, 0, true, true);
+        if (m_debug) {
+            print_hwloc_object(package_obj, 0, true, true);
+        }
 
-        auto num_alloc_slots_in_package = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot);
-        if (num_alloc_slots_in_package == 0){
-            log_warning(LogSiliconDriver, "Could not find any of the alloc objects in package_id: {} for this cpu arc", package_id);
+        auto num_alloc_slots_in_package =
+            hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot);
+        if (num_alloc_slots_in_package == 0) {
+            log_warning(
+                LogSiliconDriver,
+                "Could not find any of the alloc objects in package_id: {} for this cpu arc",
+                package_id);
             return false;
         }
         auto num_alloc_slots_per_tt_device = num_alloc_slots_in_package / num_tt_devices_for_cpu_package;
 
         // Above splits evenly by devices, leaves remainder unused in the example case of 3 devices but 8 slots.
-        log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}",
-            package_id, num_alloc_slots_in_package, num_tt_devices_for_cpu_package, num_alloc_slots_per_tt_device);
+        log_debug(
+            LogSiliconDriver,
+            "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} "
+            "num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}",
+            package_id,
+            num_alloc_slots_in_package,
+            num_tt_devices_for_cpu_package,
+            num_alloc_slots_per_tt_device);
 
         int device_idx = 0;
 
-        for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++){
+        for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++) {
+            auto obj = hwloc_get_obj_below_by_type(
+                m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx);
 
-            auto obj = hwloc_get_obj_below_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx);
-
-            if (obj){
-                if (m_debug) print_hwloc_object(obj, 1, true);
+            if (obj) {
+                if (m_debug) {
+                    print_hwloc_object(obj, 1, true);
+                }
 
                 auto physical_device_id = m_package_id_to_devices_map.at(package_id).at(device_idx);
 
                 // Hack for maximum number of slots per device.
                 // if (m_physical_device_id_to_cpusets_map.at(physical_device_id).size() < 2){
                 m_physical_device_id_to_cpusets_map.at(physical_device_id).push_back(obj->cpuset);
-                int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology,obj->cpuset,HWLOC_OBJ_CORE);
+                int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, obj->cpuset, HWLOC_OBJ_CORE);
                 m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) += num_cpus;
                 // }
 
                 // We're distributing allocation objects per package across TT devices, so switch to next one.
-                if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0){
-                    device_idx = (device_idx + 1) % num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to first device for that package.
+                if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0) {
+                    device_idx = (device_idx + 1) %
+                                 num_tt_devices_for_cpu_package;  // Loop around if extra slots remain. Assigned to
+                                                                  // first device for that package.
                 }
 
-            }else{
-                log_warning(LogSiliconDriver, "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under package");
+            } else {
+                log_warning(
+                    LogSiliconDriver,
+                    "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under "
+                    "package");
                 return false;
             }
         }
 
-        log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", package_id);
+        log_debug(
+            LogSiliconDriver,
+            "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ",
+            package_id);
     }
 
-
     // Summary for Debug purposes.
-    for (auto &physical_device_id : m_all_tt_devices){
-        for (size_t device_alloc_idx=0; device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); device_alloc_idx++){
+    for (auto &physical_device_id : m_all_tt_devices) {
+        for (size_t device_alloc_idx = 0;
+             device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size();
+             device_alloc_idx++) {
             auto cpuset = m_physical_device_id_to_cpusets_map.at(physical_device_id).at(device_alloc_idx);
             auto pu_ids_vector = get_hwloc_bitmap_vector(cpuset);
             auto num_pu_ids = pu_ids_vector.size();
             auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id);
-            log_debug(LogSiliconDriver, "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} device_alloc_idx: {} picked {} PU's {}", physical_device_id, package_id, device_alloc_idx, num_pu_ids, pu_ids_vector);
+            log_debug(
+                LogSiliconDriver,
+                "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} "
+                "device_alloc_idx: {} picked {} PU's {}",
+                physical_device_id,
+                package_id,
+                device_alloc_idx,
+                num_pu_ids,
+                pu_ids_vector);
         }
     }
 
-    return true; // Success
-
+    return true;  // Success
 }
 
 /////////////////////////////////////////////////////////////////////////
 // Runtime Functions ////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////
 
-// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it.
-bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){
-
+// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously
+// allocated memory region to it.
+bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) {
     auto tid = std::this_thread::get_id();
-    log_debug(LogSiliconDriver,"bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: {} (pid: {} tid: {})", physical_device_id, m_pid, tid);
-
-    if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0){
-        log_fatal("bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not expected.", physical_device_id);
+    log_debug(
+        LogSiliconDriver,
+        "bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: "
+        "{} (pid: {} tid: {})",
+        physical_device_id,
+        m_pid,
+        tid);
+
+    if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0) {
+        log_fatal(
+            "bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not "
+            "expected.",
+            physical_device_id);
         return false;
     }
 
     auto target_nodeset = m_physical_device_id_to_numa_nodeset_map.at(physical_device_id);
 
-    if (target_nodeset != 0){
-        if (hwloc_set_area_membind(m_topology, addr, len, target_nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE) ){
-            log_warning(LogSiliconDriver,"hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} tid: {})", 
-                physical_device_id, get_hwloc_bitmap_vector(target_nodeset), strerror(errno), m_pid, tid);
+    if (target_nodeset != 0) {
+        if (hwloc_set_area_membind(
+                m_topology,
+                addr,
+                len,
+                target_nodeset,
+                HWLOC_MEMBIND_BIND,
+                HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE)) {
+            log_warning(
+                LogSiliconDriver,
+                "hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} "
+                "tid: {})",
+                physical_device_id,
+                get_hwloc_bitmap_vector(target_nodeset),
+                strerror(errno),
+                m_pid,
+                tid);
             return false;
-        }else{
-            log_debug(LogSiliconDriver,"hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", physical_device_id, get_hwloc_bitmap_vector(target_nodeset), m_pid, tid);
+        } else {
+            log_debug(
+                LogSiliconDriver,
+                "hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})",
+                physical_device_id,
+                get_hwloc_bitmap_vector(target_nodeset),
+                m_pid,
+                tid);
         }
-    }else{
-        log_warning(LogSiliconDriver,"bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. Skipping membind.", physical_device_id);
+    } else {
+        log_warning(
+            LogSiliconDriver,
+            "bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. "
+            "Skipping membind.",
+            physical_device_id);
         return false;
     }
 
-    return true; // Success
+    return true;  // Success
 }
 
 int tt_cpuset_allocator::_get_num_tt_pci_devices() {
-
     for (auto &d : m_physical_device_id_to_package_id_map) {
         log_trace(LogSiliconDriver, "Found physical_device_id: {} ", d.first);
     }
     return m_physical_device_id_to_package_id_map.size();
 }
 
-
-
-
 /////////////////////////////////////////////////////////////////////////
-//Helper Functions //////////////////////////////////////////////////////
+// Helper Functions //////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////
 
-
-std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj){
-
+std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj) {
     std::string pci_bus_id_str = "";
 
-    if (hwloc_obj_type_is_io(pci_device_obj->type)) {        
+    if (hwloc_obj_type_is_io(pci_device_obj->type)) {
         auto attrs = pci_device_obj->attr->pcidev;
         pci_bus_id_str = fmt::format("{:04x}:{:02x}:{:02x}.{:01x}", attrs.domain, attrs.bus, attrs.dev, attrs.func);
     }
 
     return pci_bus_id_str;
-
 }
 
-int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){
-
+int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) {
     auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id);
 
-    log_debug(LogSiliconDriver, "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", physical_device_id, pci_bus_id_str);
+    log_debug(
+        LogSiliconDriver,
+        "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package",
+        physical_device_id,
+        pci_bus_id_str);
 
     hwloc_obj_t tmp_obj = hwloc_get_non_io_ancestor_obj(m_topology, pci_device_obj);
     int package_id = -1;
 
     // Keep going up until package/machine hierarchy is found, in case we don't find it right away.
-    while (package_id == -1){
-
-        if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)){
-            if (tmp_obj->os_index != (unsigned) -1){
+    while (package_id == -1) {
+        if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) ||
+            (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)) {
+            if (tmp_obj->os_index != (unsigned)-1) {
                 package_id = tmp_obj->os_index;
-            }else{
-                log_warning(LogSiliconDriver, "Could not find os_index of package or machine object for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str);
+            } else {
+                log_warning(
+                    LogSiliconDriver,
+                    "Could not find os_index of package or machine object for TT device (physical_device_id: {} "
+                    "pci_bus_id: {})",
+                    physical_device_id,
+                    pci_bus_id_str);
                 break;
             }
-        }else{
-            if (tmp_obj->parent){
+        } else {
+            if (tmp_obj->parent) {
                 tmp_obj = tmp_obj->parent;
-            }else{
+            } else {
                 break;
             }
         }
     }
 
-    if (m_debug) print_hwloc_object(pci_device_obj, 1, true, true);
-    if (m_debug) print_hwloc_object(tmp_obj, 1, true, true);
+    if (m_debug) {
+        print_hwloc_object(pci_device_obj, 1, true, true);
+    }
+    if (m_debug) {
+        print_hwloc_object(tmp_obj, 1, true, true);
+    }
 
     return package_id;
 }
 
-hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){
-
+hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(
+    hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) {
     hwloc_nodeset_t nodeset = 0x0;
 
     // Currently an issue in non-EPYC machines where PCI devices are directly under Machine, and not any NumaNodes.
     // As quick workaround, skip this if there is only single numanode since returning 1 seems fine.
-    if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1){
+    if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1) {
         auto numanode = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, 0);
         return numanode->nodeset;
     }
 
     auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id);
 
-    log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding NumaNode.", physical_device_id, pci_bus_id_str);
+    log_debug(
+        LogSiliconDriver,
+        "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's "
+        "corresponding NumaNode.",
+        physical_device_id,
+        pci_bus_id_str);
 
     hwloc_obj_t tmp_obj = pci_device_obj->parent;
-    while (tmp_obj && !tmp_obj->memory_arity){
+    while (tmp_obj && !tmp_obj->memory_arity) {
         tmp_obj = tmp_obj->parent; /* no memory child, walk up */
     }
 
-    if (tmp_obj && tmp_obj->nodeset){
-        log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found NumaNodeSet: {}", physical_device_id, pci_bus_id_str, get_hwloc_bitmap_vector(tmp_obj->nodeset));
+    if (tmp_obj && tmp_obj->nodeset) {
+        log_debug(
+            LogSiliconDriver,
+            "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found "
+            "NumaNodeSet: {}",
+            physical_device_id,
+            pci_bus_id_str,
+            get_hwloc_bitmap_vector(tmp_obj->nodeset));
         nodeset = tmp_obj->nodeset;
-    }else{
-        log_warning(LogSiliconDriver, "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str);
+    } else {
+        log_warning(
+            LogSiliconDriver,
+            "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} "
+            "pci_bus_id: {})",
+            physical_device_id,
+            pci_bus_id_str);
     }
 
     return nodeset;
-
 }
 
 int tt_cpuset_allocator::_get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision) {
-
     std::pair<uint16_t, uint16_t> device_id_revision = std::make_pair(device_id, revision);
 
     if (m_num_tt_device_by_pci_device_id_map.find(device_id_revision) != m_num_tt_device_by_pci_device_id_map.end()) {
         return m_num_tt_device_by_pci_device_id_map.at(device_id_revision);
     } else {
-        log_warning(LogSiliconDriver, "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", device_id, revision);
+        log_warning(
+            LogSiliconDriver,
+            "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.",
+            device_id,
+            revision);
         return 0;
     }
 }
 
 /////////////////////////////////////////////////////////////////////////
-//Debug Functions ///////////////////////////////////////////////////////
+// Debug Functions ///////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////
 
 // Get all PU ids (or numa nodes) in a vector, for legacy/back-compat/debug purposes.
-std::vector<int> tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap){
-
+std::vector<int> tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap) {
     std::vector<int> indices;
     int index;
-    if (bitmap){
-        hwloc_bitmap_foreach_begin(index, bitmap)
-            indices.push_back(index);
+    if (bitmap) {
+        hwloc_bitmap_foreach_begin(index, bitmap) indices.push_back(index);
         hwloc_bitmap_foreach_end();
     }
     return indices;
 }
 
-std::vector<int> tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj){
+std::vector<int> tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj) {
     return get_hwloc_bitmap_vector(obj->cpuset);
 }
 
-std::vector<int> tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj){
+std::vector<int> tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj) {
     return get_hwloc_bitmap_vector(obj->nodeset);
 }
 
-
 // Nicer way to print pu ids as a vector on single line.
-void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj){
+void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj) {
     std::cout << " Number: " << hwloc_bitmap_weight(obj->cpuset) << " cpuset_pu_ids: " << get_hwloc_cpuset_vector(obj);
 }
 
-void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj){
-    std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj);
+void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj) {
+    std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset)
+              << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj);
 }
 
-void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids){
-
+void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids) {
     char type[32], attr[1024];
 
     hwloc_obj_type_snprintf(type, sizeof(type), obj, verbose);
-    printf("%*s%s", 2*depth, "", type);
-    if (obj->os_index != (unsigned) -1)
+    printf("%*s%s", 2 * depth, "", type);
+    if (obj->os_index != (unsigned)-1) {
         printf("#%u", obj->os_index);
+    }
     hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", verbose);
 
-    if (*attr)
+    if (*attr) {
         printf("(%s)", attr);
-    if (show_cpuids && obj->cpuset)
+    }
+    if (show_cpuids && obj->cpuset) {
         print_hwloc_cpuset(obj);
+    }
 
     printf("\n");
 }
 
-
 }  // namespace cpuset
 }  // namespace tt
-
diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp
index a14a4f33..46994833 100644
--- a/device/cpuset_lib.hpp
+++ b/device/cpuset_lib.hpp
@@ -4,18 +4,17 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-
 #pragma once
 
+#include <unistd.h>
+
 #include <map>
-#include <vector>
-#include <string>
 #include <mutex>
+#include <string>
 #include <thread>
-#include <unistd.h>
-
-#include "device/tt_cluster_descriptor.h" // For chip_id_t
+#include <vector>
 
+#include "device/tt_cluster_descriptor.h"  // For chip_id_t
 #include "hwloc.h"
 
 using tt_cluster_description = tt_ClusterDescriptor;
@@ -27,90 +26,87 @@ namespace cpuset {
 // CPU ID allocator for pinning threads to cpu_ids
 // It's a singleton that should be retrieved via get()
 struct tt_cpuset_allocator {
-    public:
-
-        tt_cpuset_allocator(tt_cpuset_allocator const&)     = delete;
-        void operator=(tt_cpuset_allocator const&)          = delete;
-
-        // Bind an already allocated memory region to particular numa nodes
-        static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){
-            auto& instance = tt_cpuset_allocator::get();
-            return instance.bind_area_memory_nodeset(physical_device_id, addr, len);
-        }
-
-        static int get_num_tt_pci_devices(){
-            auto& instance = tt_cpuset_allocator::get();
-            return instance._get_num_tt_pci_devices();
-        }
-
-        static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id){
-            auto& instance = tt_cpuset_allocator::get();
-            return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
-        }
-
-    private:
-
-        static tt_cpuset_allocator& get() {
-            static tt_cpuset_allocator instance;
-            return instance;
-        }
-
-        tt_cpuset_allocator();
-
-        int TENSTORRENT_VENDOR_ID = 0x1e52;
-
-        bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len);
-        int _get_num_tt_pci_devices();
-        int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id);
-
-        // Series of init functions, must be called in this order. Seperated out to support
-        // early exit in case of errors.
-        bool init_topology_init_and_load();
-        bool init_find_tt_pci_devices_packages_numanodes();
-        bool init_get_number_of_packages();
-        bool init_is_cpu_model_supported();
-        bool init_determine_cpuset_allocations();
-
-        // Helper Functions
-        std::string get_pci_bus_id(hwloc_obj_t pci_device_obj);
-        int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id);
-        hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id);
-
-        // Debug Functions
-        void print_hwloc_cpuset(hwloc_obj_t &obj);
-        void print_hwloc_nodeset(hwloc_obj_t &obj);
-        void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true);
-        std::vector<int> get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap);
-        std::vector<int> get_hwloc_cpuset_vector(hwloc_obj_t &obj);
-        std::vector<int> get_hwloc_nodeset_vector(hwloc_obj_t &obj);
-        hwloc_topology_t m_topology;
-        bool m_debug;
-        pid_t m_pid;
-
-        // Items calculated by parsing system info, used by allocation algorithm:
-        std::map<int, std::vector<int>> m_package_id_to_devices_map;
-        std::map<int, std::string> m_physical_device_id_to_pci_bus_id_map; // Debug/Info
-        std::map<std::pair<uint16_t, uint16_t>, int> m_num_tt_device_by_pci_device_id_map;
-
-        std::map<chip_id_t, std::vector<hwloc_cpuset_t>> m_physical_device_id_to_cpusets_map;
-        std::map<chip_id_t, int> m_physical_device_id_to_package_id_map;
-
-        bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing.
-        int m_num_packages = 0;
-        std::vector<int> m_all_tt_devices = {};
-
-        hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default
+public:
+    tt_cpuset_allocator(tt_cpuset_allocator const &) = delete;
+    void operator=(tt_cpuset_allocator const &) = delete;
+
+    // Bind an already allocated memory region to particular numa nodes
+    static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) {
+        auto &instance = tt_cpuset_allocator::get();
+        return instance.bind_area_memory_nodeset(physical_device_id, addr, len);
+    }
 
-        // For 2CCX-PER-CCD Optimization detection.
-        std::map<int, int> m_package_id_to_num_l3_per_ccx_map;
-        std::map<int, int> m_package_id_to_num_ccx_per_ccd_map;
+    static int get_num_tt_pci_devices() {
+        auto &instance = tt_cpuset_allocator::get();
+        return instance._get_num_tt_pci_devices();
+    }
 
-        // Memory Binding
-        std::map<chip_id_t, hwloc_nodeset_t> m_physical_device_id_to_numa_nodeset_map;
+    static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id) {
+        auto &instance = tt_cpuset_allocator::get();
+        return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
+    }
 
-        // Helper for some dynamic multi-threading.
-        std::map<chip_id_t, int> m_num_cpu_cores_allocated_per_tt_device;
+private:
+    static tt_cpuset_allocator &get() {
+        static tt_cpuset_allocator instance;
+        return instance;
+    }
 
+    tt_cpuset_allocator();
+
+    int TENSTORRENT_VENDOR_ID = 0x1e52;
+
+    bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len);
+    int _get_num_tt_pci_devices();
+    int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id);
+
+    // Series of init functions, must be called in this order. Separated out to support
+    // early exit in case of errors.
+    bool init_topology_init_and_load();
+    bool init_find_tt_pci_devices_packages_numanodes();
+    bool init_get_number_of_packages();
+    bool init_is_cpu_model_supported();
+    bool init_determine_cpuset_allocations();
+
+    // Helper Functions
+    std::string get_pci_bus_id(hwloc_obj_t pci_device_obj);
+    int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id);
+    hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id);
+
+    // Debug Functions
+    void print_hwloc_cpuset(hwloc_obj_t &obj);
+    void print_hwloc_nodeset(hwloc_obj_t &obj);
+    void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true);
+    std::vector<int> get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap);
+    std::vector<int> get_hwloc_cpuset_vector(hwloc_obj_t &obj);
+    std::vector<int> get_hwloc_nodeset_vector(hwloc_obj_t &obj);
+    hwloc_topology_t m_topology;
+    bool m_debug;
+    pid_t m_pid;
+
+    // Items calculated by parsing system info, used by allocation algorithm:
+    std::map<int, std::vector<int>> m_package_id_to_devices_map;
+    std::map<int, std::string> m_physical_device_id_to_pci_bus_id_map;  // Debug/Info
+    std::map<std::pair<uint16_t, uint16_t>, int> m_num_tt_device_by_pci_device_id_map;
+
+    std::map<chip_id_t, std::vector<hwloc_cpuset_t>> m_physical_device_id_to_cpusets_map;
+    std::map<chip_id_t, int> m_physical_device_id_to_package_id_map;
+
+    bool m_enable_cpuset_allocator = true;  // Enable feature, otherwise do nothing.
+    int m_num_packages = 0;
+    std::vector<int> m_all_tt_devices = {};
+
+    hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE;  // Default
+
+    // For 2CCX-PER-CCD Optimization detection.
+    std::map<int, int> m_package_id_to_num_l3_per_ccx_map;
+    std::map<int, int> m_package_id_to_num_ccx_per_ccd_map;
+
+    // Memory Binding
+    std::map<chip_id_t, hwloc_nodeset_t> m_physical_device_id_to_numa_nodeset_map;
+
+    // Helper for some dynamic multi-threading.
+    std::map<chip_id_t, int> m_num_cpu_cores_allocated_per_tt_device;
 };
 
 template <typename T>
diff --git a/device/device_api_metal.h b/device/device_api_metal.h
index 0fc7820c..c148a71a 100644
--- a/device/device_api_metal.h
+++ b/device/device_api_metal.h
@@ -5,5 +5,5 @@
  */
 
 #pragma once
-#include "device/tt_device.h"
 #include "device/driver_atomics.h"
+#include "device/tt_device.h"
diff --git a/device/driver_atomics.h b/device/driver_atomics.h
index cbf4f6c7..6ed52416 100644
--- a/device/driver_atomics.h
+++ b/device/driver_atomics.h
@@ -12,54 +12,44 @@ namespace tt_driver_atomics {
 
 #if defined(__x86_64__) || defined(__i386__)
 // Store-Any barrier.
-static inline __attribute__((always_inline)) void sfence() {
-    _mm_sfence();
-}
+static inline __attribute__((always_inline)) void sfence() { _mm_sfence(); }
+
 // Load-Any barrier.
-static inline __attribute__((always_inline)) void lfence() {
-    _mm_lfence();
-}
+static inline __attribute__((always_inline)) void lfence() { _mm_lfence(); }
+
 // Any-Any barrier.
-static inline __attribute__((always_inline)) void mfence() {
-    _mm_mfence();
-}
+static inline __attribute__((always_inline)) void mfence() { _mm_mfence(); }
 
 #elif defined(__ARM_ARCH)
 
 static inline __attribute__((always_inline)) void sfence() {
     // Full memory barrier (full system). ARM does not have a Store-Any barrier.
     // https://developer.arm.com/documentation/100941/0101/Barriers
-    asm volatile ("DMB SY" : : : "memory");
+    asm volatile("DMB SY" : : : "memory");
 }
 
 static inline __attribute__((always_inline)) void lfence() {
     // Load-Any barrier (full system)
     // https://developer.arm.com/documentation/100941/0101/Barriers
-    asm volatile ("DMB LD" : : : "memory");
+    asm volatile("DMB LD" : : : "memory");
 }
 
 static inline __attribute__((always_inline)) void mfence() {
     // Full memory barrier (full system).
     // https://developer.arm.com/documentation/100941/0101/Barriers
-    asm volatile ("DMB SY" : : : "memory");
+    asm volatile("DMB SY" : : : "memory");
 }
 
 #elif defined(__riscv)
 
-static inline __attribute__((always_inline)) void sfence() {
-    asm volatile ("fence ow, ow" : : : "memory");
-}
+static inline __attribute__((always_inline)) void sfence() { asm volatile("fence ow, ow" : : : "memory"); }
 
-static inline __attribute__((always_inline)) void lfence() {
-    asm volatile ("fence ir, ir" : : : "memory");
-}
+static inline __attribute__((always_inline)) void lfence() { asm volatile("fence ir, ir" : : : "memory"); }
 
-static inline __attribute__((always_inline)) void mfence() {
-    asm volatile ("fence iorw, iorw" : : : "memory");
-}
+static inline __attribute__((always_inline)) void mfence() { asm volatile("fence iorw, iorw" : : : "memory"); }
 
 #else
 #error "Unsupported architecture"
 #endif
 
-} // namespace tt_driver_atomics
\ No newline at end of file
+}  // namespace tt_driver_atomics
\ No newline at end of file
diff --git a/device/grayskull/grayskull_coordinate_manager.h b/device/grayskull/grayskull_coordinate_manager.h
index f7f6720c..5be371cd 100644
--- a/device/grayskull/grayskull_coordinate_manager.h
+++ b/device/grayskull/grayskull_coordinate_manager.h
@@ -9,8 +9,8 @@
 #include "device/coordinate_manager.h"
 
 class GrayskullCoordinateManager : public CoordinateManager {
-
 public:
-    GrayskullCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask)
-        : CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
+    GrayskullCoordinateManager(
+        const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask) :
+        CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
 };
\ No newline at end of file
diff --git a/device/grayskull/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp
index 2b94d187..b14029ca 100644
--- a/device/grayskull/grayskull_implementation.cpp
+++ b/device/grayskull/grayskull_implementation.cpp
@@ -4,9 +4,8 @@
 
 #include "grayskull_implementation.h"
 
-#include "src/firmware/riscv/grayskull/host_mem_address_map.h"
-
 #include "device/tt_device.h"
+#include "src/firmware/riscv/grayskull/host_mem_address_map.h"
 
 namespace tt::umd {
 
@@ -86,7 +85,9 @@ std::pair<std::uint64_t, std::uint64_t> grayskull_implementation::get_tlb_data(
 }
 
 tt_driver_host_address_params grayskull_implementation::get_host_address_params() const {
-    return {::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
+    return {
+        ::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE,
+        ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
 }
 
 }  // namespace tt::umd
diff --git a/device/grayskull/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h
index 35b4c78b..610361f2 100644
--- a/device/grayskull/grayskull_implementation.h
+++ b/device/grayskull/grayskull_implementation.h
@@ -104,7 +104,8 @@ enum class arc_message_type {
 };
 
 // DEVICE_DATA
-static const std::array<xy_pair, 8> DRAM_LOCATIONS = {{{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}};
+static const std::array<xy_pair, 8> DRAM_LOCATIONS = {
+    {{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}};
 static const std::array<xy_pair, 1> ARC_LOCATIONS = {{{0, 2}}};
 static const std::array<xy_pair, 1> PCI_LOCATIONS = {{{0, 4}}};
 static const std::array<xy_pair, 0> ETH_LOCATIONS = {};
@@ -134,7 +135,8 @@ static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000;
 static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 8;
 
 static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024;
-static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES);
+static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR =
+    STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES);
 static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M;
 
 static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024;
@@ -171,59 +173,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;
 }  // namespace grayskull
 
 class grayskull_implementation : public architecture_implementation {
-   public:
+public:
     tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; }
+
     uint32_t get_arc_message_arc_get_harvesting() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::ARC_GET_HARVESTING);
     }
+
     uint32_t get_arc_message_arc_go_busy() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::ARC_GO_BUSY);
     }
+
     uint32_t get_arc_message_arc_go_long_idle() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::ARC_GO_LONG_IDLE);
     }
+
     uint32_t get_arc_message_arc_go_short_idle() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::ARC_GO_SHORT_IDLE);
     }
+
     uint32_t get_arc_message_deassert_riscv_reset() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::DEASSERT_RISCV_RESET);
     }
+
     uint32_t get_arc_message_get_aiclk() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::GET_AICLK);
     }
+
     uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override {
         return static_cast<uint32_t>(grayskull::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER);
     }
+
     uint32_t get_arc_message_test() const override { return static_cast<uint32_t>(grayskull::arc_message_type::TEST); }
+
     uint32_t get_arc_csm_mailbox_offset() const override { return grayskull::ARC_CSM_MAILBOX_OFFSET; }
+
     uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return grayskull::ARC_RESET_ARC_MISC_CNTL_OFFSET; }
+
     uint32_t get_arc_reset_scratch_offset() const override { return grayskull::ARC_RESET_SCRATCH_OFFSET; }
+
     uint32_t get_dram_channel_0_peer2peer_region_start() const override {
         return grayskull::DRAM_CHANNEL_0_PEER2PEER_REGION_START;
     }
+
     uint32_t get_dram_channel_0_x() const override { return grayskull::DRAM_CHANNEL_0_X; }
+
     uint32_t get_dram_channel_0_y() const override { return grayskull::DRAM_CHANNEL_0_Y; }
+
     uint32_t get_broadcast_tlb_index() const override { return grayskull::BROADCAST_TLB_INDEX; }
+
     uint32_t get_dynamic_tlb_2m_base() const override { return grayskull::DYNAMIC_TLB_2M_BASE; }
+
     uint32_t get_dynamic_tlb_2m_size() const override { return grayskull::DYNAMIC_TLB_2M_SIZE; }
+
     uint32_t get_dynamic_tlb_16m_base() const override { return grayskull::DYNAMIC_TLB_16M_BASE; }
+
     uint32_t get_dynamic_tlb_16m_size() const override { return grayskull::DYNAMIC_TLB_16M_SIZE; }
+
     uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return grayskull::DYNAMIC_TLB_16M_CFG_ADDR; }
+
     uint32_t get_mem_large_read_tlb() const override { return grayskull::MEM_LARGE_READ_TLB; }
+
     uint32_t get_mem_large_write_tlb() const override { return grayskull::MEM_LARGE_WRITE_TLB; }
+
     uint32_t get_static_tlb_cfg_addr() const override { return grayskull::STATIC_TLB_CFG_ADDR; }
+
     uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; }
+
     uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; }
+
     uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; }
+
     uint32_t get_tensix_soft_reset_addr() const override { return grayskull::TENSIX_SOFT_RESET_ADDR; }
+
     uint32_t get_grid_size_x() const override { return grayskull::GRID_SIZE_X; }
+
     uint32_t get_grid_size_y() const override { return grayskull::GRID_SIZE_Y; }
+
     uint32_t get_tlb_cfg_reg_size_bytes() const override { return grayskull::TLB_CFG_REG_SIZE_BYTES; }
+
     uint32_t get_small_read_write_tlb() const override { return grayskull::MEM_SMALL_READ_WRITE_TLB; }
+
     const std::vector<uint32_t>& get_harvesting_noc_locations() const override {
         return grayskull::HARVESTING_NOC_LOCATIONS;
     }
+
     const std::vector<uint32_t>& get_t6_x_locations() const override { return grayskull::T6_X_LOCATIONS; }
+
     const std::vector<uint32_t>& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; }
 
     std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
@@ -232,7 +268,6 @@ class grayskull_implementation : public architecture_implementation {
     std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
 
     tt_driver_host_address_params get_host_address_params() const override;
-
 };
 
 }  // namespace tt::umd
diff --git a/device/ioctl.h b/device/ioctl.h
index 60ec7b2f..a2e04980 100644
--- a/device/ioctl.h
+++ b/device/ioctl.h
@@ -7,151 +7,149 @@
 #ifndef TTDRIVER_IOCTL_H_INCLUDED
 #define TTDRIVER_IOCTL_H_INCLUDED
 
-#include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/types.h>
 
 #define TENSTORRENT_DRIVER_VERSION 1
 
 #define TENSTORRENT_IOCTL_MAGIC 0xFA
 
-#define TENSTORRENT_IOCTL_GET_DEVICE_INFO	_IO(TENSTORRENT_IOCTL_MAGIC, 0)
-#define TENSTORRENT_IOCTL_GET_HARVESTING	_IO(TENSTORRENT_IOCTL_MAGIC, 1)
-#define TENSTORRENT_IOCTL_QUERY_MAPPINGS	_IO(TENSTORRENT_IOCTL_MAGIC, 2)
-#define TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF	_IO(TENSTORRENT_IOCTL_MAGIC, 3)
-#define TENSTORRENT_IOCTL_FREE_DMA_BUF		_IO(TENSTORRENT_IOCTL_MAGIC, 4)
-#define TENSTORRENT_IOCTL_GET_DRIVER_INFO	_IO(TENSTORRENT_IOCTL_MAGIC, 5)
-#define TENSTORRENT_IOCTL_RESET_DEVICE		_IO(TENSTORRENT_IOCTL_MAGIC, 6)
-#define TENSTORRENT_IOCTL_PIN_PAGES		_IO(TENSTORRENT_IOCTL_MAGIC, 7)
+#define TENSTORRENT_IOCTL_GET_DEVICE_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 0)
+#define TENSTORRENT_IOCTL_GET_HARVESTING _IO(TENSTORRENT_IOCTL_MAGIC, 1)
+#define TENSTORRENT_IOCTL_QUERY_MAPPINGS _IO(TENSTORRENT_IOCTL_MAGIC, 2)
+#define TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 3)
+#define TENSTORRENT_IOCTL_FREE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 4)
+#define TENSTORRENT_IOCTL_GET_DRIVER_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 5)
+#define TENSTORRENT_IOCTL_RESET_DEVICE _IO(TENSTORRENT_IOCTL_MAGIC, 6)
+#define TENSTORRENT_IOCTL_PIN_PAGES _IO(TENSTORRENT_IOCTL_MAGIC, 7)
 
 // For tenstorrent_mapping.mapping_id. These are not array indices.
-#define TENSTORRENT_MAPPING_UNUSED		0
-#define TENSTORRENT_MAPPING_RESOURCE0_UC	1
-#define TENSTORRENT_MAPPING_RESOURCE0_WC	2
-#define TENSTORRENT_MAPPING_RESOURCE1_UC	3
-#define TENSTORRENT_MAPPING_RESOURCE1_WC	4
-#define TENSTORRENT_MAPPING_RESOURCE2_UC	5
-#define TENSTORRENT_MAPPING_RESOURCE2_WC	6
+#define TENSTORRENT_MAPPING_UNUSED 0
+#define TENSTORRENT_MAPPING_RESOURCE0_UC 1
+#define TENSTORRENT_MAPPING_RESOURCE0_WC 2
+#define TENSTORRENT_MAPPING_RESOURCE1_UC 3
+#define TENSTORRENT_MAPPING_RESOURCE1_WC 4
+#define TENSTORRENT_MAPPING_RESOURCE2_UC 5
+#define TENSTORRENT_MAPPING_RESOURCE2_WC 6
 
-#define TENSTORRENT_MAX_DMA_BUFS	8
+#define TENSTORRENT_MAX_DMA_BUFS 8
 
 struct tenstorrent_get_device_info_in {
-	__u32 output_size_bytes;
+    __u32 output_size_bytes;
 };
 
 struct tenstorrent_get_device_info_out {
-	__u32 output_size_bytes;
-	__u16 vendor_id;
-	__u16 device_id;
-	__u16 subsystem_vendor_id;
-	__u16 subsystem_id;
-	__u16 bus_dev_fn;	// [0:2] function, [3:7] device, [8:15] bus
-	__u16 max_dma_buf_size_log2;
-	__u16 pci_domain;
+    __u32 output_size_bytes;
+    __u16 vendor_id;
+    __u16 device_id;
+    __u16 subsystem_vendor_id;
+    __u16 subsystem_id;
+    __u16 bus_dev_fn;  // [0:2] function, [3:7] device, [8:15] bus
+    __u16 max_dma_buf_size_log2;
+    __u16 pci_domain;
 };
 
 struct tenstorrent_get_device_info {
-	struct tenstorrent_get_device_info_in in;
-	struct tenstorrent_get_device_info_out out;
+    struct tenstorrent_get_device_info_in in;
+    struct tenstorrent_get_device_info_out out;
 };
 
 struct tenstorrent_query_mappings_in {
-	__u32 output_mapping_count;
-	__u32 reserved;
+    __u32 output_mapping_count;
+    __u32 reserved;
 };
 
 struct tenstorrent_mapping {
-	__u32 mapping_id;
-	__u32 reserved;
-	__u64 mapping_base;
-	__u64 mapping_size;
+    __u32 mapping_id;
+    __u32 reserved;
+    __u64 mapping_base;
+    __u64 mapping_size;
 };
 
 struct tenstorrent_query_mappings_out {
-	struct tenstorrent_mapping mappings[0];
+    struct tenstorrent_mapping mappings[0];
 };
 
 struct tenstorrent_query_mappings {
-	struct tenstorrent_query_mappings_in in;
-	struct tenstorrent_query_mappings_out out;
+    struct tenstorrent_query_mappings_in in;
+    struct tenstorrent_query_mappings_out out;
 };
 
 struct tenstorrent_allocate_dma_buf_in {
-	__u32 requested_size;
-	__u8  buf_index;	// [0,TENSTORRENT_MAX_DMA_BUFS)
-	__u8  reserved0[3];
-	__u64 reserved1[2];
+    __u32 requested_size;
+    __u8 buf_index;  // [0,TENSTORRENT_MAX_DMA_BUFS)
+    __u8 reserved0[3];
+    __u64 reserved1[2];
 };
 
 struct tenstorrent_allocate_dma_buf_out {
-	__u64 physical_address;
-	__u64 mapping_offset;
-	__u32 size;
-	__u32 reserved0;
-	__u64 reserved1[2];
+    __u64 physical_address;
+    __u64 mapping_offset;
+    __u32 size;
+    __u32 reserved0;
+    __u64 reserved1[2];
 };
 
 struct tenstorrent_allocate_dma_buf {
-	struct tenstorrent_allocate_dma_buf_in in;
-	struct tenstorrent_allocate_dma_buf_out out;
+    struct tenstorrent_allocate_dma_buf_in in;
+    struct tenstorrent_allocate_dma_buf_out out;
 };
 
-struct tenstorrent_free_dma_buf_in {
-};
+struct tenstorrent_free_dma_buf_in {};
 
-struct tenstorrent_free_dma_buf_out {
-};
+struct tenstorrent_free_dma_buf_out {};
 
 struct tenstorrent_free_dma_buf {
-	struct tenstorrent_free_dma_buf_in in;
-	struct tenstorrent_free_dma_buf_out out;
+    struct tenstorrent_free_dma_buf_in in;
+    struct tenstorrent_free_dma_buf_out out;
 };
 
 struct tenstorrent_get_driver_info_in {
-	__u32 output_size_bytes;
+    __u32 output_size_bytes;
 };
 
 struct tenstorrent_get_driver_info_out {
-	__u32 output_size_bytes;
-	__u32 driver_version;
+    __u32 output_size_bytes;
+    __u32 driver_version;
 };
 
 struct tenstorrent_get_driver_info {
-	struct tenstorrent_get_driver_info_in in;
-	struct tenstorrent_get_driver_info_out out;
+    struct tenstorrent_get_driver_info_in in;
+    struct tenstorrent_get_driver_info_out out;
 };
 
 struct tenstorrent_reset_device_in {
-	__u32 output_size_bytes;
-	__u32 flags;
+    __u32 output_size_bytes;
+    __u32 flags;
 };
 
 struct tenstorrent_reset_device_out {
-	__u32 output_size_bytes;
-	__u32 result;
+    __u32 output_size_bytes;
+    __u32 result;
 };
 
 struct tenstorrent_reset_device {
-	struct tenstorrent_reset_device_in in;
-	struct tenstorrent_reset_device_out out;
+    struct tenstorrent_reset_device_in in;
+    struct tenstorrent_reset_device_out out;
 };
 
 // tenstorrent_pin_pages_in.flags
 #define TENSTORRENT_PIN_PAGES_CONTIGUOUS 1
 
 struct tenstorrent_pin_pages_in {
-	__u32 output_size_bytes;
-	__u32 flags;
-	__u64 virtual_address;
-	__u64 size;
+    __u32 output_size_bytes;
+    __u32 flags;
+    __u64 virtual_address;
+    __u64 size;
 };
 
 struct tenstorrent_pin_pages_out {
-	__u64 physical_address;
+    __u64 physical_address;
 };
 
 struct tenstorrent_pin_pages {
-	struct tenstorrent_pin_pages_in in;
-	struct tenstorrent_pin_pages_out out;
+    struct tenstorrent_pin_pages_in in;
+    struct tenstorrent_pin_pages_out out;
 };
 
 #endif
diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp
index bacfb832..86de3e29 100644
--- a/device/mockup/tt_mockup_device.hpp
+++ b/device/mockup/tt_mockup_device.hpp
@@ -13,27 +13,37 @@
 #include "device/tt_device.h"
 
 class tt_MockupDevice : public tt_device {
-   public:
+public:
     tt_MockupDevice(const std::string& sdesc_path) : tt_device(sdesc_path) {
         soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
         std::set<chip_id_t> target_devices = {0};
     }
+
     virtual ~tt_MockupDevice() {}
 
     // Setup/Teardown Functions
     virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors() override {
         return soc_descriptor_per_chip;
     }
+
     void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) override {}
+
     void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) override {}
+
     void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) override {}
-    void set_driver_eth_interface_params(
-        const tt_driver_eth_interface_params& eth_interface_params_) override {}
+
+    void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) override {}
+
     void start_device(const tt_device_params& device_params) override {}
+
     void assert_risc_reset() override {}
+
     void deassert_risc_reset() override {}
+
     void deassert_risc_reset_at_core(tt_cxy_pair core) override {}
+
     void assert_risc_reset_at_core(tt_cxy_pair core) override {}
+
     void close_device() override {}
 
     // Runtime Functions
@@ -43,10 +53,13 @@ class tt_MockupDevice : public tt_device {
         tt_cxy_pair core,
         uint64_t addr,
         const std::string& tlb_to_use) override {}
+
     void read_from_device(
         void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) override {}
+
     void write_to_sysmem(
         const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) override {}
+
     void read_from_sysmem(
         void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) override {}
 
@@ -54,10 +67,12 @@ class tt_MockupDevice : public tt_device {
         const chip_id_t chip,
         const std::string& fallback_tlb,
         const std::unordered_set<tt_xy_pair>& cores = {}) override {}
+
     void dram_membar(
         const chip_id_t chip,
         const std::string& fallback_tlb,
         const std::unordered_set<uint32_t>& channels = {}) override {}
+
     void dram_membar(
         const chip_id_t chip,
         const std::string& fallback_tlb,
@@ -66,27 +81,35 @@ class tt_MockupDevice : public tt_device {
     void wait_for_non_mmio_flush() override {}
 
     // Misc. Functions to Query/Set Device State
-    std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors() override {
-        return {{0, 0}};
-    }
+    std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors() override { return {{0, 0}}; }
+
     static std::vector<chip_id_t> detect_available_device_ids() { return {0}; };
+
     std::set<chip_id_t> get_target_remote_device_ids() override { return target_remote_chips; }
+
     std::map<int, int> get_clocks() override { return {{0, 0}}; }
+
     void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const override {
         return nullptr;
     }
+
     std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; }
+
     std::uint32_t get_num_dram_channels(std::uint32_t device_id) override {
         return get_soc_descriptor(device_id).get_num_dram_channels();
     };
+
     std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) override {
         return get_soc_descriptor(device_id).dram_bank_size;
     }
+
     std::uint32_t get_num_host_channels(std::uint32_t device_id) override { return 1; }
+
     std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return 0; }
+
     std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) override { return 0; }
 
-   private:
+private:
     std::vector<tt::ARCH> archs_in_cluster = {};
     std::set<chip_id_t> target_devices_in_cluster = {};
     std::set<chip_id_t> target_remote_chips = {};
diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp
index 4cd3ab79..7f5627b2 100644
--- a/device/pcie/pci_device.cpp
+++ b/device/pcie/pci_device.cpp
@@ -4,24 +4,24 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include "pci_device.hpp"
+
+#include <fcntl.h>      // for ::open
+#include <linux/pci.h>  // for PCI_SLOT, PCI_FUNC
+#include <sys/ioctl.h>  // for ioctl
+#include <sys/mman.h>   // for mmap, munmap
+#include <unistd.h>     // for ::close
+
 #include <cstdint>
-#include <cstring> // for memcpy
+#include <cstring>  // for memcpy
 #include <vector>
-#include <fcntl.h>  // for ::open
-#include <unistd.h> // for ::close
-#include <sys/ioctl.h> // for ioctl
-#include <sys/mman.h>  // for mmap, munmap
-#include <linux/pci.h> // for PCI_SLOT, PCI_FUNC
-
-#include "pci_device.hpp"
-#include "ioctl.h"
 
-#include "ioctl.h"
-#include "device/tt_arch_types.h"
-#include "device/driver_atomics.h"
-#include "device/architecture_implementation.h"
 #include "common/assert.hpp"
 #include "common/logger.hpp"
+#include "device/architecture_implementation.h"
+#include "device/driver_atomics.h"
+#include "device/tt_arch_types.h"
+#include "ioctl.h"
 
 static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca;
 static const uint16_t WH_PCIE_DEVICE_ID = 0x401e;
@@ -29,19 +29,23 @@ static const uint16_t BH_PCIE_DEVICE_ID = 0xb140;
 
 // TODO: we'll have to rethink this when KMD takes control of the inbound PCIe
 // TLB windows and there is no longer a pre-defined WC/UC split.
-static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24);
+static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156 << 20) + (10 << 21) + (18 << 24);
 
 // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC
-static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21;
+static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188 << 21;
 
 static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044;
 static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078;
 
 template <typename T>
 static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) {
-    const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}",
-                                        device_info.pci_domain, device_info.pci_bus,
-                                        device_info.pci_device, device_info.pci_function, attribute_name);
+    const auto sysfs_path = fmt::format(
+        "/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}",
+        device_info.pci_domain,
+        device_info.pci_bus,
+        device_info.pci_device,
+        device_info.pci_function,
+        attribute_name);
     std::ifstream attribute_file(sysfs_path);
     std::string value_str;
     T value;
@@ -66,8 +70,7 @@ static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribu
     return value;
 }
 
-static PciDeviceInfo read_device_info(int fd)
-{
+static PciDeviceInfo read_device_info(int fd) {
     tenstorrent_get_device_info info{};
     info.in.output_size_bytes = sizeof(info.out);
 
@@ -83,11 +86,11 @@ static PciDeviceInfo read_device_info(int fd)
 }
 
 static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) {
-    if (pcie_device_id == GS_PCIE_DEVICE_ID){
+    if (pcie_device_id == GS_PCIE_DEVICE_ID) {
         return tt::ARCH::GRAYSKULL;
-    } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){
+    } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01) {
         return tt::ARCH::WORMHOLE_B0;
-    } else if (pcie_device_id == BH_PCIE_DEVICE_ID){
+    } else if (pcie_device_id == BH_PCIE_DEVICE_ID) {
         return tt::ARCH::BLACKHOLE;
     } else {
         TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id);
@@ -113,28 +116,29 @@ inline void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes)
 
     if (dest_misalignment != 0) {
         // Read-modify-write for the first dest element.
-        dp = reinterpret_cast<copy_t*>(dest_addr - dest_misalignment);
+        dp = reinterpret_cast<copy_t *>(dest_addr - dest_misalignment);
 
         copy_t tmp = *dp;
 
         auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes);
 
-        std::memcpy(reinterpret_cast<char*>(&tmp) + dest_misalignment, src, leading_len);
+        std::memcpy(reinterpret_cast<char *>(&tmp) + dest_misalignment, src, leading_len);
         num_bytes -= leading_len;
         src = static_cast<const char *>(src) + leading_len;
 
         *dp++ = tmp;
 
     } else {
-        dp = static_cast<copy_t*>(dest);
+        dp = static_cast<copy_t *>(dest);
     }
 
     // Copy the destination-aligned middle.
-    const copy_t *sp = static_cast<const copy_t*>(src);
+    const copy_t *sp = static_cast<const copy_t *>(src);
     std::size_t num_words = num_bytes / sizeof(copy_t);
 
-    for (std::size_t i = 0; i < num_words; i++)
+    for (std::size_t i = 0; i < num_words; i++) {
         *dp++ = *sp++;
+    }
 
     // Finally copy any sub-word trailer, again RMW on the destination.
     auto trailing_len = num_bytes % sizeof(copy_t);
@@ -157,7 +161,7 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
     unsigned int src_misalignment = src_addr % sizeof(copy_t);
 
     if (src_misalignment != 0) {
-        sp = reinterpret_cast<copy_t*>(src_addr - src_misalignment);
+        sp = reinterpret_cast<copy_t *>(src_addr - src_misalignment);
 
         copy_t tmp = *sp++;
 
@@ -167,15 +171,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
         dest = static_cast<char *>(dest) + leading_len;
 
     } else {
-        sp = static_cast<const volatile copy_t*>(src);
+        sp = static_cast<const volatile copy_t *>(src);
     }
 
     // Copy the source-aligned middle.
     copy_t *dp = static_cast<copy_t *>(dest);
     std::size_t num_words = num_bytes / sizeof(copy_t);
 
-    for (std::size_t i = 0; i < num_words; i++)
+    for (std::size_t i = 0; i < num_words; i++) {
         *dp++ = *sp++;
+    }
 
     // Finally copy any sub-word trailer.
     auto trailing_len = num_bytes % sizeof(copy_t);
@@ -186,17 +191,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
 }
 
 tt::ARCH PciDeviceInfo::get_arch() const {
-    if (this->device_id == GS_PCIE_DEVICE_ID){
+    if (this->device_id == GS_PCIE_DEVICE_ID) {
         return tt::ARCH::GRAYSKULL;
     } else if (this->device_id == WH_PCIE_DEVICE_ID) {
         return tt::ARCH::WORMHOLE_B0;
-    } else if (this->device_id == BH_PCIE_DEVICE_ID){
+    } else if (this->device_id == BH_PCIE_DEVICE_ID) {
         return tt::ARCH::BLACKHOLE;
     }
     return tt::ARCH::Invalid;
 }
 
-
 /* static */ std::vector<int> PCIDevice::enumerate_devices() {
     std::vector<int> device_ids;
     std::string path = "/dev/tenstorrent/";
@@ -204,7 +208,7 @@ tt::ARCH PciDeviceInfo::get_arch() const {
     if (!std::filesystem::exists(path)) {
         return device_ids;
     }
-    for (const auto& entry : std::filesystem::directory_iterator(path)) {
+    for (const auto &entry : std::filesystem::directory_iterator(path)) {
         std::string filename = entry.path().filename().string();
 
         // TODO: this will skip any device that has a non-numeric name, which
@@ -228,28 +232,29 @@ tt::ARCH PciDeviceInfo::get_arch() const {
 
         try {
             infos[n] = read_device_info(fd);
-        } catch (...) {}
+        } catch (...) {
+        }
 
         close(fd);
     }
     return infos;
 }
 
-PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
-    : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number))
-    , pci_device_num(pci_device_number)
-    , logical_id(logical_device_id)
-    , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC))
-    , info(read_device_info(pci_device_file_desc))
-    , numa_node(read_sysfs<int>(info, "numa_node"))
-    , revision(read_sysfs<int>(info, "revision"))
-    , arch(detect_arch(info.device_id, revision))
-    , architecture_implementation(tt::umd::architecture_implementation::create(arch))
-{
+PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) :
+    device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)),
+    pci_device_num(pci_device_number),
+    logical_id(logical_device_id),
+    pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)),
+    info(read_device_info(pci_device_file_desc)),
+    numa_node(read_sysfs<int>(info, "numa_node")),
+    revision(read_sysfs<int>(info, "revision")),
+    arch(detect_arch(info.device_id, revision)),
+    architecture_implementation(tt::umd::architecture_implementation::create(arch)) {
     struct {
         tenstorrent_query_mappings query_mappings;
         tenstorrent_mapping mapping_array[8];
     } mappings;
+
     memset(&mappings, 0, sizeof(mappings));
     mappings.query_mappings.in.output_mapping_count = 8;
 
@@ -293,7 +298,9 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
             bar4_wc_mapping = mappings.mapping_array[i];
         }
 
-        log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}",
+        log_debug(
+            LogSiliconDriver,
+            "BAR mapping id {} base {} size {}",
             mappings.mapping_array[i].mapping_id,
             (void *)mappings.mapping_array[i].mapping_base,
             mappings.mapping_array[i].mapping_size);
@@ -308,7 +315,8 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
     // Attempt WC mapping first so we can fall back to all-UC if it fails.
     if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) {
         bar0_wc_size = std::min<size_t>(bar0_wc_mapping.mapping_size, wc_mapping_size);
-        bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base);
+        bar0_wc = mmap(
+            NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base);
         if (bar0_wc == MAP_FAILED) {
             bar0_wc_size = 0;
             bar0_wc = nullptr;
@@ -325,7 +333,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
         bar0_uc_offset = 0;
     }
 
-    bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset);
+    bar0_uc = mmap(
+        NULL,
+        bar0_uc_size,
+        PROT_READ | PROT_WRITE,
+        MAP_SHARED,
+        pci_device_file_desc,
+        bar0_uc_mapping.mapping_base + bar0_uc_offset);
 
     if (bar0_uc == MAP_FAILED) {
         throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num));
@@ -342,22 +356,34 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
 
         system_reg_mapping_size = bar4_uc_mapping.mapping_size;
 
-        system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base);
+        system_reg_mapping = mmap(
+            NULL,
+            bar4_uc_mapping.mapping_size,
+            PROT_READ | PROT_WRITE,
+            MAP_SHARED,
+            pci_device_file_desc,
+            bar4_uc_mapping.mapping_base);
 
         if (system_reg_mapping == MAP_FAILED) {
             throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num));
         }
 
-        system_reg_start_offset = (512 - 16) * 1024*1024;
-        system_reg_offset_adjust = (512 - 32) * 1024*1024;
-    } else if(arch == tt::ARCH::BLACKHOLE) {
+        system_reg_start_offset = (512 - 16) * 1024 * 1024;
+        system_reg_offset_adjust = (512 - 32) * 1024 * 1024;
+    } else if (arch == tt::ARCH::BLACKHOLE) {
         if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) {
             throw std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num));
         }
 
         // Using UnCachable memory mode. This is used for accessing registers on Blackhole.
         bar2_uc_size = bar2_uc_mapping.mapping_size;
-        bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base);
+        bar2_uc = mmap(
+            NULL,
+            bar2_uc_mapping.mapping_size,
+            PROT_READ | PROT_WRITE,
+            MAP_SHARED,
+            pci_device_file_desc,
+            bar2_uc_mapping.mapping_base);
 
         if (bar2_uc == MAP_FAILED) {
             throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num));
@@ -370,7 +396,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
         // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole.
         // WC doesn't guarantee write ordering but has better performance.
         bar4_wc_size = bar4_wc_mapping.mapping_size;
-        bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base);
+        bar4_wc = mmap(
+            NULL,
+            bar4_wc_mapping.mapping_size,
+            PROT_READ | PROT_WRITE,
+            MAP_SHARED,
+            pci_device_file_desc,
+            bar4_wc_mapping.mapping_base);
 
         if (bar4_wc == MAP_FAILED) {
             throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num));
@@ -390,8 +422,8 @@ PCIDevice::~PCIDevice() {
         // essential for correctness then it needs to move to the driver.
         uint64_t iatu_index = 0;
         uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200;
-        uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0
-        write_regs(reinterpret_cast<uint32_t*>(static_cast<uint8_t*>(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1);
+        uint32_t region_ctrl_2 = 0 << 31;  // REGION_EN = 0
+        write_regs(reinterpret_cast<uint32_t *>(static_cast<uint8_t *>(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1);
     }
 
     close(pci_device_file_desc);
@@ -417,8 +449,8 @@ PCIDevice::~PCIDevice() {
     }
 }
 
-template<typename T>
-T* PCIDevice::get_register_address(uint32_t register_offset) {
+template <typename T>
+T *PCIDevice::get_register_address(uint32_t register_offset) {
     // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole).
     // Should clarify this interface
     void *reg_mapping;
@@ -431,10 +463,10 @@ T* PCIDevice::get_register_address(uint32_t register_offset) {
         register_offset -= bar0_uc_offset;
         reg_mapping = bar0_uc;
     }
-    return reinterpret_cast<T*>(static_cast<uint8_t*>(reg_mapping) + register_offset);
+    return reinterpret_cast<T *>(static_cast<uint8_t *>(reg_mapping) + register_offset);
 }
 
-void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) {
+void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr) {
     void *dest = nullptr;
     if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) {
         byte_addr -= BAR0_BH_SIZE;
@@ -451,7 +483,7 @@ void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_
     }
 }
 
-void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) {
+void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr) {
     void *src = nullptr;
     if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) {
         byte_addr -= BAR0_BH_SIZE;
@@ -468,7 +500,7 @@ void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buff
     }
 
     if (num_bytes >= sizeof(std::uint32_t)) {
-        detect_hang_read(*reinterpret_cast<std::uint32_t*>(dest));
+        detect_hang_read(*reinterpret_cast<std::uint32_t *>(dest));
     }
 }
 
@@ -481,14 +513,14 @@ void PCIDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_
 
 void PCIDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) {
     volatile uint32_t *dest = get_register_address<uint32_t>(byte_addr);
-    const uint32_t *src = reinterpret_cast<const uint32_t*>(data);
+    const uint32_t *src = reinterpret_cast<const uint32_t *>(data);
 
     write_regs(dest, src, word_len);
 }
 
 void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) {
     const volatile uint32_t *src = get_register_address<uint32_t>(byte_addr);
-    uint32_t *dest = reinterpret_cast<uint32_t*>(data);
+    uint32_t *dest = reinterpret_cast<uint32_t *>(data);
 
     while (word_len-- != 0) {
         uint32_t temp = *src++;
@@ -496,29 +528,34 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) {
     }
 }
 
-void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){
-    log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs");
+void PCIDevice::write_tlb_reg(
+    uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size) {
+    log_assert(
+        (tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12),
+        "Tenstorrent hardware supports only 64bit or 96bit TLB config regs");
 
     volatile uint64_t *dest_qw = get_register_address<uint64_t>(byte_addr);
-    volatile uint32_t *dest_extra_dw = get_register_address<uint32_t>(byte_addr+8);
+    volatile uint32_t *dest_extra_dw = get_register_address<uint32_t>(byte_addr + 8);
 #if defined(__ARM_ARCH) || defined(__riscv)
     // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses.
-    // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses.
-    // Insert an explicit full memory barrier for ARM.
-    // Do the same for RISC-V.
+    // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory
+    // accesses. Insert an explicit full memory barrier for ARM. Do the same for RISC-V.
     tt_driver_atomics::mfence();
 #endif
     *dest_qw = value_lower;
     if (tlb_cfg_reg_size > 8) {
-        uint32_t* p_value_upper = reinterpret_cast<uint32_t*>(&value_upper);
+        uint32_t *p_value_upper = reinterpret_cast<uint32_t *>(&value_upper);
         *dest_extra_dw = p_value_upper[0];
     }
-    tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register.
+    tt_driver_atomics::mfence();  // Otherwise subsequent WC loads move earlier than the above UC store to the TLB
+                                  // register.
 }
 
 bool PCIDevice::is_hardware_hung() {
-    volatile const void *addr = reinterpret_cast<const char *>(bar0_uc) + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - bar0_uc_offset;
-    std::uint32_t scratch_data = *reinterpret_cast<const volatile std::uint32_t*>(addr);
+    volatile const void *addr = reinterpret_cast<const char *>(bar0_uc) +
+                                (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) -
+                                bar0_uc_offset;
+    std::uint32_t scratch_data = *reinterpret_cast<const volatile std::uint32_t *>(addr);
 
     return (scratch_data == c_hang_read_value);
 }
@@ -532,52 +569,91 @@ void PCIDevice::detect_hang_read(std::uint32_t data_read) {
 }
 
 // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it.
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end,
-                            std::uint64_t address, bool multicast, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, std::uint64_t ordering) {
+dynamic_tlb PCIDevice::set_dynamic_tlb(
+    unsigned int tlb_index,
+    tt_xy_pair start,
+    tt_xy_pair end,
+    std::uint64_t address,
+    bool multicast,
+    std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+    std::uint64_t ordering) {
     auto architecture_implementation = get_architecture_implementation();
     if (multicast) {
         std::tie(start, end) = architecture_implementation->multicast_workaround(start, end);
     }
 
-    log_trace(LogSiliconDriver, "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast = {}, ordering = {}",
-         tlb_index, start.x, start.y, end.x, end.y, address, multicast, (int)ordering);
+    log_trace(
+        LogSiliconDriver,
+        "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast "
+        "= {}, ordering = {}",
+        tlb_index,
+        start.x,
+        start.y,
+        end.x,
+        end.y,
+        address,
+        multicast,
+        (int)ordering);
 
     tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index);
     std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes();
     auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start);
     auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end);
-    uint32_t tlb_address    = address / tlb_config.size;
-    uint32_t local_address   = address % tlb_config.size;
-    uint64_t tlb_base       = tlb_config.base + (tlb_config.size * tlb_config.index_offset);
-    uint32_t tlb_cfg_reg    = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset);
-
-    std::pair<std::uint64_t, std::uint64_t> tlb_data = tt::umd::tlb_data {
-        .local_offset = tlb_address,
-        .x_end = static_cast<uint64_t>(translated_end_coords.x),
-        .y_end = static_cast<uint64_t>(translated_end_coords.y),
-        .x_start = static_cast<uint64_t>(translated_start_coords.x),
-        .y_start = static_cast<uint64_t>(translated_start_coords.y),
-        .mcast = multicast,
-        .ordering = ordering,
-        // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0.
-        // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB.
-        // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc.
-        .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true,
-    }.apply_offset(tlb_config.offset);
-
-    log_debug(LogSiliconDriver, "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} tlb_cfg_reg: 0x{:x}", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg);
+    uint32_t tlb_address = address / tlb_config.size;
+    uint32_t local_address = address % tlb_config.size;
+    uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset);
+    uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset);
+
+    std::pair<std::uint64_t, std::uint64_t> tlb_data =
+        tt::umd::tlb_data{
+            .local_offset = tlb_address,
+            .x_end = static_cast<uint64_t>(translated_end_coords.x),
+            .y_end = static_cast<uint64_t>(translated_end_coords.y),
+            .x_start = static_cast<uint64_t>(translated_start_coords.x),
+            .y_start = static_cast<uint64_t>(translated_start_coords.y),
+            .mcast = multicast,
+            .ordering = ordering,
+            // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0.
+            // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be
+            // the same TLB. Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc.
+            .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true,
+        }
+            .apply_offset(tlb_config.offset);
+
+    log_debug(
+        LogSiliconDriver,
+        "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} "
+        "tlb_cfg_reg: 0x{:x}",
+        tlb_index,
+        tlb_config.index_offset,
+        tlb_config.size / (1024 * 1024),
+        tlb_base,
+        tlb_cfg_reg);
     write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES);
 
-    return { tlb_base + local_address, tlb_config.size - local_address };
+    return {tlb_base + local_address, tlb_config.size - local_address};
 }
 
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, std::uint64_t ordering) {
+dynamic_tlb PCIDevice::set_dynamic_tlb(
+    unsigned int tlb_index,
+    tt_xy_pair target,
+    std::uint64_t address,
+    std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+    std::uint64_t ordering) {
     return set_dynamic_tlb(tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering);
 }
 
-dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering) {
+dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(
+    unsigned int tlb_index,
+    std::uint64_t address,
+    std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+    tt_xy_pair start,
+    tt_xy_pair end,
+    std::uint64_t ordering) {
     // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid
     return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering);
 }
 
-tt::umd::architecture_implementation* PCIDevice::get_architecture_implementation() const {return architecture_implementation.get();}
\ No newline at end of file
+tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation() const {
+    return architecture_implementation.get();
+}
\ No newline at end of file
diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp
index c79c1089..3f8f604c 100644
--- a/device/pcie/pci_device.hpp
+++ b/device/pcie/pci_device.hpp
@@ -12,32 +12,33 @@
 #include <unordered_map>
 #include <vector>
 
-#include "device/tt_xy_pair.h"
+#include "device/tlb.h"
 #include "device/tt_arch_types.h"
 #include "device/tt_cluster_descriptor_types.h"
-#include "device/tlb.h"
+#include "device/tt_xy_pair.h"
 
 // TODO: this is used up in tt_silicon_driver.cpp but that logic ought to be
 // lowered into the PCIDevice class since it is specific to PCIe cards.
 // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
 static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;
 
-// TODO: this is a bit of a hack... something to revisit when we formalize an 
+// TODO: this is a bit of a hack... something to revisit when we formalize an
 // abstraction for IO.
 // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4
 static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024;
 
 constexpr unsigned int c_hang_read_value = 0xffffffffu;
 
-namespace tt::umd { class architecture_implementation; }
+namespace tt::umd {
+class architecture_implementation;
+}
 
 struct dynamic_tlb {
-    uint64_t bar_offset;        // Offset that address is mapped to, within the PCI BAR.
-    uint64_t remaining_size;    // Bytes remaining between bar_offset and end of the TLB.
+    uint64_t bar_offset;      // Offset that address is mapped to, within the PCI BAR.
+    uint64_t remaining_size;  // Bytes remaining between bar_offset and end of the TLB.
 };
 
-struct PciDeviceInfo
-{
+struct PciDeviceInfo {
     uint16_t vendor_id;
     uint16_t device_id;
     uint16_t pci_domain;
@@ -51,14 +52,14 @@ struct PciDeviceInfo
 };
 
 class PCIDevice {
-    const std::string device_path;  // Path to character device: /dev/tenstorrent/N
-    const int pci_device_num;       // N in /dev/tenstorrent/N
-    const int logical_id;           // Unique identifier for each device in entire network topology
-    const int pci_device_file_desc; // Character device file descriptor
-    const PciDeviceInfo info;       // PCI device info
-    const int numa_node;            // -1 if non-NUMA
-    const int revision;             // PCI revision value from sysfs
-    const tt::ARCH arch;            // e.g. Grayskull, Wormhole, Blackhole
+    const std::string device_path;   // Path to character device: /dev/tenstorrent/N
+    const int pci_device_num;        // N in /dev/tenstorrent/N
+    const int logical_id;            // Unique identifier for each device in entire network topology
+    const int pci_device_file_desc;  // Character device file descriptor
+    const PciDeviceInfo info;        // PCI device info
+    const int numa_node;             // -1 if non-NUMA
+    const int revision;              // PCI revision value from sysfs
+    const tt::ARCH arch;             // e.g. Grayskull, Wormhole, Blackhole
     std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;
 
 public:
@@ -77,7 +78,7 @@ class PCIDevice {
      *
      * Opens the character device file descriptor, reads device information from
      * sysfs, and maps device memory region(s) into the process address space.
-     * 
+     *
      * @param pci_device_number     N in /dev/tenstorrent/N
      * @param logical_device_id     unique identifier for this device in the network topology
      */
@@ -89,8 +90,8 @@ class PCIDevice {
      */
     ~PCIDevice();
 
-    PCIDevice(const PCIDevice&) = delete; // copy
-    void operator=(const PCIDevice&) = delete; // copy assignment
+    PCIDevice(const PCIDevice &) = delete;       // copy
+    void operator=(const PCIDevice &) = delete;  // copy assignment
 
     /**
      * @return PCI device info
@@ -149,21 +150,39 @@ class PCIDevice {
     // NOC endpoints.  Probably worth waiting for the KMD to start owning the
     // resource management aspect of these PCIe->NOC mappings (the "TLBs")
     // before doing too much work here...
-    void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr);
-    void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr);
+    void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr);
+    void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr);
     void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
     void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
     void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);
 
     // TLB related functions.
     // TODO: These are architecture specific, and will be moved out of the class.
-    void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
-    dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end,
-                                std::uint64_t address, bool multicast, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, std::uint64_t ordering);
-    dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
-    dynamic_tlb set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
-
-    tt::umd::architecture_implementation* get_architecture_implementation() const;
+    void write_tlb_reg(
+        uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
+    dynamic_tlb set_dynamic_tlb(
+        unsigned int tlb_index,
+        tt_xy_pair start,
+        tt_xy_pair end,
+        std::uint64_t address,
+        bool multicast,
+        std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+        std::uint64_t ordering);
+    dynamic_tlb set_dynamic_tlb(
+        unsigned int tlb_index,
+        tt_xy_pair target,
+        std::uint64_t address,
+        std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+        std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
+    dynamic_tlb set_dynamic_tlb_broadcast(
+        unsigned int tlb_index,
+        std::uint64_t address,
+        std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, tt_xy_pair>> &harvested_coord_translation,
+        tt_xy_pair start,
+        tt_xy_pair end,
+        std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
+
+    tt::umd::architecture_implementation *get_architecture_implementation() const;
     void detect_hang_read(uint32_t data_read = c_hang_read_value);
 
 public:
@@ -186,8 +205,8 @@ class PCIDevice {
     // and simplify the code.
     void *system_reg_mapping = nullptr;
     size_t system_reg_mapping_size;
-    uint32_t system_reg_start_offset;  // Registers >= this are system regs, use the mapping.
-    uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping.
+    uint32_t system_reg_start_offset;   // Registers >= this are system regs, use the mapping.
+    uint32_t system_reg_offset_adjust;  // This is the offset of the first reg in the system reg mapping.
 
     uint32_t read_checking_offset;
 
@@ -195,6 +214,5 @@ class PCIDevice {
     bool is_hardware_hung();
 
     template <typename T>
-    T* get_register_address(uint32_t register_offset);
+    T *get_register_address(uint32_t register_offset);
 };
-
diff --git a/device/simulation/deprecated/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp
index 3e64c15e..2073ff41 100644
--- a/device/simulation/deprecated/tt_emulation_device.cpp
+++ b/device/simulation/deprecated/tt_emulation_device.cpp
@@ -1,190 +1,228 @@
-#include <stdexcept>
+#include "tt_emulation_device.h"
+
 #include <cstring>
+#include <stdexcept>
 
 #include "common/logger.hpp"
 #include "device/tt_cluster_descriptor.h"
-#include "tt_emulation_device.h"
 #include "tt_emu_zemi3_wrapper.h"
 
-
 tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) {
-  soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
-  std::set<chip_id_t> target_devices = {0};
-  // create just a default one, we do not have cluster anyway
-  ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {});
-  tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper();
+    soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
+    std::set<chip_id_t> target_devices = {0};
+    // create just a default one, we do not have cluster anyway
+    ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {});
+    tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper();
 
-  log_info(tt::LogEmulationDriver, "Created Emulation Device ");
+    log_info(tt::LogEmulationDriver, "Created Emulation Device ");
 }
 
 tt_emulation_device::~tt_emulation_device() {
-  ndesc.reset();
-  delete tt_zebu_wrapper_inst;
-  log_info(tt::LogEmulationDriver, "Destroyed Emulation Device ");
+    ndesc.reset();
+    delete tt_zebu_wrapper_inst;
+    log_info(tt::LogEmulationDriver, "Destroyed Emulation Device ");
 }
-  
+
 void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector<uint8_t>& data) {
-  const uint32_t size = static_cast<uint32_t>(data.size());
-  tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); 
-  log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y);
+    const uint32_t size = static_cast<uint32_t>(data.size());
+    tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data);
+    log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y);
 }
 
 std::vector<uint8_t> tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {
-  std::vector<uint8_t> data(size);
-  tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data);
-  log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr);
+    std::vector<uint8_t> data(size);
+    tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data);
+    log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr);
 
-  return data;
+    return data;
 }
 
-
 void tt_emulation_device::start_device(const tt_device_params& device_params) {
-  tt_zebu_wrapper_inst->zebu_start();
-  tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC);
-  log_info(tt::LogEmulationDriver, "Started Emulation Device ");
+    tt_zebu_wrapper_inst->zebu_start();
+    tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC);
+    log_info(tt::LogEmulationDriver, "Started Emulation Device ");
 }
 
 void tt_emulation_device::deassert_risc_reset() {
-  tt_zebu_wrapper_inst->all_tensix_reset_deassert();
-  log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset ");
+    tt_zebu_wrapper_inst->all_tensix_reset_deassert();
+    log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset ");
 }
 
 void tt_emulation_device::assert_risc_reset() {
-  tt_zebu_wrapper_inst->all_tensix_reset_assert();
-  log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset ");
+    tt_zebu_wrapper_inst->all_tensix_reset_assert();
+    log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset ");
 }
 
 void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) {
-  tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y);
+    tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y);
 }
 
 void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {
-  tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y);
+    tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y);
 }
 
-
-
 void tt_emulation_device::close_device() {
     log_info(tt::LogEmulationDriver, "Closing Emulation Device ");
     tt_zebu_wrapper_inst->zebu_finish();
 }
 
-void tt_emulation_device::start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/
+void tt_emulation_device::start(
+    std::vector<std::string> plusargs,
+    std::vector<std::string> dump_cores,
+    bool no_checkers,
+    bool /*init_device*/,
+    bool /*skip_driver_allocs*/
 ) {
-  log_info(tt::LogEmulationDriver, "Starting Emulation Device ");
+    log_info(tt::LogEmulationDriver, "Starting Emulation Device ");
+}
+
+void tt_emulation_device::broadcast_write_to_cluster(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb) {
+    for (const auto& core : get_soc_descriptor(0)->cores) {
+        // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) ==
+        // rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) {
+        //     write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
+        //   }
+        // MT: Iterate through all the worker cores for bcast:
+        // if (get_soc_descriptor(0)->is_worker_core(core.first)) {
+        //   write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
+        // }
+        // Emulation only broadcasts to all Tensix cores or all DRAM cores.
+        // differentiate which bcast pattern to use based on exclude columns
+        if (cols_to_exclude.find(0) == cols_to_exclude.end()) {
+            // Detect DRAM bcast
+            if (get_soc_descriptor(0)->is_dram_core(core.first)) {
+                write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
+            }
+        } else {
+            if (get_soc_descriptor(0)->is_worker_core(core.first)) {
+                write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
+            }
+        }
+    }
 }
 
-
-void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb) {
-  for(const auto& core : get_soc_descriptor(0) -> cores) {
-    // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) {
-    //     write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
-    //   }
-    // MT: Iterate through all the worker cores for bcast:
-    // if (get_soc_descriptor(0)->is_worker_core(core.first)) {
-    //   write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
-    // }
-    // Emulation only broadcasts to all Tensix cores or all DRAM cores.
-    // differentiate which bcast pattern to use based on exclude columns
-    if (cols_to_exclude.find(0) == cols_to_exclude.end()) {
-      // Detect DRAM bcast
-      if (get_soc_descriptor(0)->is_dram_core(core.first)) {
-        write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
-      }
-    } else {
-      if (get_soc_descriptor(0)->is_worker_core(core.first)) {
-        write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
-      }
+void tt_emulation_device::rolled_write_to_device(
+    std::vector<uint32_t>& base_vec,
+    uint32_t unroll_count,
+    tt_cxy_pair core,
+    uint64_t base_addr,
+    const std::string& tlb_to_use) {
+    std::vector<uint32_t> vec = base_vec;
+    uint32_t byte_increment = 4 * vec.size();
+    for (uint32_t i = 0; i < unroll_count; ++i) {
+        vec[0] = i;  // slot id for debug
+        uint64_t offset_addr = base_addr + i * byte_increment;
+        write_to_device(vec, core, offset_addr, tlb_to_use);
     }
-  }
-} 
-void tt_emulation_device::rolled_write_to_device(std::vector<uint32_t>& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {
-  std::vector<uint32_t> vec = base_vec;
-  uint32_t byte_increment = 4 * vec.size();
-  for (uint32_t i = 0; i < unroll_count; ++i) {
-    vec[0] = i; // slot id for debug
-    uint64_t offset_addr = base_addr + i * byte_increment;
-    write_to_device(vec, core, offset_addr, tlb_to_use);
-  }
 }
-void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
-  log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!");
 
-  std::vector<std::uint32_t> mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t));
-  write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write);
-}
+void tt_emulation_device::write_to_device(
+    const void* mem_ptr,
+    uint32_t size,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {
+    log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!");
 
-void tt_emulation_device::write_to_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
+    std::vector<std::uint32_t> mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t));
+    write_to_device(
+        mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write);
+}
 
-  std::vector<uint8_t> byte_data(vec.size() * sizeof(uint32_t));
-  std::memcpy(byte_data.data(), vec.data(), byte_data.size());
+void tt_emulation_device::write_to_device(
+    std::vector<uint32_t>& vec,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {
+    std::vector<uint8_t> byte_data(vec.size() * sizeof(uint32_t));
+    std::memcpy(byte_data.data(), vec.data(), byte_data.size());
 
-  write(core, addr, byte_data);
+    write(core, addr, byte_data);
 }
 
-void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
+void tt_emulation_device::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
     // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26
 }
 
-void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
+void tt_emulation_device::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
     // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26
 }
 
-void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
+void tt_emulation_device::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
     // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26
 }
 
+void tt_emulation_device::read_from_device(
+    std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {
+    std::vector<uint8_t> byte_data = read(core, addr, size);
 
+    // Verify that the received byte data can be converted to uint32_t
+    // if (byte_data.size() % sizeof(uint32_t) != 0) {
+    //   throw std::runtime_error("Received byte data size is not a multiple of uint32_t size.");
+    // }
 
-void tt_emulation_device::read_from_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {
-  std::vector<uint8_t> byte_data = read(core, addr, size);
-
-  // Verify that the received byte data can be converted to uint32_t
-  // if (byte_data.size() % sizeof(uint32_t) != 0) {
-  //   throw std::runtime_error("Received byte data size is not a multiple of uint32_t size.");
-  // }
-
-  vec.clear();
-  vec.resize(byte_data.size() / sizeof(uint32_t));
-  std::memcpy(vec.data(), byte_data.data(), byte_data.size());
+    vec.clear();
+    vec.resize(byte_data.size() / sizeof(uint32_t));
+    std::memcpy(vec.data(), byte_data.data(), byte_data.size());
 }
 
 void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {
-  // No translation is performed
-  return;
+    // No translation is performed
+    return;
 }
+
 tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); }
 
 std::set<chip_id_t> tt_emulation_device::get_target_mmio_device_ids() {
-  log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented");
-  return {};
+    log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented");
+    return {};
 }
 
 std::set<chip_id_t> tt_emulation_device::get_target_remote_device_ids() {
-  log_error("LogEmulationDriver: get_target_remote_device_ids not implemented");
-  return {};
+    log_error("LogEmulationDriver: get_target_remote_device_ids not implemented");
+    return {};
 }
 
 void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {
     dram_address_params = dram_address_params_;
 }
+
 int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); }
-std::unordered_set<int> tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; }
+
+std::unordered_set<int> tt_emulation_device::get_all_chips_in_cluster() { return {0}; }
+
 int tt_emulation_device::detect_number_of_chips() { return 1; }
 
 bool tt_emulation_device::using_harvested_soc_descriptors() { return false; }
-bool tt_emulation_device::noc_translation_en() { return false; }
-std::unordered_map<chip_id_t, uint32_t> tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};}
 
-std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;}
+bool tt_emulation_device::noc_translation_en() { return false; }
 
-std::map<int, int> tt_emulation_device::get_clocks() {
-  return std::map<int, int>();
+std::unordered_map<chip_id_t, uint32_t> tt_emulation_device::get_harvesting_masks_for_soc_descriptors() {
+    return {{0, 0}};
 }
 
-void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {
-  l1_address_params = l1_address_params_;
+std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_emulation_device::get_virtual_soc_descriptors() {
+    return soc_descriptor_per_chip;
 }
 
+std::map<int, int> tt_emulation_device::get_clocks() { return std::map<int, int>(); }
 
-
+void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {
+    l1_address_params = l1_address_params_;
+}
diff --git a/device/simulation/deprecated/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h
index fb2b5e0d..f3f932a6 100644
--- a/device/simulation/deprecated/tt_emulation_device.h
+++ b/device/simulation/deprecated/tt_emulation_device.h
@@ -9,63 +9,96 @@
 #include <cstdint>
 #include <fstream>
 #include <vector>
+
+#include "tt_device.h"
 #include "tt_soc_descriptor.h"
 #include "tt_xy_pair.h"
-#include "tt_device.h"
 
 // use forward declaration here so we do not need to include tt_zebu_wrapper.h
 class tt_zebu_wrapper;
 
 class tt_emulation_device : public tt_device {
 public:
-  virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care
-  tt_emulation_device(const std::string& sdesc_path);
-  virtual void start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs);
-  virtual void start_device(const tt_device_params& device_params);
-  virtual void close_device();
-  virtual void deassert_risc_reset();
-  virtual void deassert_risc_reset_at_core(tt_cxy_pair core);
-  virtual void assert_risc_reset();
-  virtual void assert_risc_reset_at_core(tt_cxy_pair core);
-  virtual void write_to_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false);
-  virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false);
-  virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb);
+    virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_);  // Dont care
+    tt_emulation_device(const std::string& sdesc_path);
+    virtual void start(
+        std::vector<std::string> plusargs,
+        std::vector<std::string> dump_cores,
+        bool no_checkers,
+        bool init_device,
+        bool skip_driver_allocs);
+    virtual void start_device(const tt_device_params& device_params);
+    virtual void close_device();
+    virtual void deassert_risc_reset();
+    virtual void deassert_risc_reset_at_core(tt_cxy_pair core);
+    virtual void assert_risc_reset();
+    virtual void assert_risc_reset_at_core(tt_cxy_pair core);
+    virtual void write_to_device(
+        std::vector<uint32_t>& vec,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& tlb_to_use,
+        bool send_epoch_cmd = false,
+        bool last_send_epoch_cmd = true,
+        bool ordered_with_prev_remote_write = false);
+    virtual void write_to_device(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& tlb_to_use,
+        bool send_epoch_cmd = false,
+        bool last_send_epoch_cmd = true,
+        bool ordered_with_prev_remote_write = false);
+    virtual void broadcast_write_to_cluster(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        uint64_t address,
+        const std::set<chip_id_t>& chips_to_exclude,
+        std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& columns_to_exclude,
+        const std::string& fallback_tlb);
 
-  void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
-  void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
-  void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void l1_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
 
-  virtual void rolled_write_to_device(std::vector<uint32_t>& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use); // See Versim Implementation
-  virtual void read_from_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use);
+    virtual void rolled_write_to_device(
+        std::vector<uint32_t>& base_vec,
+        uint32_t unroll_count,
+        tt_cxy_pair core,
+        uint64_t base_addr,
+        const std::string& tlb_to_use);  // See Versim Implementation
+    virtual void read_from_device(
+        std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use);
 
-  virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c);
-  virtual bool using_harvested_soc_descriptors();
-  virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors();
-  virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors();
-  virtual bool noc_translation_en();
-  virtual std::set<chip_id_t> get_target_mmio_device_ids(); 
-  virtual std::set<chip_id_t> get_target_remote_device_ids();
-  virtual ~tt_emulation_device(); 
-  virtual tt_ClusterDescriptor* get_cluster_description();
-  virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_);
-  virtual int get_number_of_chips_in_cluster(); 
-  virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster(); 
-  static int detect_number_of_chips();  
-  virtual std::map<int, int> get_clocks(); 
-private:
-
-  tt_device_l1_address_params l1_address_params;
-  std::shared_ptr<tt_ClusterDescriptor> ndesc;
-  tt_device_dram_address_params dram_address_params;
-  
-  // zebu wrapper, provides interface to zebu emulator device through axi and command transactors
-  tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL;
+    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c);
+    virtual bool using_harvested_soc_descriptors();
+    virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors();
+    virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors();
+    virtual bool noc_translation_en();
+    virtual std::set<chip_id_t> get_target_mmio_device_ids();
+    virtual std::set<chip_id_t> get_target_remote_device_ids();
+    virtual ~tt_emulation_device();
+    virtual tt_ClusterDescriptor* get_cluster_description();
+    virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_);
+    virtual int get_number_of_chips_in_cluster();
+    virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster();
+    static int detect_number_of_chips();
+    virtual std::map<int, int> get_clocks();
 
+private:
+    tt_device_l1_address_params l1_address_params;
+    std::shared_ptr<tt_ClusterDescriptor> ndesc;
+    tt_device_dram_address_params dram_address_params;
 
+    // zebu wrapper, provides interface to zebu emulator device through axi and command transactors
+    tt_zebu_wrapper* tt_zebu_wrapper_inst = NULL;
 
-  // These functions implement the "protocol" between the RTL simulation and the UMD
-  void write(tt_cxy_pair core, uint64_t addr, const std::vector<uint8_t>& data);
-  std::vector<uint8_t> read(tt_cxy_pair core, uint64_t addr, uint32_t size);
-  
+    // These functions implement the "protocol" between the RTL simulation and the UMD
+    void write(tt_cxy_pair core, uint64_t addr, const std::vector<uint8_t>& data);
+    std::vector<uint8_t> read(tt_cxy_pair core, uint64_t addr, uint32_t size);
 };
-
diff --git a/device/simulation/deprecated/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp
index 33fc3c90..db9ba2cc 100644
--- a/device/simulation/deprecated/tt_emulation_stub.cpp
+++ b/device/simulation/deprecated/tt_emulation_stub.cpp
@@ -1,20 +1,18 @@
-#include <stdexcept>
 #include <cstring>
+#include <stdexcept>
 
 #include "common/logger.hpp"
 #include "tt_emulation_device.h"
 
 tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) {
-  throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n");
+    throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n");
 }
 
-
 tt_emulation_device::~tt_emulation_device() {}
-  
-void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector<uint8_t>& data) {}
 
-std::vector<uint8_t> tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};}
+void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector<uint8_t>& data) {}
 
+std::vector<uint8_t> tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { return {}; }
 
 void tt_emulation_device::start_device(const tt_device_params& device_params) {}
 
@@ -28,46 +26,93 @@ void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {}
 
 void tt_emulation_device::close_device() {}
 
-void tt_emulation_device::start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {}
-
-
-void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb) {} 
-void tt_emulation_device::rolled_write_to_device(std::vector<uint32_t>& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {}
-
-void tt_emulation_device::write_to_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}
-void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {};
-void tt_emulation_device::read_from_device(std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {}
-void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
-void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
-void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
-
+void tt_emulation_device::start(
+    std::vector<std::string> plusargs,
+    std::vector<std::string> dump_cores,
+    bool no_checkers,
+    bool /*init_device*/,
+    bool /*skip_driver_allocs*/) {}
+
+void tt_emulation_device::broadcast_write_to_cluster(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb) {}
+
+void tt_emulation_device::rolled_write_to_device(
+    std::vector<uint32_t>& base_vec,
+    uint32_t unroll_count,
+    tt_cxy_pair core,
+    uint64_t base_addr,
+    const std::string& tlb_to_use) {}
+
+void tt_emulation_device::write_to_device(
+    std::vector<uint32_t>& vec,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {}
+
+void tt_emulation_device::write_to_device(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write){};
+
+void tt_emulation_device::read_from_device(
+    std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {}
+
+void tt_emulation_device::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
+
+void tt_emulation_device::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
+
+void tt_emulation_device::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
 
 // -------------------------
 // Not sure how to implement these functions below, leaving them blank/default for now
 void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {
-  // No translation is performed
-  return;
+    // No translation is performed
+    return;
 }
+
 tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); }
 
-std::set<chip_id_t> tt_emulation_device::get_target_mmio_device_ids() {return {};}
+std::set<chip_id_t> tt_emulation_device::get_target_mmio_device_ids() { return {}; }
 
-std::set<chip_id_t> tt_emulation_device::get_target_remote_device_ids() {return {};}
+std::set<chip_id_t> tt_emulation_device::get_target_remote_device_ids() { return {}; }
 
 void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {}
+
 int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); }
-std::unordered_set<int> tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; }
+
+std::unordered_set<int> tt_emulation_device::get_all_chips_in_cluster() { return {0}; }
+
 int tt_emulation_device::detect_number_of_chips() { return 1; }
 
 bool tt_emulation_device::using_harvested_soc_descriptors() { return false; }
-bool tt_emulation_device::noc_translation_en() { return false; }
-std::unordered_map<chip_id_t, uint32_t> tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};}
-
-std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;}
 
-std::map<int, int> tt_emulation_device::get_clocks() {return std::map<int, int>();}
+bool tt_emulation_device::noc_translation_en() { return false; }
 
-void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {}
+std::unordered_map<chip_id_t, uint32_t> tt_emulation_device::get_harvesting_masks_for_soc_descriptors() {
+    return {{0, 0}};
+}
 
+std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_emulation_device::get_virtual_soc_descriptors() {
+    return soc_descriptor_per_chip;
+}
 
+std::map<int, int> tt_emulation_device::get_clocks() { return std::map<int, int>(); }
 
+void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {}
diff --git a/device/simulation/deprecated/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp
index e7ac7506..b59ffc08 100644
--- a/device/simulation/deprecated/tt_versim_device.cpp
+++ b/device/simulation/deprecated/tt_versim_device.cpp
@@ -2,16 +2,14 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
-
-#include "tt_device.h"
-#include "device/driver_atomics.h"
-#include "common/logger.hpp"
-#include <iostream>
 #include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
 
+#include "common/logger.hpp"
+#include "device/driver_atomics.h"
+#include "tt_device.h"
 #include "yaml-cpp/yaml.h"
 
 // TODO: Remove dependency on command_assembler + soc
@@ -19,112 +17,134 @@
 #include "device/tt_cluster_descriptor.h"
 namespace CA = CommandAssembler;
 
-
-void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) {
-  for (auto &core : soc_descriptor.cores) {
-    CA::SocNocNode node;
-    CA::xy_pair CA_coord(core.first.x, core.first.y);
-    node.noc_coord = CA_coord;
-    node.memory_size = core.second.l1_size;
-    switch (core.second.type) {
-      case CoreType::ARC: node.arc = true; break;
-      case CoreType::DRAM: {
-        node.dram = true; 
-        #ifdef EN_DRAM_ALIAS
-          node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first));
-        #endif
-      } break;
-      case CoreType::ETH: node.eth = true; break;
-      case CoreType::PCIE: node.pcie = true; break;
-      case CoreType::WORKER: node.worker = true; break;
-      case CoreType::HARVESTED: node.harvested = true; break;
-      case CoreType::ROUTER_ONLY: node.router_only = true; break;
-      default: std::cout << " Error: Unsupported CoreType type: " << static_cast<int>(core.second.type) << std::endl; break;
+void translate_soc_descriptor_to_ca_soc(CA::Soc& soc, const tt_SocDescriptor soc_descriptor) {
+    for (auto& core : soc_descriptor.cores) {
+        CA::SocNocNode node;
+        CA::xy_pair CA_coord(core.first.x, core.first.y);
+        node.noc_coord = CA_coord;
+        node.memory_size = core.second.l1_size;
+        switch (core.second.type) {
+            case CoreType::ARC:
+                node.arc = true;
+                break;
+            case CoreType::DRAM: {
+                node.dram = true;
+#ifdef EN_DRAM_ALIAS
+                node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first));
+#endif
+            } break;
+            case CoreType::ETH:
+                node.eth = true;
+                break;
+            case CoreType::PCIE:
+                node.pcie = true;
+                break;
+            case CoreType::WORKER:
+                node.worker = true;
+                break;
+            case CoreType::HARVESTED:
+                node.harvested = true;
+                break;
+            case CoreType::ROUTER_ONLY:
+                node.router_only = true;
+                break;
+            default:
+                std::cout << " Error: Unsupported CoreType type: " << static_cast<int>(core.second.type) << std::endl;
+                break;
+        }
+        soc.SetNodeProperties(node.noc_coord, node);
     }
-    soc.SetNodeProperties(node.noc_coord, node);
-  }
 }
 
 ////////
 // Device Versim
 ////////
 
+#include <command_assembler/xy_pair.h>
+
 #include "device.h"
 #include "sim_interactive.h"
-#include <command_assembler/xy_pair.h>
 
-tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) {
-  soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
-  std::set<chip_id_t> target_devices = {0};
-  if (ndesc_path == "") {
-    ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {});
-  } 
-  else {
-    ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path);
-  }
+tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) {
+    soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
+    std::set<chip_id_t> target_devices = {0};
+    if (ndesc_path == "") {
+        ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {});
+    } else {
+        ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path);
+    }
 }
 
-std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_VersimDevice::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;}
-
-tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();}
-void tt_VersimDevice::start_device(const tt_device_params &device_params) {
-  bool no_checkers = true;
-  std::vector<std::string> dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size);
-  start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false);
+std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_VersimDevice::get_virtual_soc_descriptors() {
+    return soc_descriptor_per_chip;
 }
 
-void tt_VersimDevice::close_device() {
-  stop();
+tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); }
+
+void tt_VersimDevice::start_device(const tt_device_params& device_params) {
+    bool no_checkers = true;
+    std::vector<std::string> dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0)->grid_size);
+    start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false);
 }
 
+void tt_VersimDevice::close_device() { stop(); }
+
 void tt_VersimDevice::start(
     std::vector<std::string> plusargs,
     std::vector<std::string> dump_cores,
     bool no_checkers,
     bool /*init_device*/,
     bool /*skip_driver_allocs*/
-    ) {
-
-     std::cout << "Start Versim Device " << std::endl;
-     std::string device_descriptor_dir = "./";
+) {
+    std::cout << "Start Versim Device " << std::endl;
+    std::string device_descriptor_dir = "./";
 
-     std::optional<std::string> vcd_suffix;
-     if (dump_cores.size() > 0) {
-       vcd_suffix = "core_dump.vcd";
-     }
+    std::optional<std::string> vcd_suffix;
+    if (dump_cores.size() > 0) {
+        vcd_suffix = "core_dump.vcd";
+    }
 
-     std::vector<std::string> vcd_cores;
+    std::vector<std::string> vcd_cores;
 
-     // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core
-     // interface. mainly bypasses arch_configs etc from llir.  We can populate soc directly
-     // MT: have to preserve ca_soc_descriptor object since versim references it at runtime
-     CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y);
-     // CA::Soc ca_soc_manager(CA_grid_size);
-     std::unique_ptr<CA::Soc> p_ca_soc_manager_unique = std::make_unique<CA::Soc>(CA_grid_size);
-     translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second));
-     // TODO: End
+    // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core
+    // interface. mainly bypasses arch_configs etc from llir.  We can populate soc directly
+    // MT: have to preserve ca_soc_descriptor object since versim references it at runtime
+    CA::xy_pair CA_grid_size(
+        (soc_descriptor_per_chip.begin()->second).grid_size.x, (soc_descriptor_per_chip.begin()->second).grid_size.y);
+    // CA::Soc ca_soc_manager(CA_grid_size);
+    std::unique_ptr<CA::Soc> p_ca_soc_manager_unique = std::make_unique<CA::Soc>(CA_grid_size);
+    translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin()->second));
+    // TODO: End
 
-     std::cout << "Versim Device: turn_on_device ";
-     std::vector<std::uint32_t> trisc_sizes = {static_cast<unsigned int>(l1_address_params.trisc0_size), static_cast<unsigned int>(l1_address_params.trisc1_size), static_cast<unsigned int>(l1_address_params.trisc2_size)};
-     std::unique_ptr<versim::VersimSimulator> versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers,
-        l1_address_params.trisc_base, trisc_sizes);
-     versim = versim_unique.release();
+    std::cout << "Versim Device: turn_on_device ";
+    std::vector<std::uint32_t> trisc_sizes = {
+        static_cast<unsigned int>(l1_address_params.trisc0_size),
+        static_cast<unsigned int>(l1_address_params.trisc1_size),
+        static_cast<unsigned int>(l1_address_params.trisc2_size)};
+    std::unique_ptr<versim::VersimSimulator> versim_unique = versim::turn_on_device(
+        CA_grid_size,
+        *p_ca_soc_manager_unique,
+        plusargs,
+        vcd_suffix,
+        dump_cores,
+        no_checkers,
+        l1_address_params.trisc_base,
+        trisc_sizes);
+    versim = versim_unique.release();
 
-     std::cout << "Versim Device: write info to tvm db " << std::endl;
-     versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes);
-     versim::build_and_connect_tvm_phase();
+    std::cout << "Versim Device: write info to tvm db " << std::endl;
+    versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes);
+    versim::build_and_connect_tvm_phase();
 
-     versim->spin_threads(*p_ca_soc_manager_unique, false);
-     versim::assert_reset(*versim);
+    versim->spin_threads(*p_ca_soc_manager_unique, false);
+    versim::assert_reset(*versim);
 
-     p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release());
+    p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release());
 
-     std::cout << "Versim Device: Done start " << std::endl;
+    std::cout << "Versim Device: Done start " << std::endl;
 }
 
-tt_VersimDevice::~tt_VersimDevice () {
-  ndesc.reset();
-}
+tt_VersimDevice::~tt_VersimDevice() { ndesc.reset(); }
 
 // bool tt_VersimDevice::run() {
 //   std::cout << "Versim Device: Run " << std::endl;
@@ -136,165 +156,218 @@ tt_VersimDevice::~tt_VersimDevice () {
 // }
 
 void tt_VersimDevice::deassert_risc_reset() {
-  std::cout << "Versim Device: Deassert risc resets start" << std::endl;
-  versim::handle_resetting_triscs(*versim);
-  std::cout << "Versim Device: Start main loop " << std::endl;
-  versim::startup_versim_main_loop(*versim);
+    std::cout << "Versim Device: Deassert risc resets start" << std::endl;
+    versim::handle_resetting_triscs(*versim);
+    std::cout << "Versim Device: Start main loop " << std::endl;
+    versim::startup_versim_main_loop(*versim);
 }
 
 void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {
-  // This function deasserts reset on the full versim device (don't need core level granularity for versim)
- deassert_risc_reset();
+    // This function deasserts reset on the full versim device (don't need core level granularity for versim)
+    deassert_risc_reset();
 }
 
 void tt_VersimDevice::assert_risc_reset() {
-  std::cout << "Pause all the cores" << std::endl;
-  versim::pause(*versim);
+    std::cout << "Pause all the cores" << std::endl;
+    versim::pause(*versim);
 
-  std::cout << "Wait for cores to go to paused state" << std::endl;
-  versim::sleep_wait_for_paused (*versim);
+    std::cout << "Wait for cores to go to paused state" << std::endl;
+    versim::sleep_wait_for_paused(*versim);
 
-  std::cout << "Assert riscv reset" << std::endl;
-  versim::assert_riscv_reset(*versim);
+    std::cout << "Assert riscv reset" << std::endl;
+    versim::assert_riscv_reset(*versim);
 }
 
 void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {
-  // This function asserts reset on the full versim device (don't need core level granularity for versim)
- assert_risc_reset();
-}
-
-void tt_VersimDevice::rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
-  uint32_t byte_increment = vec.size() * 4; 
-  for (int i=0; i<unroll_count; i++) {
-      vec[0] = i; // slot id for debug
-      write_to_device(vec, core, addr + i * byte_increment, tlb_to_use);
-  }
-}
-
-void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {
-  std::vector<std::uint32_t> mem_vector(mem_ptr, mem_ptr + len);
-  rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb);
+    // This function asserts reset on the full versim device (don't need core level granularity for versim)
+    assert_risc_reset();
 }
 
-void tt_VersimDevice::write_to_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
-  
-  log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr);
-
-  bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM;
-  // MT: Remove these completely
-  CommandAssembler::xy_pair CA_target(core.x, core.y);
-  CommandAssembler::memory CA_tensor_memory(addr, vec);
-
-  nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory);
+void tt_VersimDevice::rolled_write_to_device(
+    std::vector<uint32_t>& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
+    uint32_t byte_increment = vec.size() * 4;
+    for (int i = 0; i < unroll_count; i++) {
+        vec[0] = i;  // slot id for debug
+        write_to_device(vec, core, addr + i * byte_increment, tlb_to_use);
+    }
 }
 
-void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {
-  log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!");
-
-  std::vector<std::uint32_t> mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t));
-  write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write);
+void tt_VersimDevice::rolled_write_to_device(
+    uint32_t* mem_ptr,
+    uint32_t len,
+    uint32_t unroll_count,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& fallback_tlb) {
+    std::vector<std::uint32_t> mem_vector(mem_ptr, mem_ptr + len);
+    rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb);
+}
+
+void tt_VersimDevice::write_to_device(
+    std::vector<uint32_t>& vec,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {
+    log_debug(
+        tt::LogSiliconDriver,
+        "Versim Device ({}): Write vector at target core {}, address: {}",
+        get_sim_time(*versim),
+        core.str(),
+        addr);
+
+    bool aligned_32B = (soc_descriptor_per_chip.begin()->second).cores.at(core).type == CoreType::DRAM;
+    // MT: Remove these completely
+    CommandAssembler::xy_pair CA_target(core.x, core.y);
+    CommandAssembler::memory CA_tensor_memory(addr, vec);
+
+    nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory);
+}
+
+void tt_VersimDevice::write_to_device(
+    const void* mem_ptr,
+    uint32_t size,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {
+    log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!");
+
+    std::vector<std::uint32_t> mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t));
+    write_to_device(
+        mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write);
+}
+
+void tt_VersimDevice::broadcast_write_to_cluster(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb) {
+    for (const auto& core : get_soc_descriptor(0)->cores) {
+        if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and
+            rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) {
+            write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
+        }
+    }
 }
 
-void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb) {
-  for(const auto& core : get_soc_descriptor(0) -> cores) {
-    if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) {
-        write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, "");
-      }
-  }
-}
 void tt_VersimDevice::wait_for_non_mmio_flush() {
-  // Do nothing, since Versim does not simulate non-mmio mapped chips
+    // Do nothing, since Versim does not simulate non-mmio mapped chips
 }
 
-void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
-  tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this
+void tt_VersimDevice::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
+    tt_driver_atomics::mfence();  // Ensure no reordering of loads/stores around this
 }
 
-void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
-  tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this
+void tt_VersimDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
+    tt_driver_atomics::mfence();  // Ensure no reordering of loads/stores around this
 }
 
-void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& dram_cores) {
-  tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this
+void tt_VersimDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& dram_cores) {
+    tt_driver_atomics::mfence();  // Ensure no reordering of loads/stores around this
 }
 
-void tt_VersimDevice::read_from_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {
-  log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size);
+void tt_VersimDevice::read_from_device(
+    std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {
+    log_debug(
+        tt::LogSiliconDriver,
+        "Versim Device ({}): Read vector from address: {}, with size: {} Bytes",
+        get_sim_time(*versim),
+        addr,
+        size);
 
-  CommandAssembler::xy_pair CA_target(core.x, core.y);
+    CommandAssembler::xy_pair CA_target(core.x, core.y);
 
-  size_t size_in_words = size / 4;
-  auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words);
-  vec = result;
+    size_t size_in_words = size / 4;
+    auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words);
+    vec = result;
 }
 
-void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {
-  log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size);
-  log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!");
+void tt_VersimDevice::read_from_device(
+    void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {
+    log_debug(
+        tt::LogSiliconDriver,
+        "Versim Device ({}): Read vector from address: {}, with size: {} Bytes",
+        get_sim_time(*versim),
+        addr,
+        size);
+    log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!");
 
-  CommandAssembler::xy_pair CA_target(core.x, core.y);
+    CommandAssembler::xy_pair CA_target(core.x, core.y);
 
-  size_t size_in_words = size / 4;
-  auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words);
-  memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t));
+    size_t size_in_words = size / 4;
+    auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words);
+    memcpy(mem_ptr, result.data(), result.size() * sizeof(uint32_t));
 }
 
-void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {
-  // No translation is performed
-  return;
+void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {
+    // No translation is performed
+    return;
 }
 
 std::set<chip_id_t> tt_VersimDevice::get_target_mmio_device_ids() {
-  // Must only be used for silicon
-  return {};
+    // Must only be used for silicon
+    return {};
 }
 
 std::set<chip_id_t> tt_VersimDevice::get_target_remote_device_ids() {
-  // Must only be used for silicon
-  return {};
+    // Must only be used for silicon
+    return {};
 }
 
-
-bool versim_check_dram_core_exists(const std::vector<std::vector<tt_xy_pair>> &dram_core_channels, tt_xy_pair target_core) {
+bool versim_check_dram_core_exists(
+    const std::vector<std::vector<tt_xy_pair>>& dram_core_channels, tt_xy_pair target_core) {
     bool dram_core_exists = false;
-    for (const auto &dram_cores_in_channel: dram_core_channels) {
-      for (const auto &dram_core : dram_cores_in_channel) {
-        if (dram_core.x == target_core.x && dram_core.y == target_core.y) {
-            return true;
+    for (const auto& dram_cores_in_channel : dram_core_channels) {
+        for (const auto& dram_core : dram_cores_in_channel) {
+            if (dram_core.x == target_core.x && dram_core.y == target_core.y) {
+                return true;
+            }
         }
-      }
     }
     return false;
 }
 
 int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); }
+
 std::unordered_set<int> tt_VersimDevice::get_all_chips_in_cluster() { return {0}; }
+
 int tt_VersimDevice::detect_number_of_chips() { return 1; }
 
 bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; }
+
 bool tt_VersimDevice::noc_translation_en() { return false; }
-std::unordered_map<chip_id_t, uint32_t> tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};}
+
+std::unordered_map<chip_id_t, uint32_t> tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; }
 
 // Meant to breakout running functions for simulator
 bool tt_VersimDevice::stop() {
-  std::cout << "Versim Device: Stop " << std::endl;
-
-  versim::turn_off_device(*versim);
-  versim->shutdown();
-  // Force free of all versim cores
-  for (auto x = 0; x < versim->grid_size.x; x++) {
-    for (auto y = 0; y < versim->grid_size.y; y++) {
-      delete versim->core_grid.at(x).at(y);
+    std::cout << "Versim Device: Stop " << std::endl;
+
+    versim::turn_off_device(*versim);
+    versim->shutdown();
+    // Force free of all versim cores
+    for (auto x = 0; x < versim->grid_size.x; x++) {
+        for (auto y = 0; y < versim->grid_size.y; y++) {
+            delete versim->core_grid.at(x).at(y);
+        }
     }
-  }
-  std::cout << "Versim Device: Stop completed " << std::endl;
-  delete versim;
-  return true;
+    std::cout << "Versim Device: Stop completed " << std::endl;
+    delete versim;
+    return true;
 }
 
-std::map<int,int> tt_VersimDevice::get_clocks() {
-  return std::map<int,int>();
-}
+std::map<int, int> tt_VersimDevice::get_clocks() { return std::map<int, int>(); }
 
 void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {
     l1_address_params = l1_address_params_;
@@ -305,11 +378,11 @@ void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_addres
 }
 
 std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {
-    return get_soc_descriptor(device_id) -> get_num_dram_channels();
+    return get_soc_descriptor(device_id)->get_num_dram_channels();
 }
 
 std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {
-    return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now
+    return get_soc_descriptor(device_id)->dram_bank_size;  // Space per channel is identical for now
 }
 
 std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {
diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h
index 087b7336..5cfb1ea7 100644
--- a/device/simulation/deprecated/tt_versim_device.h
+++ b/device/simulation/deprecated/tt_versim_device.h
@@ -11,42 +11,91 @@
 #include "tt_xy_pair.h"
 
 class c_versim_core;
-namespace nuapi {namespace device {template <typename, typename>class Simulator;}}
-namespace versim {
-  struct VersimSimulatorState;
-  using VersimSimulator = nuapi::device::Simulator<c_versim_core *, VersimSimulatorState>;
+
+namespace nuapi {
+namespace device {
+template <typename, typename>
+class Simulator;
 }
+}  // namespace nuapi
+
+namespace versim {
+struct VersimSimulatorState;
+using VersimSimulator = nuapi::device::Simulator<c_versim_core*, VersimSimulatorState>;
+}  // namespace versim
 
 /**
  * @brief Versim Backend Class, derived from the tt_device class
  * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device.
-*/ 
-class tt_VersimDevice: public tt_device
-{
-    public:
+ */
+class tt_VersimDevice : public tt_device {
+public:
     virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_);
     virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_);
-    tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path);
+    tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path);
     virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors();
-    virtual void start(std::vector<std::string> plusargs, std::vector<std::string> dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs);
-    virtual void start_device(const tt_device_params &device_params);
+    virtual void start(
+        std::vector<std::string> plusargs,
+        std::vector<std::string> dump_cores,
+        bool no_checkers,
+        bool init_device,
+        bool skip_driver_allocs);
+    virtual void start_device(const tt_device_params& device_params);
     virtual void close_device();
     virtual void deassert_risc_reset();
     virtual void deassert_risc_reset_at_core(tt_cxy_pair core);
     virtual void assert_risc_reset();
     virtual void assert_risc_reset_at_core(tt_cxy_pair core);
-    virtual void write_to_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false);
-    virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb);
-    virtual void rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use);
-    virtual void read_from_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use);
-    virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb);
-    virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false);
-    virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); 
+    virtual void write_to_device(
+        std::vector<uint32_t>& vec,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& tlb_to_use,
+        bool send_epoch_cmd = false,
+        bool last_send_epoch_cmd = true,
+        bool ordered_with_prev_remote_write = false);
+    virtual void broadcast_write_to_cluster(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        uint64_t address,
+        const std::set<chip_id_t>& chips_to_exclude,
+        std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& columns_to_exclude,
+        const std::string& fallback_tlb);
+    virtual void rolled_write_to_device(
+        std::vector<uint32_t>& vec,
+        uint32_t unroll_count,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& tlb_to_use);
+    virtual void read_from_device(
+        std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use);
+    virtual void rolled_write_to_device(
+        uint32_t* mem_ptr,
+        uint32_t size_in_bytes,
+        uint32_t unroll_count,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& fallback_tlb);
+    virtual void write_to_device(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        tt_cxy_pair core,
+        uint64_t addr,
+        const std::string& tlb_to_use,
+        bool send_epoch_cmd = false,
+        bool last_send_epoch_cmd = true,
+        bool ordered_with_prev_remote_write = false);
+    virtual void read_from_device(
+        void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use);
     virtual void wait_for_non_mmio_flush();
-    void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
-    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c);
+    void l1_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c);
     virtual bool using_harvested_soc_descriptors();
     virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors();
     virtual bool noc_translation_en();
@@ -57,12 +106,13 @@ class tt_VersimDevice: public tt_device
     virtual int get_number_of_chips_in_cluster();
     virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster();
     static int detect_number_of_chips();
-    virtual std::map<int,int> get_clocks();
+    virtual std::map<int, int> get_clocks();
     virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id);
     virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel);
     virtual std::uint32_t get_num_host_channels(std::uint32_t device_id);
     virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel);
-    private:
+
+private:
     bool stop();
     tt_device_l1_address_params l1_address_params;
     tt_device_dram_address_params dram_address_params;
diff --git a/device/simulation/deprecated/tt_versim_stub.cpp b/device/simulation/deprecated/tt_versim_stub.cpp
index 27c69f80..1a0e5cc3 100644
--- a/device/simulation/deprecated/tt_versim_stub.cpp
+++ b/device/simulation/deprecated/tt_versim_stub.cpp
@@ -2,19 +2,18 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
-#include "tt_device.h"
-
-#include <iostream>
 #include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
 
-tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) {
-  throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n");
+#include "tt_device.h"
+
+tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) {
+    throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n");
 }
 
-tt_VersimDevice::~tt_VersimDevice () {}
+tt_VersimDevice::~tt_VersimDevice() {}
 
 std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_VersimDevice::get_virtual_soc_descriptors() {
     throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n");
@@ -22,23 +21,71 @@ std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_VersimDevice::get_virtual_so
 }
 
 int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); }
+
 std::unordered_set<int> tt_VersimDevice::get_all_chips_in_cluster() { return {}; }
+
 int tt_VersimDevice::detect_number_of_chips() { return 0; }
 
-void tt_VersimDevice::start_device(const tt_device_params &device_params) {}
+void tt_VersimDevice::start_device(const tt_device_params& device_params) {}
+
 void tt_VersimDevice::close_device() {}
-void tt_VersimDevice::write_to_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}
-void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb) {}
-void tt_VersimDevice::read_from_device(std::vector<uint32_t> &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {}
-void tt_VersimDevice::rolled_write_to_device(std::vector<uint32_t> &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {}
-void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}
-void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {}
-void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {}
+
+void tt_VersimDevice::write_to_device(
+    std::vector<uint32_t>& vec,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {}
+
+void tt_VersimDevice::broadcast_write_to_cluster(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb) {}
+
+void tt_VersimDevice::read_from_device(
+    std::vector<uint32_t>& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {}
+
+void tt_VersimDevice::rolled_write_to_device(
+    std::vector<uint32_t>& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
+}
+
+void tt_VersimDevice::write_to_device(
+    const void* mem_ptr,
+    uint32_t len,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& tlb_to_use,
+    bool send_epoch_cmd,
+    bool last_send_epoch_cmd,
+    bool ordered_with_prev_remote_write) {}
+
+void tt_VersimDevice::read_from_device(
+    void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {}
+
+void tt_VersimDevice::rolled_write_to_device(
+    uint32_t* mem_ptr,
+    uint32_t len,
+    uint32_t unroll_count,
+    tt_cxy_pair core,
+    uint64_t addr,
+    const std::string& fallback_tlb) {}
+
 void tt_VersimDevice::wait_for_non_mmio_flush() {}
 
-void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
-void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
-void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& dram_cores) {}
+void tt_VersimDevice::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
+
+void tt_VersimDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
+
+void tt_VersimDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& dram_cores) {}
 
 void tt_VersimDevice::start(
     std::vector<std::string> plusargs,
@@ -49,36 +96,48 @@ void tt_VersimDevice::start(
 ) {}
 
 void tt_VersimDevice::deassert_risc_reset() {}
+
 void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {}
+
 void tt_VersimDevice::assert_risc_reset() {}
+
 void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {}
 
-void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {};
+void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {}
+
 // void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {}
 
-std::set<chip_id_t> tt_VersimDevice::get_target_mmio_device_ids() {return {};}
-std::set<chip_id_t> tt_VersimDevice::get_target_remote_device_ids() {return {};}
+std::set<chip_id_t> tt_VersimDevice::get_target_mmio_device_ids() { return {}; }
+
+std::set<chip_id_t> tt_VersimDevice::get_target_remote_device_ids() { return {}; }
 
 bool versim_check_dram_core_exists(
-    const std::vector<std::vector<tt_xy_pair>> &dram_core_channels, tt_xy_pair target_core) {
-  return false;
+    const std::vector<std::vector<tt_xy_pair>>& dram_core_channels, tt_xy_pair target_core) {
+    return false;
 }
 
 bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; }
+
 bool tt_VersimDevice::noc_translation_en() { return false; }
-std::unordered_map<chip_id_t, uint32_t> tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map<chip_id_t, uint32_t>();}
+
+std::unordered_map<chip_id_t, uint32_t> tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() {
+    return std::unordered_map<chip_id_t, uint32_t>();
+}
 
 bool tt_VersimDevice::stop() { return true; }
 
 void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {}
+
 void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {}
 
-std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;}
-std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;}
-std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;}
-std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;}
+std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { return 0; }
+
+std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; }
+
+std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { return 0; }
 
-std::map<int,int> tt_VersimDevice::get_clocks() {return std::map<int,int>();}
+std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; }
 
-tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();}
+std::map<int, int> tt_VersimDevice::get_clocks() { return std::map<int, int>(); }
 
+tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); }
diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp
index 9b0457d4..59bb2dfd 100644
--- a/device/simulation/tt_simulation_device.cpp
+++ b/device/simulation/tt_simulation_device.cpp
@@ -4,43 +4,44 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <iostream>
+#include "tt_simulation_device.h"
+
+#include <nng/nng.h>
+#include <uv.h>
+
 #include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
 
-#include <uv.h>
-#include <nng/nng.h>
-
-#include "common/logger.hpp"
 #include "common/assert.hpp"
+#include "common/logger.hpp"
 #include "device/driver_atomics.h"
 #include "device/tt_cluster_descriptor.h"
-
-#include "tt_simulation_device.h"
 #include "tt_simulation_device_generated.h"
 
-flatbuffers::FlatBufferBuilder create_flatbuffer(DEVICE_COMMAND rw, std::vector<uint32_t> vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_=0){
+flatbuffers::FlatBufferBuilder create_flatbuffer(
+    DEVICE_COMMAND rw, std::vector<uint32_t> vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_ = 0) {
     flatbuffers::FlatBufferBuilder builder;
     auto data = builder.CreateVector(vec);
     auto core = tt_vcs_core(core_.x, core_.y);
-    uint64_t size = size_ == 0 ? size = vec.size()*sizeof(uint32_t) : size = size_;
+    uint64_t size = size_ == 0 ? vec.size() * sizeof(uint32_t) : size_;  // size_ == 0: use vec's byte count
     auto device_cmd = CreateDeviceRequestResponse(builder, rw, data, &core, addr, size);
     builder.Finish(device_cmd);
     return builder;
 }
 
-void print_flatbuffer(const DeviceRequestResponse *buf){    
+void print_flatbuffer(const DeviceRequestResponse* buf) {
     std::vector<uint32_t> data_vec(buf->data()->begin(), buf->data()->end());
     uint64_t addr = buf->address();
     uint32_t size = buf->size();
     tt_cxy_pair core = {0, buf->core()->x(), buf->core()->y()};
-    
+
     std::stringstream ss;
     ss << std::hex << reinterpret_cast<uintptr_t>(addr);
     std::string addr_hex = ss.str();
     log_info(tt::LogEmulationDriver, "{} bytes @ address {} in core ({}, {})", size, addr_hex, core.x, core.y);
-    for(int i = 0; i < data_vec.size(); i++){
+    for (size_t i = 0; i < data_vec.size(); i++) {  // unsigned index: matches data_vec.size()
         std::ios_base::fmtflags save = std::cout.flags();
         std::cout << "0x" << std::hex << std::setw(8) << std::setfill('0') << data_vec[i] << " ";
         std::cout.flags(save);
@@ -48,14 +49,14 @@ void print_flatbuffer(const DeviceRequestResponse *buf){
     std::cout << std::endl;
 }
 
-tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(sdesc_path){
+tt_SimulationDevice::tt_SimulationDevice(const std::string& sdesc_path) : tt_device(sdesc_path) {
     log_info(tt::LogEmulationDriver, "Instantiating simulation device");
     soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path));
     std::set<chip_id_t> target_devices = {0};
-    
+
     // Start VCS simulator in a separate process
     TT_ASSERT(std::getenv("TT_REMOTE_EXE"), "TT_REMOTE_EXE not set, please provide path to the VCS binary");
-    uv_loop_t *loop = uv_default_loop();
+    uv_loop_t* loop = uv_default_loop();
     uv_process_t child_p;
     uv_process_options_t child_options = {0};
 
@@ -69,14 +70,12 @@ tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_dev
         log_info(tt::LogEmulationDriver, "Simulator process spawned with PID: {}", child_p.pid);
     }
 
-    uv_unref((uv_handle_t *) &child_p);
+    uv_unref(reinterpret_cast<uv_handle_t*>(&child_p));
     uv_run(loop, UV_RUN_DEFAULT);
     uv_loop_close(loop);
 }
 
-tt_SimulationDevice::~tt_SimulationDevice() {
-    close_device();
-}
+tt_SimulationDevice::~tt_SimulationDevice() { close_device(); }
 
 // Setup/Teardown Functions
 std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_SimulationDevice::get_virtual_soc_descriptors() {
@@ -99,11 +98,11 @@ void tt_SimulationDevice::set_driver_eth_interface_params(const tt_driver_eth_in
     eth_interface_params = eth_interface_params_;
 }
 
-void tt_SimulationDevice::start_device(const tt_device_params &device_params) {
-    void *buf_ptr = nullptr;
+void tt_SimulationDevice::start_device(const tt_device_params& device_params) {
+    void* buf_ptr = nullptr;
 
     host.start_host();
-    
+
     log_info(tt::LogEmulationDriver, "Waiting for ack msg from remote...");
     size_t buf_size = host.recv_from_device(&buf_ptr);
     auto buf = GetDeviceRequestResponse(buf_ptr);
@@ -114,8 +113,9 @@ void tt_SimulationDevice::start_device(const tt_device_params &device_params) {
 
 void tt_SimulationDevice::assert_risc_reset() {
     log_info(tt::LogEmulationDriver, "Sending assert_risc_reset signal..");
-    auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector<uint32_t>(1, 0), {0, 0, 0}, 0);
-    uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer();
+    auto wr_buffer =
+        create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector<uint32_t>(1, 0), {0, 0, 0}, 0);
+    uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer();
     size_t wr_buffer_size = wr_buffer.GetSize();
 
     print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr));
@@ -124,20 +124,25 @@ void tt_SimulationDevice::assert_risc_reset() {
 
 void tt_SimulationDevice::deassert_risc_reset() {
     log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset' signal..");
-    auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector<uint32_t>(1, 0), {0, 0, 0}, 0);
-    uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer();
+    auto wr_buffer =
+        create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector<uint32_t>(1, 0), {0, 0, 0}, 0);
+    uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer();
     size_t wr_buffer_size = wr_buffer.GetSize();
 
     host.send_to_device(wr_buffer_ptr, wr_buffer_size);
 }
 
 void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {
-    log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)");
+    log_info(
+        tt::LogEmulationDriver,
+        "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)");
     deassert_risc_reset();
 }
 
 void tt_SimulationDevice::assert_risc_reset_at_core(tt_cxy_pair core) {
-    log_info(tt::LogEmulationDriver, "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)");
+    log_info(
+        tt::LogEmulationDriver,
+        "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)");
     assert_risc_reset();
 }
 
@@ -149,19 +154,21 @@ void tt_SimulationDevice::close_device() {
 }
 
 // Runtime Functions
-void tt_SimulationDevice::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
+void tt_SimulationDevice::write_to_device(
+    const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
     log_info(tt::LogEmulationDriver, "Device writing");
     std::vector<std::uint32_t> data((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size_in_bytes / sizeof(uint32_t));
     auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_WRITE, data, core, addr);
-    uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer();
+    uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer();
     size_t wr_buffer_size = wr_buffer.GetSize();
-    
-    print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print
+
+    print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr));  // sanity print
     host.send_to_device(wr_buffer_ptr, wr_buffer_size);
 }
 
-void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
-    void *rd_resp;
+void tt_SimulationDevice::read_from_device(
+    void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
+    void* rd_resp;
 
     // Send read request
     auto rd_req_buf = create_flatbuffer(DEVICE_COMMAND_READ, {0}, core, addr, size);
@@ -171,50 +178,49 @@ void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint
     size_t rd_rsp_sz = host.recv_from_device(&rd_resp);
 
     auto rd_resp_buf = GetDeviceRequestResponse(rd_resp);
-    if (addr != 0x40){
+    if (addr != 0x40) {
         log_info(tt::LogEmulationDriver, "Device reading vec");
-        print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam
+        print_flatbuffer(rd_resp_buf);  // 0x40 is host polling device, don't print since it'll spam
     }
     std::memcpy(mem_ptr, rd_resp_buf->data()->data(), rd_resp_buf->data()->size() * sizeof(uint32_t));
     nng_free(rd_resp, rd_rsp_sz);
 }
 
 void tt_SimulationDevice::wait_for_non_mmio_flush() {}
+
 void tt_SimulationDevice::wait_for_non_mmio_flush(const chip_id_t chip) {}
-void tt_SimulationDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
-void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
-void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
+
+void tt_SimulationDevice::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
+
+void tt_SimulationDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {}
+
+void tt_SimulationDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {}
 
 // Misc. Functions to Query/Set Device State
 std::unordered_map<chip_id_t, uint32_t> tt_SimulationDevice::get_harvesting_masks_for_soc_descriptors() {
     return {{0, 0}};
 }
 
-std::vector<chip_id_t> tt_SimulationDevice::detect_available_device_ids() {
-    return {0};
-}
+std::vector<chip_id_t> tt_SimulationDevice::detect_available_device_ids() { return {0}; }
 
-std::set<chip_id_t> tt_SimulationDevice::get_target_remote_device_ids() {
-    return target_remote_chips;
-}
+std::set<chip_id_t> tt_SimulationDevice::get_target_remote_device_ids() { return target_remote_chips; }
 
-std::map<int,int> tt_SimulationDevice::get_clocks() {
-    return {{0, 0}};
-}
+std::map<int, int> tt_SimulationDevice::get_clocks() { return {{0, 0}}; }
 
-void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
+void* tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
     return nullptr;
 }
 
 std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const {
-    if(arch_name == tt::ARCH::WORMHOLE_B0) {
+    if (arch_name == tt::ARCH::WORMHOLE_B0) {
         return 0x800000000;
-    }
-    else if (arch_name == tt::ARCH::BLACKHOLE) {
+    } else if (arch_name == tt::ARCH::BLACKHOLE) {
         // Enable 4th ATU window.
         return 1ULL << 60;
-    }
-    else {
+    } else {
         return 0;
     }
 }
@@ -224,12 +230,11 @@ std::uint32_t tt_SimulationDevice::get_num_dram_channels(std::uint32_t device_id
 }
 
 std::uint64_t tt_SimulationDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {
-    return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now
+    return get_soc_descriptor(device_id).dram_bank_size;  // Space per channel is identical for now
 }
 
-std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) {
-    return 1;
-}
+std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { return 1; }
+
+std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; }
 
-std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;}
-std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {return 0;}
+std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { return 0; }
diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h
index 95bdec82..a5eb0c85 100644
--- a/device/simulation/tt_simulation_device.h
+++ b/device/simulation/tt_simulation_device.h
@@ -10,23 +10,23 @@
 #include <fstream>
 #include <vector>
 
-#include "device/tt_device.h"
 #include "device/simulation/tt_simulation_host.hpp"
+#include "device/tt_device.h"
 
-class tt_SimulationDevice: public tt_device {
-    public:
-    tt_SimulationDevice(const std::string &sdesc_path);
+class tt_SimulationDevice : public tt_device {
+public:
+    tt_SimulationDevice(const std::string& sdesc_path);
     ~tt_SimulationDevice();
 
     tt_SimulationHost host;
 
-    //Setup/Teardown Functions
+    // Setup/Teardown Functions
     virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors();
     virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_);
     virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_);
     virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_);
     virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_);
-    virtual void start_device(const tt_device_params &device_params);
+    virtual void start_device(const tt_device_params& device_params);
     virtual void assert_risc_reset();
     virtual void deassert_risc_reset();
     virtual void deassert_risc_reset_at_core(tt_cxy_pair core);
@@ -34,22 +34,27 @@ class tt_SimulationDevice: public tt_device {
     virtual void close_device();
 
     // Runtime Functions
-    virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use);
-    virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
+    virtual void write_to_device(
+        const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use);
+    virtual void read_from_device(
+        void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
 
     virtual void wait_for_non_mmio_flush();
     virtual void wait_for_non_mmio_flush(const chip_id_t chip);
-    void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void l1_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
 
     // Misc. Functions to Query/Set Device State
     // virtual bool using_harvested_soc_descriptors();
     virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors();
     static std::vector<chip_id_t> detect_available_device_ids();
     virtual std::set<chip_id_t> get_target_remote_device_ids();
-    virtual std::map<int,int> get_clocks();
-    virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const;
+    virtual std::map<int, int> get_clocks();
+    virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const;
     virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const;
     virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id);
     virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel);
@@ -57,7 +62,7 @@ class tt_SimulationDevice: public tt_device {
     virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel);
     virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id);
 
-    private:
+private:
     // State variables
     tt_device_dram_address_params dram_address_params;
     tt_device_l1_address_params l1_address_params;
diff --git a/device/simulation/tt_simulation_host.cpp b/device/simulation/tt_simulation_host.cpp
index ed9cf7e9..309bb7be 100644
--- a/device/simulation/tt_simulation_host.cpp
+++ b/device/simulation/tt_simulation_host.cpp
@@ -2,19 +2,20 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include <typeinfo>
-#include <sstream>
-#include <iomanip>
-#include <filesystem>
-#include <cassert>
-#include <cstdlib>
+#include "tt_simulation_host.hpp"
 
 #include <nng/nng.h>
 #include <nng/protocol/pair1/pair.h>
 
-#include "common/logger.hpp"
+#include <cassert>
+#include <cstdlib>
+#include <filesystem>
+#include <iomanip>
+#include <sstream>
+#include <typeinfo>
+
 #include "common/assert.hpp"
-#include "tt_simulation_host.hpp"
+#include "common/logger.hpp"
 
 tt_SimulationHost::tt_SimulationHost() {
     // Initialize socket and dialer
@@ -64,7 +65,7 @@ void tt_SimulationHost::start_host() {
 void tt_SimulationHost::send_to_device(uint8_t *buf, size_t buf_size) {
     int rv;
     log_debug(tt::LogEmulationDriver, "Sending messsage to remote..");
-    
+
     void *msg = nng_alloc(buf_size);
     std::memcpy(msg, buf, buf_size);
 
diff --git a/device/simulation/tt_simulation_host.hpp b/device/simulation/tt_simulation_host.hpp
index 6de18a04..26897a44 100644
--- a/device/simulation/tt_simulation_host.hpp
+++ b/device/simulation/tt_simulation_host.hpp
@@ -1,9 +1,9 @@
 // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-#include <vector>
 #include <cstdint>
 #include <memory>
+#include <vector>
 
 #include "device/tt_xy_pair.h"
 
@@ -20,6 +20,7 @@ class tt_SimulationHost {
     void start_host();
     void send_to_device(uint8_t *buf, size_t buf_size);
     size_t recv_from_device(void **data_ptr);
+
 private:
     std::unique_ptr<nng_socket> host_socket;
     std::unique_ptr<nng_dialer> host_dialer;
diff --git a/device/tlb.h b/device/tlb.h
index 3e8fb826..30094202 100644
--- a/device/tlb.h
+++ b/device/tlb.h
@@ -8,8 +8,8 @@
 
 #include <cstdint>
 #include <optional>
-#include <utility>
 #include <stdexcept>
+#include <utility>
 
 namespace tt::umd {
 
@@ -41,10 +41,10 @@ struct tlb_data {
 
     // Orderings
     static constexpr uint64_t Relaxed = 0;
-    static constexpr uint64_t Strict  = 1;
-    static constexpr uint64_t Posted  = 2;
+    static constexpr uint64_t Strict = 1;
+    static constexpr uint64_t Posted = 2;
 
-    bool check(const tlb_offsets & offset) const;
+    bool check(const tlb_offsets &offset) const;
     std::pair<std::uint64_t, std::uint64_t> apply_offset(const tlb_offsets &offset) const;
 };
 
diff --git a/device/tt_arch_types.h b/device/tt_arch_types.h
index 8a7c5dba..c165bf1b 100644
--- a/device/tt_arch_types.h
+++ b/device/tt_arch_types.h
@@ -17,4 +17,4 @@ enum class ARCH {
     BLACKHOLE = 3,
     Invalid = 0xFF,
 };
-}
+}  // namespace tt
diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp
index 558fb0ab..8c1472f1 100644
--- a/device/tt_cluster_descriptor.cpp
+++ b/device/tt_cluster_descriptor.cpp
@@ -2,22 +2,23 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
 #include "tt_cluster_descriptor.h"
 
 #include <fstream>
 #include <memory>
-#include <sstream> 
+#include <sstream>
 
 #include "common/logger.hpp"
-#include "yaml-cpp/yaml.h"
-
 #include "fmt/core.h"
+#include "yaml-cpp/yaml.h"
 
 using namespace tt;
-bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const {
+
+bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(
+    chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const {
     return this->ethernet_connections.find(local_chip) != this->ethernet_connections.end() &&
-           this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != this->ethernet_connections.at(local_chip).end();
+           this->ethernet_connections.at(local_chip).find(local_ethernet_channel) !=
+               this->ethernet_connections.at(local_chip).end();
 }
 
 std::tuple<chip_id_t, ethernet_channel_t> tt_ClusterDescriptor::get_chip_and_channel_of_remote_ethernet_core(
@@ -38,10 +39,14 @@ std::tuple<chip_id_t, ethernet_channel_t> tt_ClusterDescriptor::get_chip_and_cha
     }
 }
 
-// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how extensively router needs to use it
-std::vector<std::tuple<ethernet_channel_t, ethernet_channel_t>> tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const {
+// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how
+// extensively router needs to use it
+std::vector<std::tuple<ethernet_channel_t, ethernet_channel_t>>
+tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(
+    const chip_id_t &first, const chip_id_t &second) const {
     std::vector<std::tuple<ethernet_channel_t, ethernet_channel_t>> directly_connected_channels = {};
-    if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) {
+    if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() ||
+        this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) {
         return {};
     }
 
@@ -58,9 +63,7 @@ bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t chip_id) const {
     return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end();
 }
 
-bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const {
-    return !is_chip_mmio_capable(chip_id);
-}
+bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { return !is_chip_mmio_capable(chip_id); }
 
 // given two coordinates, finds the number of hops between the two chips
 // it assumes that shelves are connected in x-dim and racks are connected in y-dim
@@ -69,11 +72,19 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const {
 // then once a chip on the same shelf&rack is found,
 // the distance from this chip to either location_a or location_b is just x&y dim difference.
 // the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference
-int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const {
-
-    log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})",
-        std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-        std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b));
+int tt_ClusterDescriptor::get_ethernet_link_coord_distance(
+    const eth_coord_t &location_a, const eth_coord_t &location_b) const {
+    log_trace(
+        LogSiliconDriver,
+        "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})",
+        std::get<0>(location_a),
+        std::get<1>(location_a),
+        std::get<2>(location_a),
+        std::get<3>(location_a),
+        std::get<0>(location_b),
+        std::get<1>(location_b),
+        std::get<2>(location_b),
+        std::get<3>(location_b));
 
     // eth_coord_t: x, y, rack, shelf
 
@@ -93,166 +104,236 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo
     int y_distance = std::abs(y_a - y_b);
 
     // move along y-dim to exit from the shelf to go to a higher shelf
-    if(shelf_b > shelf_a) {
+    if (shelf_b > shelf_a) {
         // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe
-        log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != galaxy_shelves_exit_chip_coords_per_y_dim.end(),
+        log_assert(
+            galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != galaxy_shelves_exit_chip_coords_per_y_dim.end(),
             "Expected shelf-to-shelf connection");
         // this row does not have a shelf-to-shelf connection
-        if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) {
+        if (galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) ==
+            galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) {
             return std::numeric_limits<int>::max();
         }
 
-        const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a);
-        log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many");
+        const Chip2ChipConnection &shelf_to_shelf_connection =
+            galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a);
+        log_assert(
+            shelf_to_shelf_connection.destination_chip_coords.size(),
+            "Expecting at least one shelf-to-shelf connection, possibly one-to-many");
 
         // for each shelf-to-shelf connection at y_a, find the distance to location_b, take min
         int distance = std::numeric_limits<int>::max();
         eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord;
-        for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) {
-
-            log_assert(std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && std::get<2>(exit_shelf) == rack_a,
+        for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) {
+            log_assert(
+                std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a &&
+                    std::get<2>(exit_shelf) == rack_a,
                 "Invalid shelf exit coordinates");
 
             // next shelf could be at a different y-dim in nebula->galaxy systems
-            log_assert(std::get<3>(next_shelf) == (shelf_a+1) && std::get<2>(next_shelf) == rack_a,
+            log_assert(
+                std::get<3>(next_shelf) == (shelf_a + 1) && std::get<2>(next_shelf) == rack_a,
                 "Invalid shelf entry coordinates");
 
             // hop onto the next shelf and find distance from there
             int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_shelf);
             int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_b);
             // no path found
-            if(distance_to_exit == std::numeric_limits<int>::max() || distance_in_next_shelf == std::numeric_limits<int>::max()) {
+            if (distance_to_exit == std::numeric_limits<int>::max() ||
+                distance_in_next_shelf == std::numeric_limits<int>::max()) {
                 continue;
             }
             distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1);
         }
-        log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
-            std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-            std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance);
+        log_trace(
+            LogSiliconDriver,
+            "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
+            std::get<0>(location_a),
+            std::get<1>(location_a),
+            std::get<2>(location_a),
+            std::get<3>(location_a),
+            std::get<0>(location_b),
+            std::get<1>(location_b),
+            std::get<2>(location_b),
+            std::get<3>(location_b),
+            distance);
         return distance;
-    }
-    else if(shelf_a > shelf_b) {
-
+    } else if (shelf_a > shelf_b) {
         // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe
-        log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(),
+        log_assert(
+            galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(),
             "Expected shelf-to-shelf connection");
         // this row does not have a shelf-to-shelf connection
-        if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) {
+        if (galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) ==
+            galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) {
             return std::numeric_limits<int>::max();
         }
 
-        const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b);
-        log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many")
+        const Chip2ChipConnection &shelf_to_shelf_connection =
+            galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b);
+        log_assert(
+            shelf_to_shelf_connection.destination_chip_coords.size(),
+            "Expecting at least one shelf-to-shelf connection, possibly one-to-many");
 
-        // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min
-        int distance = std::numeric_limits<int>::max();
+        // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min
+        int distance = std::numeric_limits<int>::max();
         eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord;
-        for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) {
-
-            log_assert(std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && std::get<2>(exit_shelf) == rack_b,
+        for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) {
+            log_assert(
+                std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b &&
+                    std::get<2>(exit_shelf) == rack_b,
                 "Invalid shelf exit coordinates");
             // next shelf could be at a different y-dim in nebula->galaxy systems
-            log_assert(std::get<3>(next_shelf) == (shelf_b+1) && std::get<2>(next_shelf) == rack_b,
+            log_assert(
+                std::get<3>(next_shelf) == (shelf_b + 1) && std::get<2>(next_shelf) == rack_b,
                 "Invalid shelf entry coordinates");
 
             // hop onto the next shelf and find distance from there
             int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_shelf);
             int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_a);
             // no path found
-            if(distance_to_exit == std::numeric_limits<int>::max() || distance_in_next_shelf == std::numeric_limits<int>::max()) {
+            if (distance_to_exit == std::numeric_limits<int>::max() ||
+                distance_in_next_shelf == std::numeric_limits<int>::max()) {
                 continue;
             }
             distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1);
         }
-        log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
-            std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-            std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance);
+        log_trace(
+            LogSiliconDriver,
+            "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
+            std::get<0>(location_a),
+            std::get<1>(location_a),
+            std::get<2>(location_a),
+            std::get<3>(location_a),
+            std::get<0>(location_b),
+            std::get<1>(location_b),
+            std::get<2>(location_b),
+            std::get<3>(location_b),
+            distance);
         return distance;
     }
 
     // move along y-dim to exit from the shelf to go to a higher shelf
-    if(rack_b > rack_a) {
-
+    if (rack_b > rack_a) {
         // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe
-        log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(),
+        log_assert(
+            galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(),
             "Expected rack-to-rack connection");
 
         // this row does not have a rack-to-rack connection
-        if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) {
+        if (galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) ==
+            galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) {
             return std::numeric_limits<int>::max();
         }
 
-        const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a);
-        log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many");
+        const Chip2ChipConnection &rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a);
+        log_assert(
+            rack_to_rack_connection.destination_chip_coords.size(),
+            "Expecting at least one rack-to-rack connection, possibly one-to-many");
 
         // for each rack-to-rack connection at x_a, find the distance to location_b, take min
         int distance = std::numeric_limits<int>::max();
         eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord;
-        for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) {
-
-            log_assert(std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a,
+        for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) {
+            log_assert(
+                std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a,
                 "Invalid rack exit coordinates");
-            log_assert(std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && std::get<2>(next_rack) == (rack_a+1),
+            log_assert(
+                std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a &&
+                    std::get<2>(next_rack) == (rack_a + 1),
                 "Invalid rack entry coordinates");
 
             // hop onto the next rack and find distance from there
             int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_rack);
             int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_b);
             // no path found
-            if (distance_to_exit == std::numeric_limits<int>::max() || distance_in_next_rack == std::numeric_limits<int>::max()) {
+            if (distance_to_exit == std::numeric_limits<int>::max() ||
+                distance_in_next_rack == std::numeric_limits<int>::max()) {
                 continue;
             }
             distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1);
         }
-        log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
-            std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-            std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance);
+        log_trace(
+            LogSiliconDriver,
+            "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
+            std::get<0>(location_a),
+            std::get<1>(location_a),
+            std::get<2>(location_a),
+            std::get<3>(location_a),
+            std::get<0>(location_b),
+            std::get<1>(location_b),
+            std::get<2>(location_b),
+            std::get<3>(location_b),
+            distance);
 
         return distance;
-    }
-    else if(rack_a > rack_b) {
-
+    } else if (rack_a > rack_b) {
         // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe
-        log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(),
+        log_assert(
+            galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(),
             "Expected rack-to-rack connection");
 
         // this row does not have a rack-to-rack connection
-        if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) {
+        if (galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) ==
+            galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) {
             return std::numeric_limits<int>::max();
         }
 
-        const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b);
-        log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many");
+        const Chip2ChipConnection &rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b);
+        log_assert(
+            rack_to_rack_connection.destination_chip_coords.size(),
+            "Expecting at least one rack-to-rack connection, possibly one-to-many");
 
         // for each rack-to-rack connection at x_a, find the distance to location_b, take min
         int distance = std::numeric_limits<int>::max();
         eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord;
-        for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) {
-
-            log_assert(std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b,
+        for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) {
+            log_assert(
+                std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b,
                 "Invalid rack exit coordinates");
-            log_assert(std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && std::get<2>(next_rack) == (rack_b+1),
+            log_assert(
+                std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b &&
+                    std::get<2>(next_rack) == (rack_b + 1),
                 "Invalid rack entry coordinates");
 
             // hop onto the next rack and find distance from there
             int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_rack);
             int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_a);
             // no path found
-            if (distance_to_exit == std::numeric_limits<int>::max() || distance_in_next_rack == std::numeric_limits<int>::max()) {
+            if (distance_to_exit == std::numeric_limits<int>::max() ||
+                distance_in_next_rack == std::numeric_limits<int>::max()) {
                 continue;
             }
             distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1);
         }
-        log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
-            std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-            std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance);
+        log_trace(
+            LogSiliconDriver,
+            "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
+            std::get<0>(location_a),
+            std::get<1>(location_a),
+            std::get<2>(location_a),
+            std::get<3>(location_a),
+            std::get<0>(location_b),
+            std::get<1>(location_b),
+            std::get<2>(location_b),
+            std::get<3>(location_b),
+            distance);
 
         return distance;
     }
 
-    log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
-        std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a),
-        std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), x_distance + y_distance);
+    log_trace(
+        LogSiliconDriver,
+        "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}",
+        std::get<0>(location_a),
+        std::get<1>(location_a),
+        std::get<2>(location_a),
+        std::get<3>(location_a),
+        std::get<0>(location_b),
+        std::get<1>(location_b),
+        std::get<2>(location_b),
+        std::get<3>(location_b),
+        x_distance + y_distance);
 
     // on same shelf/rack, the distance is just x+y difference
     return x_distance + y_distance;
@@ -260,14 +341,13 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo
 
 // Returns the closest mmio chip to the given chip
 chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t chip) {
-
     log_debug(LogSiliconDriver, "get_closest_mmio_chip to chip{}", chip);
 
     if (this->is_chip_mmio_capable(chip)) {
         return chip;
     }
 
-    if(closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) {
+    if (closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) {
         return closest_mmio_chip_cache[chip];
     }
 
@@ -279,7 +359,14 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch
         const chip_id_t &mmio_chip = pair.first;
         eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip);
 
-        log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, std::get<0>(mmio_eth_coord), std::get<1>(mmio_eth_coord), std::get<2>(mmio_eth_coord), std::get<3>(mmio_eth_coord));
+        log_debug(
+            LogSiliconDriver,
+            "Checking chip{} at ({}, {}, {}, {})",
+            mmio_chip,
+            std::get<0>(mmio_eth_coord),
+            std::get<1>(mmio_eth_coord),
+            std::get<2>(mmio_eth_coord),
+            std::get<3>(mmio_eth_coord));
 
         int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord);
         if (distance < min_distance) {
@@ -287,7 +374,8 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch
             closest_chip = mmio_chip;
         }
     }
-    log_assert(min_distance != std::numeric_limits<int>::max(), "Chip{} is not connected to any MMIO capable chip", chip);
+    log_assert(
+        min_distance != std::numeric_limits<int>::max(), "Chip{} is not connected to any MMIO capable chip", chip);
 
     log_assert(is_chip_mmio_capable(closest_chip), "Closest MMIO chip must be MMIO capable");
 
@@ -298,12 +386,14 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch
     return closest_chip;
 }
 
-std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_from_yaml(const std::string &cluster_descriptor_file_path) {
+std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_from_yaml(
+    const std::string &cluster_descriptor_file_path) {
     std::unique_ptr<tt_ClusterDescriptor> desc = std::unique_ptr<tt_ClusterDescriptor>(new tt_ClusterDescriptor());
 
     std::ifstream fdesc(cluster_descriptor_file_path);
     if (fdesc.fail()) {
-        throw std::runtime_error(fmt::format("Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path));
+        throw std::runtime_error(fmt::format(
+            "Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path));
     }
     fdesc.close();
 
@@ -319,22 +409,31 @@ std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_from_yaml(con
 }
 
 std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_for_grayskull_cluster(
-    const std::set<chip_id_t> &logical_mmio_device_ids,
-    const std::vector<chip_id_t> &physical_mmio_device_ids) {
+    const std::set<chip_id_t> &logical_mmio_device_ids, const std::vector<chip_id_t> &physical_mmio_device_ids) {
     std::unique_ptr<tt_ClusterDescriptor> desc = std::unique_ptr<tt_ClusterDescriptor>(new tt_ClusterDescriptor());
 
     // Some users need not care about physical ids, can provide empty set.
-    auto use_physical_ids                   = physical_mmio_device_ids.size() ? true : false;
-    auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set.
-    auto num_available_physical_devices     = physical_mmio_device_ids.size();
-    auto required_physical_devices          = largest_workload_logical_device_id + 1;
-
-    log_debug(tt::LogSiliconDriver, "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} required_physical_devices: {}",
-        __FUNCTION__, use_physical_ids, largest_workload_logical_device_id, num_available_physical_devices, required_physical_devices);
-
-    log_assert(!use_physical_ids || num_available_physical_devices >= required_physical_devices,
+    auto use_physical_ids = physical_mmio_device_ids.size() ? true : false;
+    auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin();  // Last element in ordered set.
+    auto num_available_physical_devices = physical_mmio_device_ids.size();
+    auto required_physical_devices = largest_workload_logical_device_id + 1;
+
+    log_debug(
+        tt::LogSiliconDriver,
+        "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} "
+        "required_physical_devices: {}",
+        __FUNCTION__,
+        use_physical_ids,
+        largest_workload_logical_device_id,
+        num_available_physical_devices,
+        required_physical_devices);
+
+    log_assert(
+        !use_physical_ids || num_available_physical_devices >= required_physical_devices,
         "Insufficient silicon devices. Workload requires device_id: {} (ie. {} devices) but only {} present",
-        largest_workload_logical_device_id, required_physical_devices, num_available_physical_devices);
+        largest_workload_logical_device_id,
+        required_physical_devices,
+        num_available_physical_devices);
 
     // All Grayskull devices are MMIO mapped so physical_mmio_device_ids correspond to all available devices
     for (auto &logical_id : logical_mmio_device_ids) {
@@ -343,8 +442,10 @@ std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_for_grayskull
         desc->all_chips.insert(logical_id);
         eth_coord_t chip_location{logical_id, 0, 0, 0};
         desc->chip_locations.insert({logical_id, chip_location});
-        desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = logical_id;
-        log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id);
+        desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)]
+                                [std::get<0>(chip_location)] = logical_id;
+        log_debug(
+            tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id);
     }
 
     desc->enable_all_devices();
@@ -352,7 +453,8 @@ std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_for_grayskull
     return desc;
 }
 
-void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) {
+void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(
+    YAML::Node &yaml, tt_ClusterDescriptor &desc) {
     log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML");
     for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as<std::vector<YAML::Node>>()) {
         log_assert(connected_endpoints.IsSequence(), "Invalid YAML");
@@ -385,7 +487,13 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto
     log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:");
     for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) {
         for (const auto &[chan, chip_and_chan] : chan_to_chip_chan_map) {
-            log_debug(LogSiliconDriver, "\tchip: {}, chan: {}  <-->  chip: {}, chan: {}", chip, chan, std::get<0>(chip_and_chan), std::get<1>(chip_and_chan));
+            log_debug(
+                LogSiliconDriver,
+                "\tchip: {}, chan: {}  <-->  chip: {}, chan: {}",
+                chip,
+                chan,
+                std::get<0>(chip_and_chan),
+                std::get<1>(chip_and_chan));
         }
     }
 
@@ -407,47 +515,57 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto
     int highest_rack_id = 0;
 
     // shelves and racks can be connected at different chip coordinates
-    // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is
-    // this is used in get_ethernet_link_coord_distance to find the distance between two chips
+    // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip
+    // on the other shelf/rack is; this is used in get_ethernet_link_coord_distance to find the distance between two
+    // chips
     for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) {
         highest_shelf_id = std::max(highest_shelf_id, std::get<3>(chip_eth_coord));
         highest_rack_id = std::max(highest_rack_id, std::get<2>(chip_eth_coord));
         // iterate over all neighbors
-        if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) {
-            continue; // chip has no eth connections
+        if (desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) {
+            continue;  // chip has no eth connections
         }
         for (const auto &[chan, chip_and_chan] : desc.ethernet_connections.at(chip_id)) {
             const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan);
             eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip);
             // shelves are connected in x-dim
-            if(std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) {
-                eth_coord_t higher_shelf_coord = std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
-                eth_coord_t lower_shelf_coord = std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
+            if (std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) {
+                eth_coord_t higher_shelf_coord =
+                    std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
+                eth_coord_t lower_shelf_coord =
+                    std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
                 int lower_shelf_id = std::get<3>(lower_shelf_coord);
                 int lower_shelf_y = std::get<1>(lower_shelf_coord);
 
-                auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id];
+                auto &galaxy_shelf_exit_chip_coords_per_y_dim =
+                    desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id];
 
                 log_assert(
-                    galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == galaxy_shelf_exit_chip_coords_per_y_dim.end() ||
-                    galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord,
+                    galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) ==
+                            galaxy_shelf_exit_chip_coords_per_y_dim.end() ||
+                        galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord,
                     "Expected a single exit chip on each shelf row");
                 galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord = lower_shelf_coord;
-                galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(higher_shelf_coord);
+                galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(
+                    higher_shelf_coord);
             }
 
             // racks are connected in y-dim
-            if(std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) {
-                eth_coord_t higher_rack_coord = std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
-                eth_coord_t lower_rack_coord = std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
+            if (std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) {
+                eth_coord_t higher_rack_coord =
+                    std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
+                eth_coord_t lower_rack_coord =
+                    std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord;
                 int lower_rack_id = std::get<2>(lower_rack_coord);
                 int lower_rack_x = std::get<0>(lower_rack_coord);
 
-                auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id];
+                auto &galaxy_rack_exit_chip_coords_per_x_dim =
+                    desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id];
 
                 log_assert(
-                    galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == galaxy_rack_exit_chip_coords_per_x_dim.end() ||
-                    galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord,
+                    galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) ==
+                            galaxy_rack_exit_chip_coords_per_x_dim.end() ||
+                        galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord,
                     "Expected a single exit chip on each rack column");
                 galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord = lower_rack_coord;
                 galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].destination_chip_coords.insert(higher_rack_coord);
@@ -458,23 +576,36 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto
     // verify that every shelf (except the highest in id) is found in galaxy_shelves_exit_chip_coords_per_y_dim
     // this means that we expect the shelves to be connected linearly in a daisy-chain fashion.
     // shelf0->shelf1->shelf2->...->shelfN
-    for(int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) {
-        log_assert(desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(),
-            "Expected shelf {} to be connected to the next shelf", shelf_id);
+    for (int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) {
+        log_assert(
+            desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) !=
+                desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(),
+            "Expected shelf {} to be connected to the next shelf",
+            shelf_id);
     }
 
     // this prints the exit chip coordinates for each shelf
     // this is used in get_ethernet_link_coord_distance to find the distance between two chips
     for (const auto &[shelf, shelf_exit_chip_coords_per_y_dim] : desc.galaxy_shelves_exit_chip_coords_per_y_dim) {
         for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) {
-            log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})",
-                shelf, y_dim,
-                std::get<0>(shelf_exit_chip_coords.source_chip_coord), std::get<1>(shelf_exit_chip_coords.source_chip_coord),
-                std::get<2>(shelf_exit_chip_coords.source_chip_coord), std::get<3>(shelf_exit_chip_coords.source_chip_coord));
+            log_debug(
+                LogSiliconDriver,
+                "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})",
+                shelf,
+                y_dim,
+                std::get<0>(shelf_exit_chip_coords.source_chip_coord),
+                std::get<1>(shelf_exit_chip_coords.source_chip_coord),
+                std::get<2>(shelf_exit_chip_coords.source_chip_coord),
+                std::get<3>(shelf_exit_chip_coords.source_chip_coord));
             for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) {
                 // print shelf_exit_chip_coord in the format: (x, y, rack, shelf)
-                log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})",
-                    std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord));
+                log_debug(
+                    LogSiliconDriver,
+                    "\tdestination_chip_coord: ({}, {}, {}, {})",
+                    std::get<0>(destination_chip_coord),
+                    std::get<1>(destination_chip_coord),
+                    std::get<2>(destination_chip_coord),
+                    std::get<3>(destination_chip_coord));
             }
         }
     }
@@ -482,21 +613,35 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto
     // verify that every rack (except the highest in id) is found in galaxy_racks_exit_chip_coords_per_x_dim
     // this means that we expect the racks to be connected linearly in a daisy-chain fashion.
     // rack0->rack1->rack2->...->rackN
-    for(int rack_id = 0; rack_id < highest_rack_id; rack_id++) {
-        log_assert(desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != desc.galaxy_racks_exit_chip_coords_per_x_dim.end(),
-            "Expected rack {} to be connected to the next rack", rack_id);
+    for (int rack_id = 0; rack_id < highest_rack_id; rack_id++) {
+        log_assert(
+            desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) !=
+                desc.galaxy_racks_exit_chip_coords_per_x_dim.end(),
+            "Expected rack {} to be connected to the next rack",
+            rack_id);
     }
 
     // this prints the exit chip coordinates for each rack
     // this is used in get_ethernet_link_coord_distance to find the distance between two chips
     for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) {
         for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) {
-            log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim,
-                std::get<0>(rack_exit_chip_coords.source_chip_coord), std::get<1>(rack_exit_chip_coords.source_chip_coord),
-                std::get<2>(rack_exit_chip_coords.source_chip_coord), std::get<3>(rack_exit_chip_coords.source_chip_coord));
+            log_debug(
+                LogSiliconDriver,
+                "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})",
+                rack,
+                x_dim,
+                std::get<0>(rack_exit_chip_coords.source_chip_coord),
+                std::get<1>(rack_exit_chip_coords.source_chip_coord),
+                std::get<2>(rack_exit_chip_coords.source_chip_coord),
+                std::get<3>(rack_exit_chip_coords.source_chip_coord));
             for (const auto &destination_chip_coord : rack_exit_chip_coords.destination_chip_coords) {
-                log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})",
-                    std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord));
+                log_debug(
+                    LogSiliconDriver,
+                    "\tdestination_chip_coord: ({}, {}, {}, {})",
+                    std::get<0>(destination_chip_coord),
+                    std::get<1>(destination_chip_coord),
+                    std::get<2>(destination_chip_coord),
+                    std::get<3>(destination_chip_coord));
             }
         }
     }
@@ -509,19 +654,19 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
         log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4");
         eth_coord_t chip_location{
             chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)};
-        
+
         desc.chip_locations.insert({chip_id, chip_location});
-        desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id;
+        desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)]
+                               [std::get<0>(chip_location)] = chip_id;
         desc.all_chips.insert(chip_id);
     }
-    
-    for(const auto& chip : yaml["chips_with_mmio"]) {
-        if(chip.IsMap()) {
+
+    for (const auto &chip : yaml["chips_with_mmio"]) {
+        if (chip.IsMap()) {
             const auto &chip_map = chip.as<std::map<chip_id_t, chip_id_t>>();
             const auto &chips = chip_map.begin();
             desc.chips_with_mmio.insert({chips->first, chips->second});
-        }
-        else {
+        } else {
             const auto &chip_val = chip.as<int>();
             desc.chips_with_mmio.insert({chip_val, chip_val});
         }
@@ -538,8 +683,8 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
             std::get<3>(chip_location));
     }
 
-		if (yaml["boardtype"]) {
-        for (const auto& chip_board_type : yaml["boardtype"].as<std::map<int, std::string>>()) {
+    if (yaml["boardtype"]) {
+        for (const auto &chip_board_type : yaml["boardtype"].as<std::map<int, std::string>>()) {
             auto &chip = chip_board_type.first;
             BoardType board_type;
             if (chip_board_type.second == "n150") {
@@ -554,15 +699,15 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
             desc.chip_board_type.insert({chip, board_type});
         }
     } else {
-        for (const auto& chip: desc.all_chips) {
+        for (const auto &chip : desc.all_chips) {
             desc.chip_board_type.insert({chip, BoardType::DEFAULT});
         }
     }
 }
 
 void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc) {
-    if(yaml["harvesting"]) {
-        for (const auto& chip_node : yaml["harvesting"].as<std::map<int, YAML::Node>>()) {
+    if (yaml["harvesting"]) {
+        for (const auto &chip_node : yaml["harvesting"].as<std::map<int, YAML::Node>>()) {
             chip_id_t chip = chip_node.first;
             auto harvesting_info = chip_node.second;
             desc.noc_translation_enabled.insert({chip, harvesting_info["noc_translation"].as<bool>()});
@@ -571,9 +716,7 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus
     }
 }
 
-void tt_ClusterDescriptor::enable_all_devices() {
-    this->enabled_active_chips = this->all_chips;
-}
+void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; }
 
 void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() {
     for (const auto &chip : this->all_chips) {
@@ -583,8 +726,10 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() {
     }
 }
 
-std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t> > > tt_ClusterDescriptor::get_ethernet_connections() const {
-    auto eth_connections = std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t> > >();
+std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>
+tt_ClusterDescriptor::get_ethernet_connections() const {
+    auto eth_connections = std::
+        unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>();
 
     for (const auto &[chip, channel_mapping] : this->ethernet_connections) {
         if (this->enabled_active_chips.find(chip) != this->enabled_active_chips.end()) {
@@ -613,7 +758,8 @@ std::unordered_map<chip_id_t, eth_coord_t> tt_ClusterDescriptor::get_chip_locati
 
 chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) {
     // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology.
-    // See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png
+    // See:
+    // https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png
     int x = std::get<0>(get_chip_locations().at(virtual_coord));
     int y = std::get<1>(get_chip_locations().at(virtual_coord));
     return 8 * x + y;
@@ -632,9 +778,7 @@ std::unordered_map<chip_id_t, chip_id_t> tt_ClusterDescriptor::get_chips_with_mm
     return chips_map;
 }
 
-std::unordered_set<chip_id_t> tt_ClusterDescriptor::get_all_chips() const {
-    return this->enabled_active_chips;
-}
+std::unordered_set<chip_id_t> tt_ClusterDescriptor::get_all_chips() const { return this->enabled_active_chips; }
 
 std::unordered_map<chip_id_t, std::uint32_t> tt_ClusterDescriptor::get_harvesting_info() const {
     return harvesting_masks;
@@ -651,10 +795,11 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t
 }
 
 BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const {
-  BoardType board_type = this->chip_board_type.at(chip_id);
-  return board_type;
+    BoardType board_type = this->chip_board_type.at(chip_id);
+    return board_type;
 }
 
-std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const {
+std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio()
+    const {
     return chips_grouped_by_closest_mmio;
 }
diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h
index ea8d0f52..ef99c574 100644
--- a/device/tt_cluster_descriptor.h
+++ b/device/tt_cluster_descriptor.h
@@ -4,23 +4,24 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-
 #pragma once
 
-#include "device/tt_xy_pair.h"
-
 #include <cstdint>
-#include <unordered_map>
-#include <unordered_set>
-#include <set>
 #include <map>
-#include <tuple>
+#include <memory>
+#include <set>
 #include <string>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
-#include <memory>
+
 #include "device/tt_cluster_descriptor_types.h"
+#include "device/tt_xy_pair.h"
 
-namespace YAML { class Node; }
+namespace YAML {
+class Node;
+}
 
 enum BoardType : uint32_t {
     N150 = 0,
@@ -30,80 +31,82 @@ enum BoardType : uint32_t {
 };
 
 class tt_ClusterDescriptor {
-
-  private:
-  int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const;
-
-  protected:
-
-  std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t> > > ethernet_connections;
-  std::unordered_map<chip_id_t, eth_coord_t> chip_locations;
-  // reverse map: rack/shelf/y/x -> chip_id
-  std::map<int, std::map<int, std::map<int, std::map<int, chip_id_t > > > > coords_to_chip_ids;
-  std::unordered_map<chip_id_t, chip_id_t> chips_with_mmio;
-  std::unordered_set<chip_id_t> all_chips;
-  std::unordered_map<chip_id_t, bool> noc_translation_enabled = {};
-  std::unordered_map<chip_id_t, std::uint32_t> harvesting_masks = {};
-  std::unordered_set<chip_id_t> enabled_active_chips;
-  std::unordered_map<chip_id_t, chip_id_t> closest_mmio_chip_cache = {};
-  std::unordered_map<chip_id_t, BoardType> chip_board_type = {};
-  std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> chips_grouped_by_closest_mmio;
-
-  // one-to-many chip connections
-  struct Chip2ChipConnection {
-    eth_coord_t source_chip_coord;
-    std::unordered_set<eth_coord_t> destination_chip_coords;
-  };
-
-  // shelf_id -> y dim -> list of chip2chip connections between different shelves
-  // assumption is that on every row of the shelf there is a chip that is connected to the other shelf
-  // there could be one-to-many connections between shelves, i.e. one chip is connected to multiple chips on the other shelf (in case of nebula->galaxy)
-  std::unordered_map<int, std::unordered_map<int, Chip2ChipConnection > > galaxy_shelves_exit_chip_coords_per_y_dim = {};
-  // rack_id -> x dim -> list of chip2chip connections between different racks
-  // assumption is that on every row of the rack there is a chip that is connected to the other rack
-  std::unordered_map<int, std::unordered_map<int, Chip2ChipConnection > > galaxy_racks_exit_chip_coords_per_x_dim = {};
-
-  static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc);
-  static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc);
-  static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc);
-
-  void fill_chips_grouped_by_closest_mmio();
-
- public:
-  tt_ClusterDescriptor() = default;
-  tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default;
-
-  /*
-   * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument ordering when calling the function
-   * An empty result implies that the two chips do not share any direct connection
-   */
-  std::vector<std::tuple<ethernet_channel_t, ethernet_channel_t>> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const;
-  
-  bool is_chip_mmio_capable(const chip_id_t chip_id) const;
-  bool is_chip_remote(const chip_id_t chip_id) const;
-  chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip);
-  chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord);
-  static std::unique_ptr<tt_ClusterDescriptor> create_from_yaml(const std::string &cluster_descriptor_file_path);
-  static std::unique_ptr<tt_ClusterDescriptor> create_for_grayskull_cluster(
-      const std::set<chip_id_t> &logical_mmio_device_ids,
-      const std::vector<chip_id_t> &physical_mmio_device_ids);
-
-  std::unordered_map<chip_id_t, std::uint32_t> get_harvesting_info() const;
-  std::unordered_map<chip_id_t, bool> get_noc_translation_table_en() const;
-  std::unordered_map<chip_id_t, eth_coord_t> get_chip_locations() const;
-  std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t> > > get_ethernet_connections() const;
-  std::unordered_map<chip_id_t, chip_id_t> get_chips_with_mmio() const;
-  std::unordered_set<chip_id_t> get_all_chips() const;
-  std::size_t get_number_of_chips() const;
-  std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> get_chips_grouped_by_closest_mmio() const;
-
-  int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const;
-
-  BoardType get_board_type(chip_id_t chip_id) const;
-
-  bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
-  std::tuple<chip_id_t, ethernet_channel_t> get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
-
-  void enable_all_devices();
-
+private:
+    int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const;
+
+protected:
+    std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>
+        ethernet_connections;
+    std::unordered_map<chip_id_t, eth_coord_t> chip_locations;
+    // reverse map: rack/shelf/y/x -> chip_id
+    std::map<int, std::map<int, std::map<int, std::map<int, chip_id_t>>>> coords_to_chip_ids;
+    std::unordered_map<chip_id_t, chip_id_t> chips_with_mmio;
+    std::unordered_set<chip_id_t> all_chips;
+    std::unordered_map<chip_id_t, bool> noc_translation_enabled = {};
+    std::unordered_map<chip_id_t, std::uint32_t> harvesting_masks = {};
+    std::unordered_set<chip_id_t> enabled_active_chips;
+    std::unordered_map<chip_id_t, chip_id_t> closest_mmio_chip_cache = {};
+    std::unordered_map<chip_id_t, BoardType> chip_board_type = {};
+    std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> chips_grouped_by_closest_mmio;
+
+    // one-to-many chip connections
+    struct Chip2ChipConnection {
+        eth_coord_t source_chip_coord;
+        std::unordered_set<eth_coord_t> destination_chip_coords;
+    };
+
+    // shelf_id -> y dim -> list of chip2chip connections between different shelves
+    // assumption is that on every row of the shelf there is a chip that is connected to the other shelf
+    // there could be one-to-many connections between shelves, i.e. one chip is connected to multiple chips on the other
+    // shelf (in case of nebula->galaxy)
+    std::unordered_map<int, std::unordered_map<int, Chip2ChipConnection>> galaxy_shelves_exit_chip_coords_per_y_dim =
+        {};
+    // rack_id -> x dim -> list of chip2chip connections between different racks
+    // assumption is that on every row of the rack there is a chip that is connected to the other rack
+    std::unordered_map<int, std::unordered_map<int, Chip2ChipConnection>> galaxy_racks_exit_chip_coords_per_x_dim = {};
+
+    static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc);
+    static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc);
+    static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc);
+
+    void fill_chips_grouped_by_closest_mmio();
+
+public:
+    tt_ClusterDescriptor() = default;
+    tt_ClusterDescriptor(const tt_ClusterDescriptor &) = default;
+
+    /*
+     * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument
+     * ordering when calling the function An empty result implies that the two chips do not share any direct connection
+     */
+    std::vector<std::tuple<ethernet_channel_t, ethernet_channel_t>>
+    get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const;
+
+    bool is_chip_mmio_capable(const chip_id_t chip_id) const;
+    bool is_chip_remote(const chip_id_t chip_id) const;
+    chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip);
+    chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord);
+    static std::unique_ptr<tt_ClusterDescriptor> create_from_yaml(const std::string &cluster_descriptor_file_path);
+    static std::unique_ptr<tt_ClusterDescriptor> create_for_grayskull_cluster(
+        const std::set<chip_id_t> &logical_mmio_device_ids, const std::vector<chip_id_t> &physical_mmio_device_ids);
+
+    std::unordered_map<chip_id_t, std::uint32_t> get_harvesting_info() const;
+    std::unordered_map<chip_id_t, bool> get_noc_translation_table_en() const;
+    std::unordered_map<chip_id_t, eth_coord_t> get_chip_locations() const;
+    std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>
+    get_ethernet_connections() const;
+    std::unordered_map<chip_id_t, chip_id_t> get_chips_with_mmio() const;
+    std::unordered_set<chip_id_t> get_all_chips() const;
+    std::size_t get_number_of_chips() const;
+    std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> get_chips_grouped_by_closest_mmio() const;
+
+    int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const;
+
+    BoardType get_board_type(chip_id_t chip_id) const;
+
+    bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
+    std::tuple<chip_id_t, ethernet_channel_t> get_chip_and_channel_of_remote_ethernet_core(
+        chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
+
+    void enable_all_devices();
 };
diff --git a/device/tt_cluster_descriptor_types.h b/device/tt_cluster_descriptor_types.h
index 6508a08d..841d80a2 100644
--- a/device/tt_cluster_descriptor_types.h
+++ b/device/tt_cluster_descriptor_types.h
@@ -4,7 +4,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#pragma once 
+#pragma once
 
 #include <tuple>
 
@@ -15,13 +15,11 @@ using eth_coord_t = std::tuple<int, int, int, int>;  // x, y, rack, shelf
 namespace std {
 template <>
 struct hash<eth_coord_t> {
-  std::size_t operator()(eth_coord_t const &c) const {
-    std::size_t seed = 0;
-    seed = std::hash<std::size_t>()(std::get<0>(c)) << 48 | 
-          std::hash<std::size_t>()(std::get<1>(c)) << 32 |
-          std::hash<std::size_t>()(std::get<2>(c)) << 16 |
-          std::hash<std::size_t>()(std::get<3>(c));
-    return seed;
-  }
+    std::size_t operator()(eth_coord_t const &c) const {
+        std::size_t seed = 0;
+        seed = std::hash<std::size_t>()(std::get<0>(c)) << 48 | std::hash<std::size_t>()(std::get<1>(c)) << 32 |
+               std::hash<std::size_t>()(std::get<2>(c)) << 16 | std::hash<std::size_t>()(std::get<3>(c));
+        return seed;
+    }
 };
-}
+}  // namespace std
diff --git a/device/tt_device.cpp b/device/tt_device.cpp
index 9d974936..b7c3590e 100644
--- a/device/tt_device.cpp
+++ b/device/tt_device.cpp
@@ -2,30 +2,32 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
 #ifdef TT_DEBUG_LOGGING
-#define DEBUG_LOG(str) do { std::cout << str << std::endl; } while( false )
+#define DEBUG_LOG(str)                 \
+    do {                               \
+        std::cout << str << std::endl; \
+    } while (false)
 #else
 #define DEBUG_LOG(str) ((void)0)
 #endif
 
 #include "tt_device.h"
-#include "device/tt_cluster_descriptor_types.h"
-#include <iostream>
+
 #include <fstream>
+#include <iostream>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
+
+#include "device/tt_cluster_descriptor_types.h"
 #include "yaml-cpp/yaml.h"
 
 ////////
 // Device base
 ////////
-tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) {
-}
+tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) {}
 
-tt_device::~tt_device() {
-}
+tt_device::~tt_device() {}
 
 const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const {
     return soc_descriptor_per_chip.at(chip_id);
diff --git a/device/tt_device.h b/device/tt_device.h
index f5749ea5..60d8bf5a 100644
--- a/device/tt_device.h
+++ b/device/tt_device.h
@@ -8,20 +8,19 @@
 #include <cassert>
 #include <cstdint>
 #include <memory>
+#include <set>
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include <set>
 
-#include "tt_soc_descriptor.h"
-#include "tt_xy_pair.h"
-#include "tt_silicon_driver_common.hpp"
-#include "device/tt_cluster_descriptor_types.h"
 #include "device/tlb.h"
+#include "device/tt_cluster_descriptor_types.h"
 #include "device/tt_io.hpp"
-
-#include "pcie/pci_device.hpp"
 #include "fmt/core.h"
+#include "pcie/pci_device.hpp"
+#include "tt_silicon_driver_common.hpp"
+#include "tt_soc_descriptor.h"
+#include "tt_xy_pair.h"
 
 using TLB_DATA = tt::umd::tlb_data;
 
@@ -30,29 +29,32 @@ using TLB_DATA = tt::umd::tlb_data;
 tt::ARCH detect_arch(int pci_device_num);
 tt::ARCH detect_arch();
 
-namespace boost::interprocess{
-    class named_mutex;
+namespace boost::interprocess {
+class named_mutex;
 }
 
 class tt_ClusterDescriptor;
 
-enum tt_DevicePowerState {
-    BUSY,
-    SHORT_IDLE,
-    LONG_IDLE
-};
+enum tt_DevicePowerState { BUSY, SHORT_IDLE, LONG_IDLE };
 
 enum tt_MemBarFlag {
     SET = 0xaa,
     RESET = 0xbb,
 };
 
-inline std::ostream &operator <<(std::ostream &os, const tt_DevicePowerState power_state) {
+inline std::ostream& operator<<(std::ostream& os, const tt_DevicePowerState power_state) {
     switch (power_state) {
-        case tt_DevicePowerState::BUSY: os << "Busy"; break;
-        case tt_DevicePowerState::SHORT_IDLE: os << "SHORT_IDLE"; break;
-        case tt_DevicePowerState::LONG_IDLE: os << "LONG_IDLE"; break;
-        default: throw ("Unknown DevicePowerState");
+        case tt_DevicePowerState::BUSY:
+            os << "Busy";
+            break;
+        case tt_DevicePowerState::SHORT_IDLE:
+            os << "SHORT_IDLE";
+            break;
+        case tt_DevicePowerState::LONG_IDLE:
+            os << "LONG_IDLE";
+            break;
+        default:
+            throw("Unknown DevicePowerState");
     }
     return os;
 }
@@ -119,20 +121,22 @@ struct tt_version {
     std::uint16_t major = 0xffff;
     std::uint8_t minor = 0xff;
     std::uint8_t patch = 0xff;
+
     tt_version() {}
+
     tt_version(std::uint16_t major_, std::uint8_t minor_, std::uint8_t patch_) {
         major = major_;
         minor = minor_;
         patch = patch_;
     }
+
     tt_version(std::uint32_t version) {
         major = (version >> 16) & 0xff;
         minor = (version >> 12) & 0xf;
         patch = version & 0xfff;
     }
-    std::string str() const {
-        return fmt::format("{}.{}.{}", major, minor, patch);
-    }
+
+    std::string str() const { return fmt::format("{}.{}.{}", major, minor, patch); }
 };
 
 struct tt_device_params {
@@ -143,29 +147,32 @@ struct tt_device_params {
     bool init_device = true;
     bool early_open_device = false;
     int aiclk = 0;
+
     // The command-line input for vcd_dump_cores can have the following format:
     // {"*-2", "1-*", "*-*", "1-2"}
     // '*' indicates we must dump all the cores in that dimension.
     // This function takes the vector above and unrolles the coords with '*' in one or both dimensions.
     std::vector<std::string> unroll_vcd_dump_cores(tt_xy_pair grid_size) const {
         std::vector<std::string> unrolled_dump_core;
-        for (auto &dump_core: vcd_dump_cores) {
+        for (auto& dump_core : vcd_dump_cores) {
             // If the input is a single *, then dump all cores.
             if (dump_core == "*") {
                 for (size_t x = 0; x < grid_size.x; x++) {
-                for (size_t y = 0; y < grid_size.y; y++) {
-                    std::string current_core_coord = fmt::format("{}-{}", x, y);
-                    if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) {
-                        unrolled_dump_core.push_back(current_core_coord);
+                    for (size_t y = 0; y < grid_size.y; y++) {
+                        std::string current_core_coord = fmt::format("{}-{}", x, y);
+                        if (std::find(
+                                std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) ==
+                            std::end(unrolled_dump_core)) {
+                            unrolled_dump_core.push_back(current_core_coord);
+                        }
                     }
                 }
-                }
                 continue;
             }
             // Each core coordinate must contain three characters: "core.x-core.y".
             assert(dump_core.size() <= 5);
             size_t delimiter_pos = dump_core.find('-');
-            assert (delimiter_pos != std::string::npos); // y-dim should exist in core coord.
+            assert(delimiter_pos != std::string::npos);  // y-dim should exist in core coord.
 
             std::string core_dim_x = dump_core.substr(0, delimiter_pos);
             size_t core_dim_y_start = delimiter_pos + 1;
@@ -175,7 +182,9 @@ struct tt_device_params {
                 for (size_t x = 0; x < grid_size.x; x++) {
                     for (size_t y = 0; y < grid_size.y; y++) {
                         std::string current_core_coord = fmt::format("{}-{}", x, y);
-                        if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) {
+                        if (std::find(
+                                std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) ==
+                            std::end(unrolled_dump_core)) {
                             unrolled_dump_core.push_back(current_core_coord);
                         }
                     }
@@ -183,14 +192,16 @@ struct tt_device_params {
             } else if (core_dim_x == "*") {
                 for (size_t x = 0; x < grid_size.x; x++) {
                     std::string current_core_coord = fmt::format("{}-{}", x, core_dim_y);
-                    if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) {
+                    if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) ==
+                        std::end(unrolled_dump_core)) {
                         unrolled_dump_core.push_back(current_core_coord);
                     }
                 }
             } else if (core_dim_y == "*") {
                 for (size_t y = 0; y < grid_size.y; y++) {
                     std::string current_core_coord = fmt::format("{}-{}", core_dim_x, y);
-                    if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) {
+                    if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) ==
+                        std::end(unrolled_dump_core)) {
                         unrolled_dump_core.push_back(current_core_coord);
                     }
                 }
@@ -202,10 +213,9 @@ struct tt_device_params {
     }
 
     std::vector<std::string> expand_plusargs() const {
-        std::vector<std::string> all_plusargs {
+        std::vector<std::string> all_plusargs{
             fmt::format("+enable_perf_scoreboard={}", enable_perf_scoreboard),
-            fmt::format("+register_monitor={}", register_monitor)
-        };
+            fmt::format("+register_monitor={}", register_monitor)};
 
         all_plusargs.insert(all_plusargs.end(), plusargs.begin(), plusargs.end());
 
@@ -218,18 +228,18 @@ struct tt_device_params {
  * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon.
  * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend.
  * Using tt_device itself will throw errors, since its APIs are undefined.
- */ 
-class tt_device
-{
-    public:
+ */
+class tt_device {
+public:
     tt_device(const std::string& sdesc_path);
     virtual ~tt_device();
+
     // Setup/Teardown Functions
     /**
      * Set L1 Address Map parameters used by UMD to communicate with the TT Device.
      *
      * @param l1_address_params_  All the L1 parameters required by UMD
-     */ 
+     */
     virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {
         throw std::runtime_error("---- tt_device::set_device_l1_address_params is not implemented\n");
     }
@@ -242,9 +252,9 @@ class tt_device
      * Set Host Address Map parameters used by UMD to communicate with the TT Device (used for remote transactions).
      *
      * @param host_address_params_ All the Host Address space parameters required by UMD.
-     */ 
-    [[deprecated("Using unnecessary function.")]]
-    virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) {
+     */
+    [[deprecated("Using unnecessary function.")]] virtual void set_driver_host_address_params(
+        const tt_driver_host_address_params& host_address_params_) {
         throw std::runtime_error("---- tt_device::set_driver_host_address_params is not implemented\n");
     }
 
@@ -252,7 +262,7 @@ class tt_device
      * Set ERISC Firmware parameters used by UMD to communicate with the TT Device (used for remote transactions).
      *
      * @param eth_interface_params_ All the Ethernet Firmware parameters required by UMD.
-     */ 
+     */
     virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) {
         throw std::runtime_error("---- tt_device::set_driver_eth_interface_params is not implemented\n");
     }
@@ -265,8 +275,13 @@ class tt_device
      * @param tlb_index TLB id that will be programmed.
      * @param address Start address TLB is mapped to.
      * @param ordering Ordering mode for the TLB.
-     */ 
-    virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Relaxed) {
+     */
+    virtual void configure_tlb(
+        chip_id_t logical_device_id,
+        tt_xy_pair core,
+        std::int32_t tlb_index,
+        std::int32_t address,
+        uint64_t ordering = TLB_DATA::Relaxed) {
         throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n");
     }
 
@@ -275,45 +290,51 @@ class tt_device
      *
      * @param fallback_tlb Dynamic TLB being targeted.
      * @param ordering Ordering mode for the TLB.
-     */ 
+     */
     virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted) {
         throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n");
     }
-    
+
     /**
-     * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core).
+     * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only supports a single TLB per
+     * core).
      *
      * @param logical_device_id MMIO chip being targeted.
      * @param mapping_function Function which maps core to TLB index.
      */
-    virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
+    virtual void setup_core_to_tlb_map(
+        const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
         throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n");
     }
 
     /**
-     * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to use a subset of cores from the active_eth_cores_per_chip set for all host->cluster
-     * non-MMIO transfers. If this function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5).
-     * If default behaviour is not desired, this function must be called for all MMIO devices.
+     * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to
+     * use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this
+     * function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). If
+     * default behaviour is not desired, this function must be called for all MMIO devices.
      *
      * @param mmio_chip Device being targeted.
      * @param active_eth_cores_per_chip The active ethernet cores for this chip.
      */
-    virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip) {
-        throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n");
+    virtual void configure_active_ethernet_cores_for_mmio_device(
+        chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip) {
+        throw std::runtime_error(
+            "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n");
     }
 
     /**
-     * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips.
+     * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize
+     * iATUs for PCIe devices and ethernet queues for remote chips.
      *
      * @param device_params Object specifying initialization configuration.
      */
-    virtual void start_device(const tt_device_params &device_params) {
+    virtual void start_device(const tt_device_params& device_params) {
         throw std::runtime_error("---- tt_device::start_device is not implemented\n");
     }
 
     /**
      * Broadcast deassert soft Tensix Reset to the entire device (to be done after start_device is called).
-     */  
+     */
     virtual void deassert_risc_reset() {
         throw std::runtime_error("---- tt_device::deassert_risc_reset is not implemented\n");
     }
@@ -322,14 +343,14 @@ class tt_device
      * Send a soft deassert reset signal to a single tensix core.
      *
      * @param core Chip and core being targeted.
-     */  
+     */
     virtual void deassert_risc_reset_at_core(tt_cxy_pair core) {
         throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n");
     }
 
     /**
      * Broadcast assert soft Tensix Reset to the entire device.
-     */  
+     */
     virtual void assert_risc_reset() {
         throw std::runtime_error("---- tt_device::assert_risc_reset is not implemented\n");
     }
@@ -338,7 +359,7 @@ class tt_device
      * Send a soft assert reset signal to a single tensix core.
      *
      * @param core Chip and core being targeted.
-     */  
+     */
     virtual void assert_risc_reset_at_core(tt_cxy_pair core) {
         throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n");
     }
@@ -346,17 +367,15 @@ class tt_device
     /**
      * To be called at the end of a run.
      * Set power state to idle, assert tensix reset at all cores.
-     */  
-    virtual void close_device() {
-        throw std::runtime_error("---- tt_device::close_device is not implemented\n");
-    }
+     */
+    virtual void close_device() { throw std::runtime_error("---- tt_device::close_device is not implemented\n"); }
 
     // Runtime functions
     /**
      * Non-MMIO (ethernet) barrier.
-     * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding with the next one.
-     * This will be applied to all chips in the cluster.
-     */ 
+     * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding
+     * with the next one. This will be applied to all chips in the cluster.
+     */
     virtual void wait_for_non_mmio_flush() {
         throw std::runtime_error("---- tt_device::wait_for_non_mmio_flush is not implemented\n");
     }
@@ -378,12 +397,20 @@ class tt_device
      * @param addr Address to write to.
      * @param tlb_to_use Specifies fallback/dynamic TLB to use.
      */
-    virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
+    virtual void write_to_device(
+        const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {
         // Only implement this for Silicon Backend
         throw std::runtime_error("---- tt_device::write_to_device is not implemented\n");
     }
 
-    virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude,  std::set<uint32_t>& rows_to_exclude,  std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb) {
+    virtual void broadcast_write_to_cluster(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        uint64_t address,
+        const std::set<chip_id_t>& chips_to_exclude,
+        std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& columns_to_exclude,
+        const std::string& fallback_tlb) {
         throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n");
     }
 
@@ -396,44 +423,54 @@ class tt_device
      * @param size Number of bytes to read.
      * @param fallback_tlb Specifies fallback/dynamic TLB to use.
      */
-    virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
+    virtual void read_from_device(
+        void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
         // Only implement this for Silicon Backend
         throw std::runtime_error("---- tt_device::read_from_device is not implemented\n");
     }
 
     /**
      * Write uint32_t vector to specified address and channel on host (defined for Silicon).
-     * 
+     *
      * @param vec Data to write.
      * @param addr Address to write to.
      * @param channel Host channel to target.
      * @param src_device_id Chip to target.
      */
-    virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size,  uint64_t addr, uint16_t channel, chip_id_t src_device_id) {
+    virtual void write_to_sysmem(
+        const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) {
         throw std::runtime_error("---- tt_device::write_to_sysmem is not implemented\n");
     }
-    virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) {
+
+    virtual void read_from_sysmem(
+        void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) {
         throw std::runtime_error("---- tt_device::read_from_sysmem is not implemented\n");
     }
-    virtual void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}) {
+
+    virtual void l1_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}) {
         throw std::runtime_error("---- tt_device::l1_membar is not implemented\n");
     }
-    virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels = {}) {
+
+    virtual void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels = {}) {
         throw std::runtime_error("---- tt_device::dram_membar is not implemented\n");
     }
-    virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}) {
+
+    virtual void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {}) {
         throw std::runtime_error("---- tt_device::dram_membar is not implemented\n");
     }
 
     // Misc. Functions to Query/Set Device State
     /**
-     * Query post harvesting SOC descriptors from UMD in virtual coordinates. 
+     * Query post harvesting SOC descriptors from UMD in virtual coordinates.
      * These descriptors should be used for looking up cores that are passed into UMD APIs.
      */
     virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors() {
         throw std::runtime_error("---- tt_device:get_virtual_soc_descriptors is not implemented\n");
     }
-   
+
     /**
      * Determine if UMD performed harvesting on SOC descriptors.
      */
@@ -441,18 +478,18 @@ class tt_device
         throw std::runtime_error("---- tt_device:using_harvested_soc_descriptors is not implemented\n");
         return 0;
     }
-    
+
     /**
      * Get harvesting masks for all chips/SOC Descriptors in the cluster.
      * Each mask represents a map of enabled (0) and disabled (1) rows on a specific chip (in NOC0 Coordinateds).
-     */ 
+     */
     virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors() {
         throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n");
     }
 
     /**
      * Issue message to device, meant to be picked up by ARC firmware.
-     * 
+     *
      * @param logical_device_id Chip to target.
      * @param msg_code Specifies type of ARC message.
      * @param wait_for_done Block until ARC responds.
@@ -461,8 +498,16 @@ class tt_device
      * @param timeout Timeout on ARC.
      * @param return3 Return value from ARC.
      * @param return4 Return value from ARC.
-     */ 
-    virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr) {
+     */
+    virtual int arc_msg(
+        int logical_device_id,
+        uint32_t msg_code,
+        bool wait_for_done = true,
+        uint32_t arg0 = 0,
+        uint32_t arg1 = 0,
+        int timeout = 1,
+        uint32_t* return_3 = nullptr,
+        uint32_t* return_4 = nullptr) {
         throw std::runtime_error("---- tt_device::arc_msg is not implemented\n");
     }
 
@@ -472,28 +517,28 @@ class tt_device
      * @param device_id Chip to target.
      * @param r Row coordinate.
      * @param c Column coordinate.
-     */ 
-    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {
+     */
+    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {
         throw std::runtime_error("---- tt_device::translate_to_noc_table_coords is not implemented\n");
     }
 
     /**
      * Get the total number of chips in the cluster based on the network descriptor.
-     */ 
+     */
     virtual int get_number_of_chips_in_cluster() {
         throw std::runtime_error("---- tt_device::get_number_of_chips_in_cluster is not implemented\n");
     }
 
     /**
      * Get the logical ids for all chips in the cluster
-     */ 
+     */
     virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster() {
         throw std::runtime_error("---- tt_device::get_all_chips_in_cluster is not implemented\n");
     }
 
     /**
      * Get cluster descriptor object being used in UMD instance.
-     */ 
+     */
     virtual tt_ClusterDescriptor* get_cluster_description() {
         throw std::runtime_error("---- tt_device::get_cluster_description is not implemented\n");
     }
@@ -515,9 +560,9 @@ class tt_device
     /**
      * Get clock frequencies for all MMIO devices targeted by UMD.
      */
-    virtual std::map<int,int> get_clocks() {
+    virtual std::map<int, int> get_clocks() {
         throw std::runtime_error("---- tt_device::get_clocks is not implemented\n");
-        return std::map<int,int>();
+        return std::map<int, int>();
     }
 
     virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) {
@@ -535,7 +580,7 @@ class tt_device
      * Query number of DRAM channels on a specific device.
      *
      * @param device_id Logical device id to query.
-     */ 
+     */
     virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id) {
         throw std::runtime_error("---- tt_device::get_num_dram_channels is not implemented\n");
         return 0;
@@ -543,10 +588,10 @@ class tt_device
 
     /**
      * Get size for a specific DRAM channel on a device.
-     *    
+     *
      * @param device_id Device to target.
      * @param channel DRAM channel to target.
-     */ 
+     */
     virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {
         throw std::runtime_error("---- tt_device::get_dram_channel_size is not implemented\n");
         return 0;
@@ -556,7 +601,7 @@ class tt_device
      * Query number of Host channels (hugepages) allocated for a specific device.
      *
      * @param device_id Logical device id to target.
-     */ 
+     */
     virtual std::uint32_t get_num_host_channels(std::uint32_t device_id) {
         throw std::runtime_error("---- tt_device::get_num_host_channels is not implemented\n");
         return 0;
@@ -567,20 +612,21 @@ class tt_device
      *
      * @param device_id Logical device id to target.
      * @param channel Logical host channel to target.
-     */ 
+     */
     virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {
         throw std::runtime_error("---- tt_device::get_host_channel_size is not implemented\n");
         return 0;
     }
 
     /**
-     * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific device.
-     *   
+     * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific
+     * device.
+     *
      * @param offset Offset wrt the start of the channel's address space.
-     * @param src_device_id Device to target. 
+     * @param src_device_id Device to target.
      * @param channel Host memory channel.
      */
-    virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
+    virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
         throw std::runtime_error("---- tt_device::host_dma_address is not implemented\n");
         return nullptr;
     }
@@ -589,25 +635,25 @@ class tt_device
         throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n");
         return 0;
     }
+
     const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const;
 
     bool performed_harvesting = false;
     std::unordered_map<chip_id_t, uint32_t> harvested_rows_per_target = {};
     bool translation_tables_en = false;
 
-    protected:
+protected:
     std::unordered_map<chip_id_t, tt_SocDescriptor> soc_descriptor_per_chip = {};
 };
 
 #include "device/architecture_implementation.h"
 
 /**
-* Silicon Driver Class, derived from the tt_device class
+ * Silicon Driver Class, derived from the tt_device class
  * Implements APIs to communicate with a physical Tenstorrent Device.
-*/ 
-class tt_SiliconDevice: public tt_device
-{
-    public:
+ */
+class tt_SiliconDevice : public tt_device {
+public:
     // Constructor
     /**
      * Silicon Driver constructor.
@@ -620,22 +666,35 @@ class tt_SiliconDevice: public tt_device
      * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up.
      * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip.
      * @param simulated_harvesting_masks
-     */ 
-    tt_SiliconDevice(const std::string &sdesc_path, const std::string &ndesc_path, const std::set<chip_id_t> &target_devices, 
-                    const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false,
-                    const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks = {});
-    
-    //Setup/Teardown Functions
+     */
+    tt_SiliconDevice(
+        const std::string& sdesc_path,
+        const std::string& ndesc_path,
+        const std::set<chip_id_t>& target_devices,
+        const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+        const bool skip_driver_allocs = false,
+        const bool clean_system_resources = false,
+        bool perform_harvesting = true,
+        std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks = {});
+
+    // Setup/Teardown Functions
     virtual std::unordered_map<chip_id_t, tt_SocDescriptor>& get_virtual_soc_descriptors();
     virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_);
     virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_);
     virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_);
     virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_);
-    virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted);
+    virtual void configure_tlb(
+        chip_id_t logical_device_id,
+        tt_xy_pair core,
+        std::int32_t tlb_index,
+        std::int32_t address,
+        uint64_t ordering = TLB_DATA::Posted);
     virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted);
-    virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function);
-    virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip);
-    virtual void start_device(const tt_device_params &device_params);
+    virtual void setup_core_to_tlb_map(
+        const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function);
+    virtual void configure_active_ethernet_cores_for_mmio_device(
+        chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip);
+    virtual void start_device(const tt_device_params& device_params);
     virtual void assert_risc_reset();
     virtual void deassert_risc_reset();
     virtual void deassert_risc_reset_at_core(tt_cxy_pair core);
@@ -643,20 +702,34 @@ class tt_SiliconDevice: public tt_device
     virtual void close_device();
 
     // Runtime Functions
-    virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use);
-    void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude,  std::set<uint32_t>& rows_to_exclude,  std::set<uint32_t>& columns_to_exclude, const std::string& fallback_tlb);
-
-    virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
-    virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size,  uint64_t addr, uint16_t channel, chip_id_t src_device_id);
-    virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id);
+    virtual void write_to_device(
+        const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use);
+    void broadcast_write_to_cluster(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        uint64_t address,
+        const std::set<chip_id_t>& chips_to_exclude,
+        std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& columns_to_exclude,
+        const std::string& fallback_tlb);
+
+    virtual void read_from_device(
+        void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
+    virtual void write_to_sysmem(
+        const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id);
+    virtual void read_from_sysmem(
+        void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id);
     virtual void wait_for_non_mmio_flush();
     virtual void wait_for_non_mmio_flush(const chip_id_t chip_id);
-    void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
-    void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void l1_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels);
+    void dram_membar(
+        const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores = {});
     // These functions are used by Debuda, so make them public
-    void bar_write32 (int logical_device_id, uint32_t addr, uint32_t data);
-    uint32_t bar_read32 (int logical_device_id, uint32_t addr);
+    void bar_write32(int logical_device_id, uint32_t addr, uint32_t data);
+    uint32_t bar_read32(int logical_device_id, uint32_t addr);
 
     /**
      * If the tlbs are initialized, returns a tuple with the TLB base address and its size
@@ -674,16 +747,24 @@ class tt_SiliconDevice: public tt_device
      * - the mapping is unchanged during the lifetime of the returned object.
      * - the tt_SiliconDevice instance outlives the returned object.
      * - use of the returned object is congruent with the target's TLB setup.
-     *    
+     *
      * @param target The target chip and core to write to.
      */
     tt::Writer get_static_tlb_writer(tt_cxy_pair target);
 
     // Misc. Functions to Query/Set Device State
-    virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr);
+    virtual int arc_msg(
+        int logical_device_id,
+        uint32_t msg_code,
+        bool wait_for_done = true,
+        uint32_t arg0 = 0,
+        uint32_t arg1 = 0,
+        int timeout = 1,
+        uint32_t* return_3 = nullptr,
+        uint32_t* return_4 = nullptr);
     virtual bool using_harvested_soc_descriptors();
     virtual std::unordered_map<chip_id_t, uint32_t> get_harvesting_masks_for_soc_descriptors();
-    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c);
+    virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c);
     virtual int get_number_of_chips_in_cluster();
     virtual std::unordered_set<chip_id_t> get_all_chips_in_cluster();
     virtual tt_ClusterDescriptor* get_cluster_description();
@@ -691,13 +772,16 @@ class tt_SiliconDevice: public tt_device
     static std::vector<chip_id_t> detect_available_device_ids();
     virtual std::set<chip_id_t> get_target_mmio_device_ids();
     virtual std::set<chip_id_t> get_target_remote_device_ids();
-    virtual std::map<int,int> get_clocks();
-    virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const;
+    virtual std::map<int, int> get_clocks();
+    virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const;
     virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const;
-    static std::vector<int> extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows);
-    static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector<int>& row_coordinates_to_remove);
+    static std::vector<int> extract_rows_to_remove(
+        const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows);
+    static void remove_worker_row_from_descriptor(
+        tt_SocDescriptor& full_soc_descriptor, const std::vector<int>& row_coordinates_to_remove);
     static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows);
-    static std::unordered_map<tt_xy_pair, tt_xy_pair> create_harvested_coord_translation(const tt::ARCH arch, bool identity_map);
+    static std::unordered_map<tt_xy_pair, tt_xy_pair> create_harvested_coord_translation(
+        const tt::ARCH arch, bool identity_map);
     std::unordered_map<tt_xy_pair, tt_xy_pair> get_harvested_coord_translation_map(chip_id_t logical_device_id);
     virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id);
     virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel);
@@ -706,70 +790,144 @@ class tt_SiliconDevice: public tt_device
     virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id);
     virtual tt_version get_ethernet_fw_version() const;
     // TODO: This should be accessible through public API, probably to be moved to tt_device.
-    PCIDevice *get_pci_device(int device_id) const;
+    PCIDevice* get_pci_device(int device_id) const;
 
     // Destructor
-    virtual ~tt_SiliconDevice ();
+    virtual ~tt_SiliconDevice();
 
-    private:
+private:
     // Helper functions
     // Startup + teardown
-    void create_device(const std::unordered_set<chip_id_t> &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources);
+    void create_device(
+        const std::unordered_set<chip_id_t>& target_mmio_device_ids,
+        const uint32_t& num_host_mem_ch_per_mmio_device,
+        const bool skip_driver_allocs,
+        const bool clean_system_resources);
     void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm);
     void cleanup_shared_host_state();
     void initialize_pcie_devices();
-    void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &cores);
-    void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets);
-    void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets);
-    void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets);
+    void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores);
+    void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets);
+    void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets);
+    void send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets);
     void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting);
     void populate_cores();
-    void init_pcie_iatus(); // No more p2p support.
+    void init_pcie_iatus();  // No more p2p support.
     bool init_hugepage(chip_id_t device_id);
     void check_pcie_device_initialized(int device_id);
     void set_pcie_power_state(tt_DevicePowerState state);
-    int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state);
+    int set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state);
     void set_power_state(tt_DevicePowerState state);
     uint32_t get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state);
     void enable_local_ethernet_queue(const chip_id_t& chip, int timeout);
     void enable_ethernet_queue(int timeout);
     void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout);
     void deassert_resets_and_set_power_state();
-    int open_hugepage_file(const std::string &dir, chip_id_t device_id, uint16_t channel);
-    int iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size);
-    uint32_t get_harvested_noc_rows (uint32_t harvesting_mask);
-    uint32_t get_harvested_rows (int logical_device_id);
+    int open_hugepage_file(const std::string& dir, chip_id_t device_id, uint16_t channel);
+    int iatu_configure_peer_region(
+        int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size);
+    uint32_t get_harvested_noc_rows(uint32_t harvesting_mask);
+    uint32_t get_harvested_rows(int logical_device_id);
     int get_clock(int logical_device_id);
 
     // Communication Functions
-    void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id);
-    void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id);
-    void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb);
-    void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector<int> broadcast_header = {});
-    void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb);
+    void read_buffer(
+        void* mem_ptr,
+        std::uint32_t address,
+        std::uint16_t channel,
+        std::uint32_t size_in_bytes,
+        chip_id_t src_device_id);
+    void write_buffer(
+        const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id);
+    void write_device_memory(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        tt_cxy_pair target,
+        std::uint32_t address,
+        const std::string& fallback_tlb);
+    void write_to_non_mmio_device(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        tt_cxy_pair core,
+        uint64_t address,
+        bool broadcast = false,
+        std::vector<int> broadcast_header = {});
+    void read_device_memory(
+        void* mem_ptr,
+        tt_cxy_pair target,
+        std::uint32_t address,
+        std::uint32_t size_in_bytes,
+        const std::string& fallback_tlb);
     void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes);
-    void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
-    void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
-    void pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb);
-    void ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set<chip_id_t>& chips_to_exclude, const std::set<uint32_t>& rows_to_exclude, 
-                                  std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords);
-    void set_membar_flag(const chip_id_t chip, const std::unordered_set<tt_xy_pair>& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb);
-    void insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set<tt_xy_pair>& cores, const uint32_t barrier_addr, const std::string& fallback_tlb);
+    void read_mmio_device_register(
+        void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
+    void write_mmio_device_register(
+        const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb);
+    void pcie_broadcast_write(
+        chip_id_t chip,
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        std::uint32_t addr,
+        const tt_xy_pair& start,
+        const tt_xy_pair& end,
+        const std::string& fallback_tlb);
+    void ethernet_broadcast_write(
+        const void* mem_ptr,
+        uint32_t size_in_bytes,
+        uint64_t address,
+        const std::set<chip_id_t>& chips_to_exclude,
+        const std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& cols_to_exclude,
+        const std::string& fallback_tlb,
+        bool use_virtual_coords);
+    void set_membar_flag(
+        const chip_id_t chip,
+        const std::unordered_set<tt_xy_pair>& cores,
+        const uint32_t barrier_value,
+        const uint32_t barrier_addr,
+        const std::string& fallback_tlb);
+    void insert_host_to_device_barrier(
+        const chip_id_t chip,
+        const std::unordered_set<tt_xy_pair>& cores,
+        const uint32_t barrier_addr,
+        const std::string& fallback_tlb);
     void init_membars();
     uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset);
     uint16_t get_sys_rack(uint32_t rack_x, uint32_t rack_y);
     bool is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr);
-    int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr);
-    int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr);
-    bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
+    int pcie_arc_msg(
+        int logical_device_id,
+        uint32_t msg_code,
+        bool wait_for_done = true,
+        uint32_t arg0 = 0,
+        uint32_t arg1 = 0,
+        int timeout = 1,
+        uint32_t* return_3 = nullptr,
+        uint32_t* return_4 = nullptr);
+    int remote_arc_msg(
+        int logical_device_id,
+        uint32_t msg_code,
+        bool wait_for_done = true,
+        uint32_t arg0 = 0,
+        uint32_t arg1 = 0,
+        int timeout = 1,
+        uint32_t* return_3 = nullptr,
+        uint32_t* return_4 = nullptr);
+    bool address_in_tlb_space(
+        uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
     std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
-    virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
-    void generate_tensix_broadcast_grids_for_grayskull( std::set<std::pair<tt_xy_pair, tt_xy_pair>>& broadcast_grids, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude);
-    std::unordered_map<chip_id_t, std::vector<std::vector<int>>>&  get_ethernet_broadcast_headers(const std::set<chip_id_t>& chips_to_exclude);
+    virtual uint32_t get_harvested_noc_rows_for_chip(
+        int logical_device_id);  // Returns one-hot encoded harvesting mask for PCIe mapped chips
+    void generate_tensix_broadcast_grids_for_grayskull(
+        std::set<std::pair<tt_xy_pair, tt_xy_pair>>& broadcast_grids,
+        std::set<uint32_t>& rows_to_exclude,
+        std::set<uint32_t>& cols_to_exclude);
+    std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& get_ethernet_broadcast_headers(
+        const std::set<chip_id_t>& chips_to_exclude);
     // Test functions
     void verify_eth_fw();
-    void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector<std::uint32_t> &fw_versions);
-    int test_setup_interface ();
+    void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector<std::uint32_t>& fw_versions);
+    int test_setup_interface();
 
     // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush.
     void wait_for_connected_non_mmio_flush(chip_id_t chip_id);
@@ -783,22 +941,24 @@ class tt_SiliconDevice: public tt_device
     std::set<chip_id_t> target_devices_in_cluster = {};
     std::set<chip_id_t> target_remote_chips = {};
     tt::ARCH arch_name;
-    std::unordered_map<chip_id_t, std::unique_ptr<PCIDevice>> m_pci_device_map;    // Map of enabled pci devices
-    int m_num_pci_devices;                                      // Number of pci devices in system (enabled or disabled)
+    std::unordered_map<chip_id_t, std::unique_ptr<PCIDevice>> m_pci_device_map;  // Map of enabled pci devices
+    int m_num_pci_devices;  // Number of pci devices in system (enabled or disabled)
     std::shared_ptr<tt_ClusterDescriptor> ndesc;
 
     // remote eth transfer setup
     static constexpr std::uint32_t NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 6;
     static constexpr std::uint32_t NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 4;
     static constexpr std::uint32_t NON_EPOCH_ETH_CORES_START_ID = 0;
-    static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1);
+    static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1);
 
-    static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS;
-    static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS;
-    static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1);
+    static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS =
+        NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS;
+    static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID =
+        NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS;
+    static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1);
 
     int active_core = NON_EPOCH_ETH_CORES_START_ID;
-    std::vector< std::vector<tt_cxy_pair> > remote_transfer_ethernet_cores;
+    std::vector<std::vector<tt_cxy_pair>> remote_transfer_ethernet_cores;
     std::unordered_map<chip_id_t, bool> flush_non_mmio_per_chip = {};
     bool non_mmio_transfer_cores_customized = false;
     std::unordered_map<chip_id_t, int> active_eth_core_idx_per_chip = {};
@@ -810,7 +970,7 @@ class tt_SiliconDevice: public tt_device
     std::unordered_set<tt_xy_pair> eth_cores = {};
     std::unordered_set<tt_xy_pair> dram_cores = {};
     uint32_t m_num_host_mem_channels = 0;
-    std::unordered_map<chip_id_t, std::unordered_map<int, void *>> hugepage_mapping;
+    std::unordered_map<chip_id_t, std::unordered_map<int, void*>> hugepage_mapping;
     std::unordered_map<chip_id_t, std::unordered_map<int, std::size_t>> hugepage_mapping_size;
     std::unordered_map<chip_id_t, std::unordered_map<int, std::uint64_t>> hugepage_physical_address;
     std::map<chip_id_t, std::unordered_map<std::int32_t, std::int32_t>> tlb_config_map = {};
@@ -828,7 +988,7 @@ class tt_SiliconDevice: public tt_device
     bool use_ethernet_ordered_writes = true;
     bool use_ethernet_broadcast = true;
     bool use_virtual_coords_for_eth_broadcast = true;
-    tt_version eth_fw_version; // Ethernet FW the driver is interfacing with
+    tt_version eth_fw_version;  // Ethernet FW the driver is interfacing with
     // Named Mutexes
     static constexpr char NON_MMIO_MUTEX_NAME[] = "NON_MMIO";
     static constexpr char ARC_MSG_MUTEX_NAME[] = "ARC_MSG";
@@ -839,11 +999,11 @@ class tt_SiliconDevice: public tt_device
 
 uint32_t get_num_hugepages();
 
-constexpr inline bool operator==(const tt_version &a, const tt_version &b) {
+constexpr inline bool operator==(const tt_version& a, const tt_version& b) {
     return a.major == b.major && a.minor == b.minor && a.patch == b.patch;
 }
 
-constexpr inline bool operator>=(const tt_version &a, const tt_version &b) {
+constexpr inline bool operator>=(const tt_version& a, const tt_version& b) {
     bool fw_major_greater = a.major > b.major;
     bool fw_minor_greater = (a.major == b.major) && (a.minor > b.minor);
     bool patch_greater_or_equal = (a.major == b.major) && (a.minor == b.minor) && (a.patch >= b.patch);
diff --git a/device/tt_io.hpp b/device/tt_io.hpp
index 5daa7262..304f2ee8 100644
--- a/device/tt_io.hpp
+++ b/device/tt_io.hpp
@@ -20,20 +20,18 @@ namespace tt {
  *
  * It is the caller's responsibility to manage the lifetime of Writer objects.
  */
-class Writer
-{
+class Writer {
     friend class ::tt_SiliconDevice;
 
 public:
     /**
      * @brief Write to a SoC core.
-     * 
+     *
      * @param address must be aligned to the size of T
-     * @param value 
+     * @param value
      */
     template <class T>
-    void write(uint32_t address, T value)
-    {
+    void write(uint32_t address, T value) {
         auto dst = reinterpret_cast<uintptr_t>(base) + address;
 
         if (address >= tlb_size) {
@@ -44,27 +42,23 @@ class Writer
             throw std::runtime_error("Unaligned write");
         }
 
-        *reinterpret_cast<volatile T*>(dst) = value;
+        *reinterpret_cast<volatile T *>(dst) = value;
     }
 
 private:
     /**
      * @brief tt_SiliconDriver interface to construct a new Writer object.
-     * 
+     *
      * @param base pointer to the base address of a mapped TLB.
      * @param tlb_size size of the mapped TLB.
      */
-    Writer(void *base, size_t tlb_size)
-        : base(base)
-        , tlb_size(tlb_size)
-    {
+    Writer(void *base, size_t tlb_size) : base(base), tlb_size(tlb_size) {
         assert(base);
         assert(tlb_size > 0);
     }
 
-    void *base{ nullptr };
-    size_t tlb_size{ 0 };
+    void *base{nullptr};
+    size_t tlb_size{0};
 };
 
-
-} // namespace tt
+}  // namespace tt
diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp
index 6164f6fc..0bc899fb 100644
--- a/device/tt_silicon_driver.cpp
+++ b/device/tt_silicon_driver.cpp
@@ -2,58 +2,56 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <algorithm>
 #include <boost/interprocess/permissions.hpp>
-#include <boost/interprocess/sync/scoped_lock.hpp>
 #include <boost/interprocess/sync/named_mutex.hpp>
-
+#include <boost/interprocess/sync/scoped_lock.hpp>
+#include <cerrno>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
 #include <fstream>
 #include <iterator>
 #include <limits>
 #include <map>
-#include <vector>
 #include <memory>
 #include <mutex>
+#include <optional>
+#include <ratio>
 #include <regex>
 #include <stdexcept>
 #include <string>
 #include <utility>
-#include <cstddef>
-#include <cstdint>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <cstdlib>
-#include <cerrno>
-#include <chrono>
-#include <ratio>
-#include <algorithm>
-#include <filesystem>
-#include <stdarg.h>
-#include <optional>
-
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <dirent.h>
-#include <errno.h>
+#include <vector>
 
-#include "yaml-cpp/yaml.h"
 #include "common/logger.hpp"
-
+#include "device/architecture_implementation.h"
 #include "device/cpuset_lib.hpp"
 #include "device/driver_atomics.h"
-#include "device/architecture_implementation.h"
 #include "device/tlb.h"
 #include "device/tt_arch_types.h"
-#include "tt_device.h"
 #include "ioctl.h"
+#include "tt_device.h"
+#include "yaml-cpp/yaml.h"
 
 using namespace boost::interprocess;
 using namespace tt;
 
 const uint32_t g_MAX_HOST_MEM_CHANNELS = 4;
 
-const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB
+const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30;  // 1GB
 const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1;
 
 static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF;
@@ -99,7 +97,7 @@ tt::ARCH detect_arch() {
 }
 
 template <typename T>
-void size_buffer_to_capacity(std::vector<T> &data_buf, std::size_t size_in_bytes) {
+void size_buffer_to_capacity(std::vector<T>& data_buf, std::size_t size_in_bytes) {
     std::size_t target_size = 0;
     if (size_in_bytes > 0) {
         target_size = ((size_in_bytes - 1) / sizeof(T)) + 1;
@@ -108,14 +106,13 @@ void size_buffer_to_capacity(std::vector<T> &data_buf, std::size_t size_in_bytes
 }
 
 // Get number of 1GB host hugepages installed. They are used for host queues.
-uint32_t get_num_hugepages(){
-
+uint32_t get_num_hugepages() {
     log_assert(HUGEPAGE_REGION_SIZE == 1 << 30, "Hugepages must be 1GB in size");
     std::string nr_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages";
     std::ifstream hugepages_file(nr_hugepages_path);
     uint32_t num_hugepages = 0;
 
-    if(hugepages_file.is_open()) {
+    if (hugepages_file.is_open()) {
         std::string value;
         std::getline(hugepages_file, value);
         num_hugepages = std::stoi(value);
@@ -125,56 +122,72 @@ uint32_t get_num_hugepages(){
     }
 
     return num_hugepages;
-
 }
 
 // Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch.
-uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) {
-
+uint32_t get_available_num_host_mem_channels(
+    const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) {
     // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id.
-    uint32_t total_num_tt_mmio_devices      = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices();
-    uint32_t num_tt_mmio_devices_for_arch   = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
-    uint32_t total_hugepages                = get_num_hugepages();
+    uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices();
+    uint32_t num_tt_mmio_devices_for_arch =
+        tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
+    uint32_t total_hugepages = get_num_hugepages();
 
     // This shouldn't happen on silicon machines.
     if (num_tt_mmio_devices_for_arch == 0) {
-        log_warning(LogSiliconDriver,
+        log_warning(
+            LogSiliconDriver,
             "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0",
-            device_id, revision_id);
+            device_id,
+            revision_id);
         return 0;
     }
 
-    // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups that were incomplete
-    // ie fewer hugepages than devices, which would partially work previously for some devices.
-    uint32_t num_channels_per_device_available = std::min(num_channels_per_device_target, std::max((uint32_t) 1, total_hugepages / num_tt_mmio_devices_for_arch));
+    // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups
+    // that were incomplete, i.e. fewer hugepages than devices, which would partially work previously for some devices.
+    uint32_t num_channels_per_device_available =
+        std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch));
 
-    // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later on.
+    // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later
+    // on.
     if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) {
-        log_warning(LogSiliconDriver,
-            "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient Hugepages/HostMemChannels per device.");
+        log_warning(
+            LogSiliconDriver,
+            "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient "
+            "Hugepages/HostMemChannels per device.");
     }
 
     if (total_hugepages < num_tt_mmio_devices_for_arch) {
-        log_warning(LogSiliconDriver,
-            "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. NumHostMemChannels would be 0, bumping to 1.",
-            total_hugepages, num_tt_mmio_devices_for_arch, device_id, revision_id);
+        log_warning(
+            LogSiliconDriver,
+            "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. "
+            "NumHostMemChannels would be 0, bumping to 1.",
+            total_hugepages,
+            num_tt_mmio_devices_for_arch,
+            device_id,
+            revision_id);
     }
 
     if (num_channels_per_device_available < num_channels_per_device_target) {
-        log_warning(LogSiliconDriver,
-            "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds NumHostMemChannels. Increase Number of Hugepages.",
-            num_channels_per_device_available, device_id, num_channels_per_device_target);
+        log_warning(
+            LogSiliconDriver,
+            "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds "
+            "NumHostMemChannels. Increase Number of Hugepages.",
+            num_channels_per_device_available,
+            device_id,
+            num_channels_per_device_target);
     }
 
-    log_assert(num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS,
+    log_assert(
+        num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS,
         "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.",
-        num_channels_per_device_available, g_MAX_HOST_MEM_CHANNELS);
+        num_channels_per_device_available,
+        g_MAX_HOST_MEM_CHANNELS);
 
     return num_channels_per_device_available;
-
 }
 
-bool is_char_dev(const dirent *ent, const char *parent_dir) {
+bool is_char_dev(const dirent* ent, const char* parent_dir) {
     if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) {
         char name[2 * NAME_MAX + 2];
         strcpy(name, parent_dir);
@@ -192,18 +205,16 @@ bool is_char_dev(const dirent *ent, const char *parent_dir) {
     }
 }
 
-
-
 // --------------------------------------------------------------------------------------------------------------
 // --------------------------------------------------------------------------------------------------------------
 // --------------------------------------------------------------------------------------------------------------
 
-#include "tt_silicon_driver_common.hpp"
-#include "tt_xy_pair.h"
-#include <thread>
 #include <fstream>
 #include <iomanip>
+#include <thread>
 
+#include "tt_silicon_driver_common.hpp"
+#include "tt_xy_pair.h"
 
 struct routing_cmd_t {
     uint64_t sys_addr;
@@ -212,47 +223,51 @@ struct routing_cmd_t {
     uint16_t rack;
     uint16_t src_resp_buf_index;
     uint32_t local_buf_index;
-    uint8_t  src_resp_q_id;
-    uint8_t  host_mem_txn_id;
+    uint8_t src_resp_q_id;
+    uint8_t host_mem_txn_id;
     uint16_t padding;
-    uint32_t src_addr_tag; //upper 32-bits of request source address.
+    uint32_t src_addr_tag;  // upper 32-bits of request source address.
 };
 
-struct remote_update_ptr_t{
-  uint32_t ptr;
-  uint32_t pad[3];
+struct remote_update_ptr_t {
+    uint32_t ptr;
+    uint32_t pad[3];
 };
 
 namespace {
-    struct tt_4_byte_aligned_buffer {
-        // Stores a 4 byte aligned buffer
-        // If the input buffer is already 4 byte aligned, this is a nop
-        std::uint32_t* local_storage = nullptr;
-        std::uint32_t input_size = 0;
-        std::uint32_t block_size = 0;
-
-        tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) {
-            input_size = size_in_bytes;
-            local_storage = (uint32_t*)mem_ptr;
-            uint32_t alignment_mask = sizeof(uint32_t) - 1;
-            uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask;
+struct tt_4_byte_aligned_buffer {
+    // Stores a 4 byte aligned buffer
+    // If the input buffer is already 4 byte aligned, this is a nop
+    std::uint32_t* local_storage = nullptr;
+    std::uint32_t input_size = 0;
+    std::uint32_t block_size = 0;
 
-            if(size_in_bytes < aligned_size) {
-                local_storage = new uint32_t[aligned_size / sizeof(uint32_t)];
-            }
-            block_size = aligned_size;
+    tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) {
+        input_size = size_in_bytes;
+        local_storage = (uint32_t*)mem_ptr;
+        uint32_t alignment_mask = sizeof(uint32_t) - 1;
+        uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask;
+
+        if (size_in_bytes < aligned_size) {
+            local_storage = new uint32_t[aligned_size / sizeof(uint32_t)];
         }
+        block_size = aligned_size;
+    }
 
-        ~tt_4_byte_aligned_buffer() {
-            if(block_size > input_size) {
-                delete [] local_storage;
-            }
+    ~tt_4_byte_aligned_buffer() {
+        if (block_size > input_size) {
+            delete[] local_storage;
         }
-    };
-}
+    }
+};
+}  // namespace
 
-bool tt_SiliconDevice::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) {
-    return ((tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && address >= tlb_config_map.at(chip).at(tlb_index) && (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size));
+bool tt_SiliconDevice::address_in_tlb_space(
+    uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) {
+    return (
+        (tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) &&
+        address >= tlb_config_map.at(chip).at(tlb_index) &&
+        (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size));
 }
 
 std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_SiliconDevice::get_virtual_soc_descriptors() {
@@ -260,10 +275,10 @@ std::unordered_map<chip_id_t, tt_SocDescriptor>& tt_SiliconDevice::get_virtual_s
 }
 
 void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) {
-    // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here (during device init)
-    // since its unsafe to modify shared state during multithreaded runtime.
-    // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible for initializing the driver with this
-    // field set to cleanup after an aborted process.
+    // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here
+    // (during device init) since it's unsafe to modify shared state during multithreaded runtime.
+    // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible
+    // for initializing the driver with this field set to clean up after an aborted process.
 
     // Store old mask and clear processes umask
     auto old_umask = umask(0);
@@ -272,70 +287,108 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo
     std::string mutex_name = "";
 
     // Initialize Dynamic TLB mutexes
-    for(auto &tlb : dynamic_tlb_config) {
+    for (auto& tlb : dynamic_tlb_config) {
         mutex_name = tlb.first + std::to_string(pci_interface_id);
-        if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str());
-        hardware_resource_mutex_map[mutex_name] = std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
+        if (cleanup_mutexes_in_shm) {
+            named_mutex::remove(mutex_name.c_str());
+        }
+        hardware_resource_mutex_map[mutex_name] =
+            std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
     }
 
     // Initialize ARC core mutex
     mutex_name = fmt::format("ARC_MSG{}", pci_interface_id);
-    if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str());
-    hardware_resource_mutex_map[mutex_name] = std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
+    if (cleanup_mutexes_in_shm) {
+        named_mutex::remove(mutex_name.c_str());
+    }
+    hardware_resource_mutex_map[mutex_name] =
+        std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
 
     if (arch_name == tt::ARCH::WORMHOLE_B0) {
         mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id);
-        // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast
-        if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str());
-        hardware_resource_mutex_map[mutex_name] = std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
+        // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for
+        // ethernet broadcast
+        if (cleanup_mutexes_in_shm) {
+            named_mutex::remove(mutex_name.c_str());
+        }
+        hardware_resource_mutex_map[mutex_name] =
+            std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
     }
 
     // Initialize interprocess mutexes to make host -> device memory barriers atomic
     mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id);
-    if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str());
-    hardware_resource_mutex_map[mutex_name] = std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
-    
+    if (cleanup_mutexes_in_shm) {
+        named_mutex::remove(mutex_name.c_str());
+    }
+    hardware_resource_mutex_map[mutex_name] =
+        std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);
+
     // Restore old mask
     umask(old_umask);
 }
 
-void tt_SiliconDevice::create_device(const std::unordered_set<chip_id_t> &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) {
+void tt_SiliconDevice::create_device(
+    const std::unordered_set<chip_id_t>& target_mmio_device_ids,
+    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const bool skip_driver_allocs,
+    const bool clean_system_resources) {
     log_debug(LogSiliconDriver, "tt_SiliconDevice::tt_SiliconDevice");
 
     // Don't buffer stdout.
     setbuf(stdout, NULL);
 
-    // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to use available devices.
+    // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to
+    // use available devices.
     auto logical_to_physical_device_id_map = ndesc->get_chips_with_mmio();
 
-    log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now.");
+    log_assert(
+        target_mmio_device_ids.size() > 0,
+        "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now.");
 
-    for (const chip_id_t &logical_device_id : target_mmio_device_ids) {
-        log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id);
+    for (const chip_id_t& logical_device_id : target_mmio_device_ids) {
+        log_assert(
+            logical_to_physical_device_id_map.count(logical_device_id) != 0,
+            "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map",
+            logical_device_id);
         int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id);
 
         if (!m_pci_device_map.count(logical_device_id)) {
-            log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id);
-            m_pci_device_map.insert({logical_device_id, std::make_unique<PCIDevice>(pci_interface_id, logical_device_id)});
+            log_debug(
+                LogSiliconDriver,
+                "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}",
+                pci_interface_id,
+                logical_device_id);
+            m_pci_device_map.insert(
+                {logical_device_id, std::make_unique<PCIDevice>(pci_interface_id, logical_device_id)});
         }
         auto dev = m_pci_device_map.at(logical_device_id).get();
 
         uint16_t pcie_device_id = dev->get_pci_device_id();
         uint32_t pcie_revision = dev->get_pci_revision();
         // TODO: get rid of this, it doesn't make any sense.
-        m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
+        m_num_host_mem_channels =
+            get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
         if (dev->get_arch() == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) {
             // TODO: Implement support for multiple host channels on BLACKHOLE.
-            log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported.");
+            log_warning(
+                LogSiliconDriver,
+                "Forcing a single channel for Blackhole device. Multiple host channels not supported.");
             m_num_host_mem_channels = 1;
         }
 
-        log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})",
-            m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id);
+        log_debug(
+            LogSiliconDriver,
+            "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} "
+            "device_id: 0x{:x} revision: {})",
+            m_num_host_mem_channels,
+            logical_device_id,
+            pci_interface_id,
+            pci_device->get_device_num(),
+            pci_device->revision_id);
 
         // Initialize these. Used to be in header file.
-        for (int ch = 0; ch < g_MAX_HOST_MEM_CHANNELS; ch ++) {
-            hugepage_mapping[logical_device_id][ch]= nullptr;
+        for (int ch = 0; ch < g_MAX_HOST_MEM_CHANNELS; ch++) {
+            hugepage_mapping[logical_device_id][ch] = nullptr;
             hugepage_mapping_size[logical_device_id][ch] = 0;
             hugepage_physical_address[logical_device_id][ch] = 0;
         }
@@ -344,49 +397,63 @@ void tt_SiliconDevice::create_device(const std::unordered_set<chip_id_t> &target
 
         // MT: Initial BH - hugepages will fail init
         // For using silicon driver without workload to query mission mode params, no need for hugepage.
-        if (!skip_driver_allocs){
+        if (!skip_driver_allocs) {
             bool hugepages_initialized = init_hugepage(logical_device_id);
             // Large writes to remote chips require hugepages to be initialized.
-            // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused if using remote only for small transactions)
-            if(target_remote_chips.size()) {
-                log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!");
+            // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused
+            // if using remote only for small transactions)
+            if (target_remote_chips.size()) {
+                log_assert(
+                    hugepages_initialized,
+                    "Hugepages must be successfully initialized if workload contains remote chips!");
             }
             if (not hugepage_mapping.at(logical_device_id).at(0)) {
                 log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id);
             }
         }
-        harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map
+        harvested_coord_translation.insert(
+            {logical_device_id,
+             create_harvested_coord_translation(
+                 arch_name, true)});  // translation layer for harvested coords. Default is identity map
     }
 
-    for(const chip_id_t& chip : target_devices_in_cluster) {
+    for (const chip_id_t& chip : target_devices_in_cluster) {
         // Initialize identity mapping for Non-MMIO chips as well
-        if(!ndesc -> is_chip_mmio_capable(chip)) {
+        if (!ndesc->is_chip_mmio_capable(chip)) {
             harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, true)});
             flush_non_mmio_per_chip[chip] = false;
         }
     }
 }
 
-bool tt_SiliconDevice::using_harvested_soc_descriptors() {
-    return perform_harvesting_on_sdesc && performed_harvesting;
-}
+bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; }
 
-std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::get_harvested_coord_translation_map(chip_id_t logical_device_id) {
+std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::get_harvested_coord_translation_map(
+    chip_id_t logical_device_id) {
     return harvested_coord_translation.at(logical_device_id);
 }
 
 std::unordered_map<chip_id_t, uint32_t> tt_SiliconDevice::get_harvesting_masks_for_soc_descriptors() {
-    if(using_harvested_soc_descriptors()) {
+    if (using_harvested_soc_descriptors()) {
         return harvested_rows_per_target;
     }
     std::unordered_map<chip_id_t, uint32_t> default_harvesting_masks = {};
-    for(const auto chip : target_devices_in_cluster) default_harvesting_masks.insert({chip, 0});
+    for (const auto chip : target_devices_in_cluster) {
+        default_harvesting_masks.insert({chip, 0});
+    }
     return default_harvesting_masks;
 }
 
-tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::string &ndesc_path, const std::set<chip_id_t> &target_devices, 
-                                   const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs,
-                                   const bool clean_system_resources, bool perform_harvesting, std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks) : tt_device(sdesc_path) {
+tt_SiliconDevice::tt_SiliconDevice(
+    const std::string& sdesc_path,
+    const std::string& ndesc_path,
+    const std::set<chip_id_t>& target_devices,
+    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const bool skip_driver_allocs,
+    const bool clean_system_resources,
+    bool perform_harvesting,
+    std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks) :
+    tt_device(sdesc_path) {
     std::unordered_set<chip_id_t> target_mmio_device_ids;
     target_devices_in_cluster = target_devices;
     arch_name = tt_SocDescriptor(sdesc_path).arch;
@@ -396,117 +463,143 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str
     m_num_pci_devices = available_device_ids.size();
 
     if (!skip_driver_allocs) {
-        log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids);
+        log_info(
+            LogSiliconDriver,
+            "Detected {} PCI device{} : {}",
+            m_num_pci_devices,
+            (m_num_pci_devices > 1) ? "s" : "",
+            available_device_ids);
         log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices);
     }
 
     if (ndesc_path == "") {
         ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, available_device_ids);
-    }
-    else {
+    } else {
         ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path);
     }
 
-    for (auto &d: target_devices){
-        if (ndesc->is_chip_mmio_capable(d)){
+    for (auto& d : target_devices) {
+        if (ndesc->is_chip_mmio_capable(d)) {
             target_mmio_device_ids.insert(d);
-        }
-        else {
+        } else {
             target_remote_chips.insert(d);
         }
     }
 
-    // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes.
+    // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and
+    // writes.
     auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
-    dynamic_tlb_config["LARGE_READ_TLB"] =  architecture_implementation->get_mem_large_read_tlb();
+    dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb();
     dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb();
     dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb();
     dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb();
 
-    for(const auto& tlb : dynamic_tlb_config) {
-        dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH
+    for (const auto& tlb : dynamic_tlb_config) {
+        dynamic_tlb_ordering_modes.insert(
+            {tlb.first, TLB_DATA::Relaxed});  // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH
     }
     create_device(target_mmio_device_ids, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources);
 
     // MT: Initial BH - Disable dependency to ethernet firmware
-    if(arch_name == tt::ARCH::BLACKHOLE) {
+    if (arch_name == tt::ARCH::BLACKHOLE) {
         use_ethernet_ordered_writes = false;
         use_ethernet_broadcast = false;
         use_virtual_coords_for_eth_broadcast = false;
     }
 
-    if(arch_name == tt::ARCH::WORMHOLE_B0) {
-        const auto& harvesting_masks = ndesc -> get_harvesting_info();
-        const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en();
+    if (arch_name == tt::ARCH::WORMHOLE_B0) {
+        const auto& harvesting_masks = ndesc->get_harvesting_info();
+        const auto& noc_translation_enabled = ndesc->get_noc_translation_table_en();
 
         translation_tables_en = false;
-        for(auto& masks : harvesting_masks) {
-            if(target_devices.find(masks.first) != target_devices.end()) {
+        for (auto& masks : harvesting_masks) {
+            if (target_devices.find(masks.first) != target_devices.end()) {
                 harvested_rows_per_target[masks.first] = get_harvested_noc_rows(masks.second);
                 noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first);
                 num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()});
-                if(harvested_rows_per_target[masks.first]) {
+                if (harvested_rows_per_target[masks.first]) {
                     performed_harvesting = true;
                 }
             }
         }
-        if(noc_translation_enabled_for_chip.size() > 0) {
-            auto const consistent_translation_table_state = [&] (std::pair<chip_id_t, bool> const& i) {
-                return noc_translation_enabled_for_chip.begin() -> second == i.second;
+        if (noc_translation_enabled_for_chip.size() > 0) {
+            auto const consistent_translation_table_state = [&](std::pair<chip_id_t, bool> const& i) {
+                return noc_translation_enabled_for_chip.begin()->second == i.second;
             };
 
-            bool translation_tables_match_on_all_chips = std::all_of(noc_translation_enabled_for_chip.begin(), noc_translation_enabled_for_chip.end(), consistent_translation_table_state);
-            log_assert(translation_tables_match_on_all_chips, "Cluster uses NOC translation tables inconsistently across chips.");
-            translation_tables_en = noc_translation_enabled_for_chip.begin() -> second;
+            bool translation_tables_match_on_all_chips = std::all_of(
+                noc_translation_enabled_for_chip.begin(),
+                noc_translation_enabled_for_chip.end(),
+                consistent_translation_table_state);
+            log_assert(
+                translation_tables_match_on_all_chips,
+                "Cluster uses NOC translation tables inconsistently across chips.");
+            translation_tables_en = noc_translation_enabled_for_chip.begin()->second;
         }
 
-        if(translation_tables_en) {
+        if (translation_tables_en) {
             harvested_coord_translation.clear();
-            for(const chip_id_t& chip : target_devices_in_cluster) {
+            for (const chip_id_t& chip : target_devices_in_cluster) {
                 harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, false)});
             }
         }
-        log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled.");
-    }
-    else if(arch_name == tt::ARCH::BLACKHOLE) {
+        log_assert(
+            performed_harvesting ? translation_tables_en : true,
+            "Using a harvested WH cluster with NOC translation disabled.");
+    } else if (arch_name == tt::ARCH::BLACKHOLE) {
         // Default harvesting info for Blackhole, describing no harvesting
-        for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){
-            harvested_rows_per_target[*chip_id] =  0; //get_harvested_noc_rows_for_chip(*chip_id);
-            num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent.
-            if(harvested_rows_per_target[*chip_id]) {
+        for (auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++) {
+            harvested_rows_per_target[*chip_id] = 0;   // get_harvested_noc_rows_for_chip(*chip_id);
+            num_rows_harvested.insert({*chip_id, 0});  // Only set for broadcast TLB to get RISCS out of reset. We want
+                                                       // all rows to have a reset signal sent.
+            if (harvested_rows_per_target[*chip_id]) {
                 performed_harvesting = true;
             }
         }
-    }
-    else if(arch_name == tt::ARCH::GRAYSKULL) {
+    } else if (arch_name == tt::ARCH::GRAYSKULL) {
         // Multichip harvesting is supported for GS.
-        for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){
-            harvested_rows_per_target[*chip_id] =  get_harvested_noc_rows_for_chip(*chip_id);
-            num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent.
-            if(harvested_rows_per_target[*chip_id]) {
+        for (auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++) {
+            harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id);
+            num_rows_harvested.insert({*chip_id, 0});  // Only set for broadcast TLB to get RISCS out of reset. We want
+                                                       // all rows to have a reset signal sent.
+            if (harvested_rows_per_target[*chip_id]) {
                 performed_harvesting = true;
             }
         }
     }
 
-    if(simulated_harvesting_masks.size()) {
+    if (simulated_harvesting_masks.size()) {
         performed_harvesting = true;
         for (auto device_id = target_devices.begin(); device_id != target_devices.end(); device_id++) {
-            log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id);
-            if(arch_name == tt::ARCH::GRAYSKULL) {
-                if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) {
-                    log_warning(LogSiliconDriver,
-                                "Simulated harvesting config for device {} does not include the actual harvesting config. Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : {}    Simulated Harvested Rows : {}",
-                                *device_id,  harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id));
+            log_assert(
+                simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(),
+                "Could not find harvesting mask for device_id {}",
+                *device_id);
+            if (arch_name == tt::ARCH::GRAYSKULL) {
+                if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) !=
+                    harvested_rows_per_target[*device_id]) {
+                    log_warning(
+                        LogSiliconDriver,
+                        "Simulated harvesting config for device {} does not include the actual harvesting config. "
+                        "Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : "
+                        "{}    Simulated Harvested Rows : {}",
+                        *device_id,
+                        harvested_rows_per_target[*device_id],
+                        simulated_harvesting_masks.at(*device_id));
                 }
                 simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id];
-            }
-            else if(arch_name == tt::ARCH::WORMHOLE_B0) {
-                log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(),
-                            "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. Actual Harvested Rows : {}  Simulated Harvested Rows : {}",
-                            harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id));
-                            num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count();
-                log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled.");
+            } else if (arch_name == tt::ARCH::WORMHOLE_B0) {
+                log_assert(
+                    std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >=
+                        std::bitset<32>(harvested_rows_per_target[*device_id]).count(),
+                    "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. "
+                    "Actual Harvested Rows : {}  Simulated Harvested Rows : {}",
+                    harvested_rows_per_target[*device_id],
+                    simulated_harvesting_masks.at(*device_id));
+                num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count();
+                log_assert(
+                    performed_harvesting ? translation_tables_en : true,
+                    "Using a harvested WH cluster with NOC translation disabled.");
             }
             harvested_rows_per_target[*device_id] = simulated_harvesting_masks.at(*device_id);
         }
@@ -516,38 +609,43 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str
     populate_cores();
 
     // MT: Initial BH - skip this for BH
-    if(arch_name == tt::ARCH::WORMHOLE_B0) {
+    if (arch_name == tt::ARCH::WORMHOLE_B0) {
         remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size());
-        for (const auto &logical_mmio_chip_id : target_mmio_device_ids) {
+        for (const auto& logical_mmio_chip_id : target_mmio_device_ids) {
             const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id);
             // 4-5 is for send_epoch_commands, 0-3 are for everything else
             for (std::uint32_t i = 0; i < NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS; i++) {
-                if(remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) {
+                if (remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) {
                     remote_transfer_ethernet_cores.resize(logical_mmio_chip_id + 1);
                 }
-                remote_transfer_ethernet_cores.at(logical_mmio_chip_id).push_back(
-                    tt_cxy_pair(logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y)
-                );
+                remote_transfer_ethernet_cores.at(logical_mmio_chip_id)
+                    .push_back(tt_cxy_pair(
+                        logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y));
             }
         }
     }
 
     // Default initialize host_address_params based on detected arch
     host_address_params = architecture_implementation->get_host_address_params();
-
 }
 
-void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip) {
+void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device(
+    chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip) {
     // Makes UMD aware of which ethernet cores have active links.
     // Based on this information, UMD determines which ethernet cores can be used for host->cluster non-MMIO transfers.
-    // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be called for all MMIO devices, if default behaviour
-    // is not desired.
-    log_assert(get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, "{} can only be called for Wormhole arch", __FUNCTION__);
+    // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be
+    // called for all MMIO devices, if default behaviour is not desired.
+    log_assert(
+        get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0,
+        "{} can only be called for Wormhole arch",
+        __FUNCTION__);
     auto& eth_cores = get_soc_descriptor(mmio_chip).ethernet_cores;
     // Cores 0, 1, 6, 7 are only available if in the active set
-    static std::unordered_set<tt_xy_pair> eth_cores_available_if_active = {eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)};
+    static std::unordered_set<tt_xy_pair> eth_cores_available_if_active = {
+        eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)};
     // Eth cores 8 and 9 are always available
-    std::vector<tt_cxy_pair> non_mmio_access_cores_for_chip = {tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))};
+    std::vector<tt_cxy_pair> non_mmio_access_cores_for_chip = {
+        tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))};
     for (const auto& active_eth_core : active_eth_cores_per_chip) {
         if (eth_cores_available_if_active.find(active_eth_core) != eth_cores_available_if_active.end()) {
             non_mmio_access_cores_for_chip.push_back(tt_cxy_pair(mmio_chip, active_eth_core));
@@ -561,27 +659,33 @@ void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device(chip_id_t
 
 void tt_SiliconDevice::populate_cores() {
     std::uint32_t count = 0;
-    for(const auto chip : soc_descriptor_per_chip) {
-        workers_per_chip.insert({chip.first, std::unordered_set<tt_xy_pair>(chip.second.workers.begin(), chip.second.workers.end())});
-        if(count == 0) {
-            eth_cores = std::unordered_set<tt_xy_pair>(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end());
-            for(std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) {
-                dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)) ;
+    for (const auto chip : soc_descriptor_per_chip) {
+        workers_per_chip.insert(
+            {chip.first, std::unordered_set<tt_xy_pair>(chip.second.workers.begin(), chip.second.workers.end())});
+        if (count == 0) {
+            eth_cores =
+                std::unordered_set<tt_xy_pair>(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end());
+            for (std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) {
+                dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0));
             }
         }
         count++;
     }
 }
 
-std::vector<int> tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) {
+std::vector<int> tt_SiliconDevice::extract_rows_to_remove(
+    const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows) {
     // Check if harvesting config is legal for GS and WH
-    log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested");
+    log_assert(
+        !((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)),
+        "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested");
     std::vector<int> row_coordinates_to_remove;
     int row_coordinate = 0;
     int tmp = harvested_rows;
     while (tmp) {
-        if (tmp & 1)
+        if (tmp & 1) {
             row_coordinates_to_remove.push_back(row_coordinate);
+        }
 
         tmp = tmp >> 1;
         row_coordinate++;
@@ -595,13 +699,14 @@ std::vector<int> tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch,
     return row_coordinates_to_remove;
 }
 
-void tt_SiliconDevice::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector<int>& row_coordinates_to_remove) {
+void tt_SiliconDevice::remove_worker_row_from_descriptor(
+    tt_SocDescriptor& full_soc_descriptor, const std::vector<int>& row_coordinates_to_remove) {
     std::vector<tt_xy_pair> workers_to_keep;
-    for(auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++){
-        if(find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == row_coordinates_to_remove.end()){
+    for (auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++) {
+        if (find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) ==
+            row_coordinates_to_remove.end()) {
             workers_to_keep.push_back(*worker);
-        }
-        else{
+        } else {
             (full_soc_descriptor.harvested_workers).push_back(*worker);
             full_soc_descriptor.cores.at(*worker).type = CoreType::HARVESTED;
         }
@@ -613,28 +718,32 @@ void tt_SiliconDevice::remove_worker_row_from_descriptor(tt_SocDescriptor& full_
 
     std::set<int> modified_y_coords = {};
 
-    for(const auto& core : full_soc_descriptor.workers) {
+    for (const auto& core : full_soc_descriptor.workers) {
         modified_y_coords.insert(core.y);
     }
     int logical_y_coord = 0;
-    for(const auto& y_coord : modified_y_coords) {
+    for (const auto& y_coord : modified_y_coords) {
         full_soc_descriptor.routing_y_to_worker_y.insert({y_coord, logical_y_coord});
-        full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord,  y_coord});
+        full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord});
         logical_y_coord++;
     }
 }
 
 void tt_SiliconDevice::harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows) {
-    std::uint32_t max_row_to_remove = (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [] (const auto& a, const auto& b) { return a.y < b.y; })).y;
+    std::uint32_t max_row_to_remove =
+        (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [](const auto& a, const auto& b) {
+            return a.y < b.y;
+        })).y;
     std::vector<int> row_coordinates_to_remove = extract_rows_to_remove(arch, max_row_to_remove, harvested_rows);
     remove_worker_row_from_descriptor(sdesc, row_coordinates_to_remove);
 }
 
-void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting) {
+void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(
+    const std::string& sdesc_path, const bool perform_harvesting) {
     const auto default_sdesc = tt_SocDescriptor(sdesc_path);
-    for(const auto& chip : harvested_rows_per_target) {
+    for (const auto& chip : harvested_rows_per_target) {
         auto temp_sdesc = default_sdesc;
-        if(perform_harvesting) {
+        if (perform_harvesting) {
             harvest_rows_in_soc_descriptor(arch_name, temp_sdesc, chip.second);
         }
         soc_descriptor_per_chip.insert({chip.first, temp_sdesc});
@@ -642,25 +751,24 @@ void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std
 }
 
 void tt_SiliconDevice::check_pcie_device_initialized(int device_id) {
-
-    PCIDevice *pci_device = get_pci_device(device_id);
+    PCIDevice* pci_device = get_pci_device(device_id);
     tt::ARCH device_arch = pci_device->get_arch();
     if (arch_name == tt::ARCH::GRAYSKULL) {
         if (device_arch != tt::ARCH::GRAYSKULL) {
-            throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch)));
+            throw std::runtime_error(
+                fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch)));
         }
-    }
-    else if (arch_name == tt::ARCH::WORMHOLE_B0) {
+    } else if (arch_name == tt::ARCH::WORMHOLE_B0) {
         if (device_arch != tt::ARCH::WORMHOLE_B0) {
-            throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch)));
+            throw std::runtime_error(
+                fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch)));
         }
-    }
-    else if (arch_name == tt::ARCH::BLACKHOLE) {
+    } else if (arch_name == tt::ARCH::BLACKHOLE) {
         if (device_arch != tt::ARCH::BLACKHOLE) {
-            throw std::runtime_error(fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch)));
+            throw std::runtime_error(
+                fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch)));
         }
-    }
-    else {
+    } else {
         throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name)));
     }
     auto architecture_implementation = pci_device->get_architecture_implementation();
@@ -668,29 +776,36 @@ void tt_SiliconDevice::check_pcie_device_initialized(int device_id) {
     // MT Initial BH - Add check for blackhole once access to ARC registers is setup through TLBs
     if (arch_name != tt::ARCH::BLACKHOLE) {
         log_debug(LogSiliconDriver, "== Check if device_id: {} is initialized", device_id);
-        uint32_t bar_read_initial = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4);
+        uint32_t bar_read_initial =
+            bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4);
         uint32_t arg = bar_read_initial == 500 ? 325 : 500;
         uint32_t bar_read_again;
-        uint32_t arc_msg_return = arc_msg(device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again);
+        uint32_t arc_msg_return = arc_msg(
+            device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again);
         if (arc_msg_return != 0 || bar_read_again != arg + 1) {
             auto postcode = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset());
-            throw std::runtime_error(fmt::format("Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} bar_read_again: {}",
-                                                 postcode,
-                                                 arc_msg_return,
-                                                 arg,
-                                                 bar_read_initial,
-                                                 bar_read_again));
+            throw std::runtime_error(fmt::format(
+                "Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} "
+                "bar_read_again: {}",
+                postcode,
+                arc_msg_return,
+                arg,
+                bar_read_initial,
+                bar_read_again));
         }
     }
 
-
     if (test_setup_interface()) {
-        throw std::runtime_error("Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run.");
+        throw std::runtime_error(
+            "Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC "
+            "Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run.");
     }
 }
 
-std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::create_harvested_coord_translation(const tt::ARCH arch, bool identity_map) {
-    log_assert(identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices");
+std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::create_harvested_coord_translation(
+    const tt::ARCH arch, bool identity_map) {
+    log_assert(
+        identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices");
     std::unordered_map<tt_xy_pair, tt_xy_pair> translation_table = {};
 
     tt_xy_pair grid_size;
@@ -698,29 +813,42 @@ std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::create_harvested_co
     std::vector<uint32_t> T6_y = {};
     std::vector<tt_xy_pair> ethernet = {};
     // Store device specific data for GS and WH depending on arch
-    if(arch == tt::ARCH::GRAYSKULL) {
+    if (arch == tt::ARCH::GRAYSKULL) {
         grid_size = tt_xy_pair(13, 12);
         T6_x = {12, 1, 11, 2, 10, 3, 9, 4, 8, 5, 7, 6};
         T6_y = {11, 1, 10, 2, 9, 3, 8, 4, 7, 5};
-    }
-    else if (arch == tt::ARCH::BLACKHOLE) {
+    } else if (arch == tt::ARCH::BLACKHOLE) {
         grid_size = tt_xy_pair(17, 12);
         T6_x = {16, 1, 15, 2, 14, 3, 13, 4, 12, 5, 11, 6, 10, 7};
         T6_y = {11, 2, 10, 3, 9, 4, 8, 5, 7, 6};
-    }
-    else {
+    } else {
         grid_size = tt_xy_pair(10, 12);
         T6_x = {1, 2, 3, 4, 6, 7, 8, 9};
         T6_y = {1, 2, 3, 4, 5, 7, 8, 9, 10, 11};
-        ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}};
-    }
-
-    
-    if(identity_map) {
+        ethernet = {
+            {1, 0},
+            {2, 0},
+            {3, 0},
+            {4, 0},
+            {6, 0},
+            {7, 0},
+            {8, 0},
+            {9, 0},
+            {1, 6},
+            {2, 6},
+            {3, 6},
+            {4, 6},
+            {6, 6},
+            {7, 6},
+            {8, 6},
+            {9, 6}};
+    }
+
+    if (identity_map) {
         // When device is initialized, assume no harvesting and create an identity map for cores
         // This flow is always used for GS, since there is no hardware harvesting
-        for(int x = 0; x < grid_size.x; x++) {
-            for(int y = 0; y < grid_size.y; y++) {
+        for (int x = 0; x < grid_size.x; x++) {
+            for (int y = 0; y < grid_size.y; y++) {
                 tt_xy_pair curr_core = tt_xy_pair(x, y);
                 translation_table.insert({curr_core, curr_core});
             }
@@ -731,34 +859,50 @@ std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::create_harvested_co
     // If this function is called with identity_map = false, we have perform NOC translation
     // This can only happen for WH devices
     // Setup coord translation for workers. Map all worker cores
-    for(int x = 0; x < grid_size.x; x++) {
-        for(int y = 0; y < grid_size.y; y++) {
+    for (int x = 0; x < grid_size.x; x++) {
+        for (int y = 0; y < grid_size.y; y++) {
             tt_xy_pair curr_core = tt_xy_pair(x, y);
 
-            if(std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() &&
-            std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) {
+            if (std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() &&
+                std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) {
                 // This is a worker core. Apply translation for WH.
                 tt_xy_pair harvested_worker;
-                if(x >= 1 && x <= 4) harvested_worker.x = x + 17;
-                else if(x <= 9 && x > 5) harvested_worker.x = x + 16;
-                else log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x);
+                if (x >= 1 && x <= 4) {
+                    harvested_worker.x = x + 17;
+                } else if (x <= 9 && x > 5) {
+                    harvested_worker.x = x + 16;
+                } else {
+                    log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x);
+                }
 
-                if(y >= 1 && y <= 5) harvested_worker.y = y + 17;
-                else if(y <= 11 && y > 6) harvested_worker.y = y + 16;
-                else log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y);
+                if (y >= 1 && y <= 5) {
+                    harvested_worker.y = y + 17;
+                } else if (y <= 11 && y > 6) {
+                    harvested_worker.y = y + 16;
+                } else {
+                    log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y);
+                }
                 translation_table.insert({curr_core, harvested_worker});
             }
 
-            else if(std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()){
+            else if (std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()) {
                 // This is an eth core. Apply translation for WH.
                 tt_xy_pair harvested_eth_core;
-                if(x >= 1 && x <= 4) harvested_eth_core.x = x + 17;
-                else if(x <= 9 && x > 5) harvested_eth_core.x = x + 16;
-                else log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x);
+                if (x >= 1 && x <= 4) {
+                    harvested_eth_core.x = x + 17;
+                } else if (x <= 9 && x > 5) {
+                    harvested_eth_core.x = x + 16;
+                } else {
+                    log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x);
+                }
 
-                if(y == 0) harvested_eth_core.y = y + 16;
-                else if(y == 6) harvested_eth_core.y = y + 11;
-                else log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y);
+                if (y == 0) {
+                    harvested_eth_core.y = y + 16;
+                } else if (y == 6) {
+                    harvested_eth_core.y = y + 11;
+                } else {
+                    log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y);
+                }
                 translation_table.insert({curr_core, harvested_eth_core});
             }
 
@@ -771,7 +915,7 @@ std::unordered_map<tt_xy_pair, tt_xy_pair> tt_SiliconDevice::create_harvested_co
     return translation_table;
 }
 
-void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {
+void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) {
     auto translated_coords = harvested_coord_translation[device_id].at(tt_xy_pair(c, r));
     c = translated_coords.x;
     r = translated_coords.y;
@@ -780,18 +924,20 @@ void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::s
 void tt_SiliconDevice::initialize_pcie_devices() {
     log_debug(LogSiliconDriver, "tt_SiliconDevice::start");
 
-    for (auto &device_it : m_pci_device_map){
+    for (auto& device_it : m_pci_device_map) {
         check_pcie_device_initialized(device_it.first);
     }
 
     // TODO: Implement support for multiple host channels on BLACKHOLE.
-    log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole");
+    log_assert(
+        !(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1),
+        "More channels are not yet supported for Blackhole");
     init_pcie_iatus();
 
     init_membars();
 }
 
-void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &soft_resets) {
+void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& soft_resets) {
     log_debug(LogSiliconDriver, "tt_SiliconDevice::broadcast_tensix_risc_reset");
 
     PCIDevice* device = get_pci_device(chip_id);
@@ -799,7 +945,10 @@ void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const
     auto valid = soft_resets & ALL_TENSIX_SOFT_RESET;
     auto logical_id = device->get_logical_id();
 
-    log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str());
+    log_debug(
+        LogSiliconDriver,
+        "== For all tensix set soft-reset for {} risc cores.",
+        TensixSoftResetOptionsToString(valid).c_str());
 
     auto architecture_implementation = device->get_architecture_implementation();
 
@@ -818,77 +967,87 @@ void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const
 }
 
 std::set<chip_id_t> tt_SiliconDevice::get_target_mmio_device_ids() {
-    if(!all_target_mmio_devices.size()) {
-        for (const auto &it: m_pci_device_map) {
+    if (!all_target_mmio_devices.size()) {
+        for (const auto& it : m_pci_device_map) {
             all_target_mmio_devices.insert(it.first);
         }
     }
     return all_target_mmio_devices;
 }
 
-void tt_SiliconDevice::assert_risc_reset() {
-    broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET);
-}
+void tt_SiliconDevice::assert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); }
 
-void tt_SiliconDevice::deassert_risc_reset() {
-    broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET);
-}
+void tt_SiliconDevice::deassert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); }
 
 void tt_SiliconDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {
-    std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster
-    log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() ||
-               std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(),
-                                "Cannot deassert reset on a non-tensix or harvested core");
-    bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device);
-    if(target_is_mmio_capable) {
-        log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe");
+    std::uint32_t target_device =
+        core.chip;  // Get Target Device to query soc descriptor and determine location in cluster
+    log_assert(
+        std::find(
+            get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) !=
+                get_soc_descriptor(target_device).workers.end() ||
+            std::find(
+                get_soc_descriptor(target_device).ethernet_cores.begin(),
+                get_soc_descriptor(target_device).ethernet_cores.end(),
+                core) != get_soc_descriptor(target_device).ethernet_cores.end(),
+        "Cannot deassert reset on a non-tensix or harvested core");
+    bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device);
+    if (target_is_mmio_capable) {
+        log_assert(
+            m_pci_device_map.find(target_device) != m_pci_device_map.end(),
+            "Could not find MMIO mapped device in devices connected over PCIe");
         send_tensix_risc_reset_to_core(core, TENSIX_DEASSERT_SOFT_RESET);
-    }
-    else {
+    } else {
         log_assert(arch_name != tt::ARCH::BLACKHOLE, "Can't issue access to remote core in BH");
         send_remote_tensix_risc_reset_to_core(core, TENSIX_DEASSERT_SOFT_RESET);
     }
 }
 
 void tt_SiliconDevice::assert_risc_reset_at_core(tt_cxy_pair core) {
-    std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster
-    log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() ||
-               std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(),
-                                "Cannot assert reset on a non-tensix or harvested core");
-    bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device);
-    if(target_is_mmio_capable) {
-        log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe");
+    std::uint32_t target_device =
+        core.chip;  // Get Target Device to query soc descriptor and determine location in cluster
+    log_assert(
+        std::find(
+            get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) !=
+                get_soc_descriptor(target_device).workers.end() ||
+            std::find(
+                get_soc_descriptor(target_device).ethernet_cores.begin(),
+                get_soc_descriptor(target_device).ethernet_cores.end(),
+                core) != get_soc_descriptor(target_device).ethernet_cores.end(),
+        "Cannot assert reset on a non-tensix or harvested core");
+    bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device);
+    if (target_is_mmio_capable) {
+        log_assert(
+            m_pci_device_map.find(target_device) != m_pci_device_map.end(),
+            "Could not find MMIO mapped device in devices connected over PCIe");
         send_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET);
-    }
-    else {
+    } else {
         send_remote_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET);
     }
 }
 
 // Free memory during teardown, and remove (clean/unlock) from any leftover mutexes.
 void tt_SiliconDevice::cleanup_shared_host_state() {
-    for(auto &mutex : hardware_resource_mutex_map) {
+    for (auto& mutex : hardware_resource_mutex_map) {
         mutex.second.reset();
         mutex.second = nullptr;
         named_mutex::remove(mutex.first.c_str());
     }
 }
 
-std::unordered_set<chip_id_t> tt_SiliconDevice::get_all_chips_in_cluster() {
-    return ndesc -> get_all_chips();
-}
+std::unordered_set<chip_id_t> tt_SiliconDevice::get_all_chips_in_cluster() { return ndesc->get_all_chips(); }
+
 int tt_SiliconDevice::get_number_of_chips_in_cluster() {
     // Returns the number of chips seen in the network descriptor
-    return ndesc -> get_all_chips().size();
+    return ndesc->get_all_chips().size();
 }
 
-tt_ClusterDescriptor* tt_SiliconDevice::get_cluster_description() {return ndesc.get();}
+tt_ClusterDescriptor* tt_SiliconDevice::get_cluster_description() { return ndesc.get(); }
+
 // Can be used before instantiating a silicon device
 int tt_SiliconDevice::detect_number_of_chips() {
-
     auto available_device_ids = detect_available_device_ids();
     return available_device_ids.size();
-
 }
 
 // Can be used before instantiating a silicon device
@@ -902,7 +1061,8 @@ std::vector<chip_id_t> tt_SiliconDevice::detect_available_device_ids() {
     return PCIDevice::enumerate_devices();
 }
 
-std::function<void(uint32_t, uint32_t, const uint8_t*)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) {
+std::function<void(uint32_t, uint32_t, const uint8_t*)> tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(
+    int device_id) {
     PCIDevice* dev = get_pci_device(device_id);
 
     const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) {
@@ -921,7 +1081,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) {
         throw std::runtime_error("TLBs not initialized");
     }
 
-    auto *dev = get_pci_device(target.chip);
+    auto* dev = get_pci_device(target.chip);
 
     if (!dev->bar0_wc) {
         throw std::runtime_error("No write-combined mapping for BAR0");
@@ -935,26 +1095,39 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) {
     }
 
     auto [tlb_offset, tlb_size] = tlb_data.value();
-    auto *base = reinterpret_cast<uint8_t *>(dev->bar0_wc);
+    auto* base = reinterpret_cast<uint8_t*>(dev->bar0_wc);
 
     return tt::Writer(base + tlb_offset, tlb_size);
 }
 
-void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) {
-    PCIDevice *dev = get_pci_device(target.chip);
+void tt_SiliconDevice::write_device_memory(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    tt_cxy_pair target,
+    std::uint32_t address,
+    const std::string& fallback_tlb) {
+    PCIDevice* dev = get_pci_device(target.chip);
     const uint8_t* buffer_addr = static_cast<const uint8_t*>(mem_ptr);
 
-    log_debug(LogSiliconDriver, "tt_SiliconDevice::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}",
-        target.chip, target.x, target.y, address, size_in_bytes, small_access);
+    log_debug(
+        LogSiliconDriver,
+        "tt_SiliconDevice::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}",
+        target.chip,
+        target.x,
+        target.y,
+        address,
+        size_in_bytes,
+        small_access);
 
     std::int32_t tlb_index = 0;
     std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
-    if(tlbs_init_per_chip[target.chip]) {
+    if (tlbs_init_per_chip[target.chip]) {
         tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
         tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
     }
 
-    if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
+    if (tlb_data.has_value() &&
+        address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
         auto [tlb_offset, tlb_size] = tlb_data.value();
         if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
             // This is only for Blackhole. If we want to  write to DRAM (BAR4 space), we add offset
@@ -967,9 +1140,9 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in
         const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
         const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_device_num()));
 
-        while(size_in_bytes > 0) {
-
-            auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb));
+        while (size_in_bytes > 0) {
+            auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
+                tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb));
             uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size);
             dev->write_block(mapped_address, transfer_size, buffer_addr);
 
@@ -981,22 +1154,36 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in
     }
 }
 
-void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb) {
-    // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault.
-    log_debug(LogSiliconDriver, "tt_SiliconDevice::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", target.chip, target.x, target.y, address, size_in_bytes);
-    PCIDevice *dev = get_pci_device(target.chip);
+void tt_SiliconDevice::read_device_memory(
+    void* mem_ptr,
+    tt_cxy_pair target,
+    std::uint32_t address,
+    std::uint32_t size_in_bytes,
+    const std::string& fallback_tlb) {
+    // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this
+    // function will cause a segfault.
+    log_debug(
+        LogSiliconDriver,
+        "tt_SiliconDevice::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}",
+        target.chip,
+        target.x,
+        target.y,
+        address,
+        size_in_bytes);
+    PCIDevice* dev = get_pci_device(target.chip);
 
     uint8_t* buffer_addr = static_cast<uint8_t*>(mem_ptr);
 
     std::int32_t tlb_index = 0;
     std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
-    if(tlbs_init_per_chip[target.chip]) {
+    if (tlbs_init_per_chip[target.chip]) {
         tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
         tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
     }
     log_debug(LogSiliconDriver, "  tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value());
 
-    if (tlb_data.has_value()  && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
+    if (tlb_data.has_value() &&
+        address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
         auto [tlb_offset, tlb_size] = tlb_data.value();
         if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
             // This is only for Blackhole. If we want to  read from DRAM (BAR4 space), we add offset
@@ -1010,9 +1197,9 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std
         const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
         const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_device_num()));
         log_debug(LogSiliconDriver, "  dynamic tlb_index: {}", tlb_index);
-        while(size_in_bytes > 0) {
-
-            auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb));
+        while (size_in_bytes > 0) {
+            auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
+                tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb));
             uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size);
             dev->read_block(mapped_address, transfer_size, buffer_addr);
 
@@ -1025,55 +1212,61 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std
 }
 
 void tt_SiliconDevice::read_buffer(
-    void* mem_ptr,
-    std::uint32_t address,
-    std::uint16_t channel,
-    std::uint32_t size_in_bytes,
-    chip_id_t src_device_id) {
-
+    void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id) {
     log_assert(src_device_id != -1, "Must provide src_device_id for host_resident read/write");
-    log_assert(channel >= 0 && channel <= g_MAX_HOST_MEM_CHANNELS, "{} - Invalid channel {} for host_resident read/write.", __FUNCTION__, channel);
-    void * user_scratchspace = nullptr;
-
-    if(hugepage_mapping.at(src_device_id).at(channel)) {
-      user_scratchspace = static_cast<char*>(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK);
+    log_assert(
+        channel >= 0 && channel <= g_MAX_HOST_MEM_CHANNELS,
+        "{} - Invalid channel {} for host_resident read/write.",
+        __FUNCTION__,
+        channel);
+    void* user_scratchspace = nullptr;
+
+    if (hugepage_mapping.at(src_device_id).at(channel)) {
+        user_scratchspace =
+            static_cast<char*>(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK);
     } else {
-        throw std::runtime_error(fmt::format("write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}."
-                                             " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)",
-                                             src_device_id,
-                                             channel));
+        throw std::runtime_error(fmt::format(
+            "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}."
+            " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)",
+            src_device_id,
+            channel));
     }
 
-    log_debug(LogSiliconDriver, "tt_SiliconDevice::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}",  src_device_id, channel, user_scratchspace);
-    
+    log_debug(
+        LogSiliconDriver,
+        "tt_SiliconDevice::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}",
+        src_device_id,
+        channel,
+        user_scratchspace);
+
     memcpy(mem_ptr, user_scratchspace, size_in_bytes);
 }
 
 void tt_SiliconDevice::write_buffer(
-    const void *mem_ptr,
-    std::uint32_t size,
-    std::uint32_t address,
-    std::uint16_t channel,
-    chip_id_t src_device_id) {
-
-    void * user_scratchspace = nullptr;
-    if(hugepage_mapping.at(src_device_id).at(channel)) {
-      log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE);
-      log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}",
-        hugepage_mapping.at(src_device_id).at(channel),
-        (address & HUGEPAGE_MAP_MASK),
-        channel,
-        size);
-      user_scratchspace = static_cast<char*>(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK);
+    const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id) {
+    void* user_scratchspace = nullptr;
+    if (hugepage_mapping.at(src_device_id).at(channel)) {
+        log_assert(
+            size <= HUGEPAGE_REGION_SIZE,
+            "write_buffer data has larger size {} than destination buffer {}",
+            size,
+            HUGEPAGE_REGION_SIZE);
+        log_debug(
+            LogSiliconDriver,
+            "Using hugepage mapping at address {} offset {} chan {} size {}",
+            hugepage_mapping.at(src_device_id).at(channel),
+            (address & HUGEPAGE_MAP_MASK),
+            channel,
+            size);
+        user_scratchspace =
+            static_cast<char*>(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK);
     } else {
-        throw std::runtime_error(fmt::format("write_buffer: Hugepage are not allocated for src_device_id: {} ch: {}",
-                                             src_device_id,
-                                             channel));
+        throw std::runtime_error(fmt::format(
+            "write_buffer: Hugepage are not allocated for src_device_id: {} ch: {}", src_device_id, channel));
     }
     memcpy(user_scratchspace, mem_ptr, size);
 }
 
-
 uint32_t tt_SiliconDevice::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state) {
     PCIDevice* pci_device = get_pci_device(chip_id);
     uint32_t msg = 0xaa00;
@@ -1090,34 +1283,37 @@ uint32_t tt_SiliconDevice::get_power_state_arc_msg(chip_id_t chip_id, tt_DeviceP
             msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle();
             break;
         }
-        default: throw std::runtime_error("Unrecognized power state.");
+        default:
+            throw std::runtime_error("Unrecognized power state.");
     }
     return msg;
 }
 
 void tt_SiliconDevice::set_pcie_power_state(tt_DevicePowerState state) {
-
-    for (auto &device_it : m_pci_device_map){
+    for (auto& device_it : m_pci_device_map) {
         int chip_id = device_it.first;
         uint32_t msg = get_power_state_arc_msg(chip_id, state);
         std::stringstream ss;
         ss << state;
         auto exit_code = arc_msg(chip_id, 0xaa00 | msg, true, 0, 0);
         if (exit_code != 0) {
-            throw std::runtime_error(fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code));
+            throw std::runtime_error(
+                fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code));
         }
     }
 }
 
 int tt_SiliconDevice::get_clock(int logical_device_id) {
-
     // TODO: remove this once ARC messages work.
     // This is currently used only for testing and bringing up Blackhole on Buda.
     if (arch_name == tt::ARCH::BLACKHOLE) {
         char* clk_env_var = getenv("TT_SILICON_DRIVER_AICLK");
         if (clk_env_var != nullptr) {
-            log_warning(LogSiliconDriver, "ARC messages are not enabled on Blackhole. "
-                        "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}" , clk_env_var);
+            log_warning(
+                LogSiliconDriver,
+                "ARC messages are not enabled on Blackhole. "
+                "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}",
+                clk_env_var);
             return std::stoi(clk_env_var);
         }
     }
@@ -1125,7 +1321,14 @@ int tt_SiliconDevice::get_clock(int logical_device_id) {
     uint32_t clock;
     auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id);
     PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical);
-    auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock);
+    auto exit_code = arc_msg(
+        logical_device_id,
+        0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(),
+        true,
+        0xFFFF,
+        0xFFFF,
+        1,
+        &clock);
     if (exit_code != 0) {
         throw std::runtime_error(fmt::format("Failed to get aiclk value with exit code {}", exit_code));
     }
@@ -1133,31 +1336,29 @@ int tt_SiliconDevice::get_clock(int logical_device_id) {
 }
 
 std::map<int, int> tt_SiliconDevice::get_clocks() {
-    std::map<int,int> clock_freq_map;
-    for (auto &device_it : m_pci_device_map){
+    std::map<int, int> clock_freq_map;
+    for (auto& device_it : m_pci_device_map) {
         int d = device_it.first;
         clock_freq_map.insert({d, get_clock(d)});
     }
     return clock_freq_map;
 }
 
-tt_SiliconDevice::~tt_SiliconDevice () {
-
+tt_SiliconDevice::~tt_SiliconDevice() {
     log_debug(LogSiliconDriver, "tt_SiliconDevice::~tt_SiliconDevice");
 
     cleanup_shared_host_state();
 
-    for (auto &device_it : m_pci_device_map){
-
+    for (auto& device_it : m_pci_device_map) {
         chip_id_t device_id = device_it.first;
         // PCIDevice *dev = device_it.second.get();
 
-        for (int ch = 0; ch < m_num_host_mem_channels; ch ++) {
+        for (int ch = 0; ch < m_num_host_mem_channels; ch++) {
             if (hugepage_mapping.at(device_id).at(ch)) {
                 munmap(hugepage_mapping.at(device_id).at(ch), hugepage_mapping_size.at(device_id).at(ch));
             }
         }
-        
+
         device_it.second.reset();
     }
     m_pci_device_map.clear();
@@ -1176,23 +1377,34 @@ std::optional<std::tuple<uint32_t, uint32_t>> tt_SiliconDevice::get_tlb_data_fro
         tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
         auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
         tlb_data = architecture_implementation->describe_tlb(tlb_index);
-    } 
+    }
     return tlb_data;
 }
 
-void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) {
-    log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb");
-    PCIDevice *pci_device = get_pci_device(logical_device_id);
+void tt_SiliconDevice::configure_tlb(
+    chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) {
+    log_assert(
+        ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed,
+        "Invalid ordering specified in tt_SiliconDevice::configure_tlb");
+    PCIDevice* pci_device = get_pci_device(logical_device_id);
     pci_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation, ordering);
     auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value());
-    if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}});
+    if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) {
+        tlb_config_map.insert({logical_device_id, {}});
+    }
     tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size});
 }
 
 void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) {
-    log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb.");
-    log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode.");
-    log_assert(fallback_tlb != "LARGE_READ_TLB" &&  fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified.");
+    log_assert(
+        ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed,
+        "Invalid ordering specified in tt_SiliconDevice::configure_tlb.");
+    log_assert(
+        dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(),
+        "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode.");
+    log_assert(
+        fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB",
+        "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified.");
     dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering;
 }
 
@@ -1201,9 +1413,12 @@ void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallbac
 void tt_SiliconDevice::init_pcie_iatus() {
     int num_enabled_devices = m_pci_device_map.size();
     log_debug(LogSiliconDriver, "tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices);
-    log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.",  g_MAX_HOST_MEM_CHANNELS);
+    log_assert(
+        m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS,
+        "Maximum of {} 1GB Host memory channels supported.",
+        g_MAX_HOST_MEM_CHANNELS);
 
-    for (auto &src_device_it : m_pci_device_map){
+    for (auto& src_device_it : m_pci_device_map) {
         int logical_id = src_device_it.first;
         PCIDevice* src_pci_device = src_device_it.second.get();
 
@@ -1211,57 +1426,67 @@ void tt_SiliconDevice::init_pcie_iatus() {
         for (int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) {
             if (hugepage_mapping.at(logical_id).at(channel_id)) {
                 std::uint32_t region_size = HUGEPAGE_REGION_SIZE;
-                if (channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation)
+                if (channel_id == 3) {
+                    region_size = 805306368;  // Remove 256MB from full 1GB for channel 3 (iATU limitation)
+                }
 
                 // This log message doesn't look right.
-                log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id);
-                iatu_configure_peer_region(logical_id, channel_id, hugepage_physical_address.at(logical_id).at(channel_id), region_size);
+                log_debug(
+                    LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id);
+                iatu_configure_peer_region(
+                    logical_id, channel_id, hugepage_physical_address.at(logical_id).at(channel_id), region_size);
 
                 if (host_channel_size.find(logical_id) == host_channel_size.end()) {
-                     host_channel_size.insert({logical_id, {}});
+                    host_channel_size.insert({logical_id, {}});
                 }
                 host_channel_size.at(logical_id).push_back(region_size);
             } else {
-                throw std::runtime_error(fmt::format("init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel_id));
+                throw std::runtime_error(fmt::format(
+                    "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}",
+                    logical_id,
+                    channel_id));
             }
         }
     }
 }
 
 // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G)
-std::string find_hugepage_dir(std::size_t pagesize)
-{
-
-    static const std::regex hugetlbfs_mount_re(fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir));
+std::string find_hugepage_dir(std::size_t pagesize) {
+    static const std::regex hugetlbfs_mount_re(
+        fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir));
     static const std::regex pagesize_re("(?:^|,)pagesize=([0-9]+)([KMGT])(?:,|$)");
 
     std::ifstream proc_mounts("/proc/mounts");
 
-    for (std::string line; std::getline(proc_mounts, line); )
-    {
-        if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re))
-        {
+    for (std::string line; std::getline(proc_mounts, line);) {
+        if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) {
             std::string options = mount_match[3];
-            if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re))
-            {
+            if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) {
                 std::size_t mount_page_size = std::stoull(pagesize_match[1]);
-                switch (pagesize_match[2].str()[0])
-                {
-                    case 'T': mount_page_size <<= 10;
-                    case 'G': mount_page_size <<= 10;
-                    case 'M': mount_page_size <<= 10;
-                    case 'K': mount_page_size <<= 10;
+                switch (pagesize_match[2].str()[0]) {
+                    case 'T':
+                        mount_page_size <<= 10;
+                    case 'G':
+                        mount_page_size <<= 10;
+                    case 'M':
+                        mount_page_size <<= 10;
+                    case 'K':
+                        mount_page_size <<= 10;
                 }
 
-                if (mount_page_size == pagesize)
-                {
+                if (mount_page_size == pagesize) {
                     return mount_match[2];
                 }
             }
         }
     }
 
-    log_warning(LogSiliconDriver, "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: {}.", hugepage_dir, pagesize);
+    log_warning(
+        LogSiliconDriver,
+        "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: "
+        "{}.",
+        hugepage_dir,
+        pagesize);
     return std::string();
 }
 
@@ -1269,16 +1494,18 @@ std::string find_hugepage_dir(std::size_t pagesize)
 // All processes operating on the same pipeline must agree on the file name.
 // Today we assume there's only one pipeline running within the system.
 // One hugepage per device such that each device gets unique memory.
-int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel) {
+int tt_SiliconDevice::open_hugepage_file(const std::string& dir, chip_id_t physical_device_id, uint16_t channel) {
     std::vector<char> filename;
     static const char pipeline_name[] = "tenstorrent";
 
     filename.insert(filename.end(), dir.begin(), dir.end());
-    if (filename.back() != '/') filename.push_back('/');
+    if (filename.back() != '/') {
+        filename.push_back('/');
+    }
 
     // In order to limit number of hugepages while transition from shared hugepage (1 per system) to unique
     // hugepage per device, will share original/shared hugepage filename with physical device 0.
-    if (physical_device_id != 0 || channel != 0){
+    if (physical_device_id != 0 || channel != 0) {
         std::string device_id_str = fmt::format("device_{}_", physical_device_id);
         filename.insert(filename.end(), device_id_str.begin(), device_id_str.end());
     }
@@ -1288,20 +1515,32 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi
         filename.insert(filename.end(), channel_id_str.begin(), channel_id_str.end());
     }
 
-    filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator
+    filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name));  // includes NUL terminator
 
     std::string filename_str(filename.begin(), filename.end());
-    filename_str.erase(std::find(filename_str.begin(), filename_str.end(), '\0'), filename_str.end()); // Erase NULL terminator for printing.
-    log_debug(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", filename_str.c_str(), physical_device_id, channel);
+    filename_str.erase(
+        std::find(filename_str.begin(), filename_str.end(), '\0'),
+        filename_str.end());  // Erase NULL terminator for printing.
+    log_debug(
+        LogSiliconDriver,
+        "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}",
+        filename_str.c_str(),
+        physical_device_id,
+        channel);
 
     // Save original and set umask to unrestricted.
     auto old_umask = umask(0);
 
-    int fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH );
+    int fd =
+        open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH);
     if (fd == -1 && errno == EACCES) {
-        log_warning(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", filename_str);
+        log_warning(
+            LogSiliconDriver,
+            "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.",
+            filename_str);
         unlink(filename.data());
-        fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH );
+        fd = open(
+            filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH);
     }
 
     // Verify opened file size.
@@ -1310,7 +1549,11 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi
         log_warning(LogSiliconDriver, "Error reading file size after opening: {}", filename_str);
     } else {
         if (st.st_size == 0) {
-            log_warning(LogSiliconDriver, "Opened hugepage file has zero size, mapping it might fail: {}. Verify that enough hugepages are provided.", filename_str);
+            log_warning(
+                LogSiliconDriver,
+                "Opened hugepage file has zero size, mapping it might fail: {}. Verify that enough hugepages are "
+                "provided.",
+                filename_str);
         }
     }
 
@@ -1326,10 +1569,10 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi
 }
 
 // For debug purposes when various stages fails.
-void print_file_contents(std::string filename, std::string hint = ""){
-    if (std::filesystem::exists(filename)){
+void print_file_contents(std::string filename, std::string hint = "") {
+    if (std::filesystem::exists(filename)) {
         std::ifstream meminfo(filename);
-        if (meminfo.is_open()){
+        if (meminfo.is_open()) {
             std::cout << std::endl << "File " << filename << " " << hint << " is: " << std::endl;
             std::cout << meminfo.rdbuf();
         }
@@ -1346,7 +1589,10 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) {
 
     std::string hugepage_dir = find_hugepage_dir(hugepage_size);
     if (hugepage_dir.empty()) {
-        log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size);
+        log_warning(
+            LogSiliconDriver,
+            "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.",
+            hugepage_size);
         return false;
     }
 
@@ -1354,32 +1600,49 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) {
 
     // Support for more than 1GB host memory accessible per device, via channels.
     for (int ch = 0; ch < m_num_host_mem_channels; ch++) {
-
         int hugepage_fd = open_hugepage_file(hugepage_dir, physical_device_id, ch);
         if (hugepage_fd == -1) {
             // Probably a permissions problem.
-            log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", physical_device_id, ch);
+            log_warning(
+                LogSiliconDriver,
+                "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.",
+                physical_device_id,
+                ch);
             success = false;
             continue;
         }
 
-        std::byte *mapping = static_cast<std::byte*>(mmap(nullptr, hugepage_size, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0));
+        std::byte* mapping = static_cast<std::byte*>(
+            mmap(nullptr, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0));
 
         close(hugepage_fd);
 
         if (mapping == MAP_FAILED) {
-            log_warning(LogSiliconDriver, "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).", physical_device_id, ch, m_num_host_mem_channels, strerror(errno));
-            print_file_contents("/proc/cmdline");\
-            print_file_contents("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage.
+            log_warning(
+                LogSiliconDriver,
+                "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).",
+                physical_device_id,
+                ch,
+                m_num_host_mem_channels,
+                strerror(errno));
+            print_file_contents("/proc/cmdline");
+            print_file_contents(
+                "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages");  // Hardcoded for 1GB hugepage.
             success = false;
             continue;
         }
 
-        // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same numanode as TT device.
-        if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)){
-            log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: {}). "
-            "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).",
-            physical_device_id, ch);
+        // Better performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same
+        // NUMA node as TT device.
+        if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) {
+            log_warning(
+                LogSiliconDriver,
+                "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: "
+                "{}). "
+                "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf "
+                "(Issue #893).",
+                physical_device_id,
+                ch);
         }
 
         tenstorrent_pin_pages pin_pages;
@@ -1392,7 +1655,13 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) {
         auto fd = dev->get_fd();
 
         if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) {
-            log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno));
+            log_warning(
+                LogSiliconDriver,
+                "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed "
+                "(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...",
+                physical_device_id,
+                ch,
+                strerror(errno));
             munmap(mapping, hugepage_size);
             print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)");
             print_file_contents("/proc/meminfo");
@@ -1405,64 +1674,80 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) {
         hugepage_mapping_size.at(device_id).at(ch) = hugepage_size;
         hugepage_physical_address.at(device_id).at(ch) = pin_pages.out.physical_address;
 
-        log_debug(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", physical_device_id, ch, hugepage_size, (unsigned long long)hugepage_physical_address.at(device_id).at(ch));
+        log_debug(
+            LogSiliconDriver,
+            "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}",
+            physical_device_id,
+            ch,
+            hugepage_size,
+            (unsigned long long)hugepage_physical_address.at(device_id).at(ch));
     }
 
     return success;
 }
 
-int tt_SiliconDevice::test_setup_interface () {
+int tt_SiliconDevice::test_setup_interface() {
     if (arch_name == tt::ARCH::GRAYSKULL) {
         int ret_val = 0;
-        PCIDevice *dev = m_pci_device_map.begin()->second.get();
+        PCIDevice* dev = m_pci_device_map.begin()->second.get();
 
-        uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset;
+        uint32_t mapped_reg = dev->set_dynamic_tlb(
+                                     dev->get_architecture_implementation()->get_reg_tlb(),
+                                     tt_xy_pair(0, 0),
+                                     0xffb20108,
+                                     harvested_coord_translation)
+                                  .bar_offset;
 
         uint32_t regval = 0;
         dev->read_regs(mapped_reg, 1, &regval);
         ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 0 : 1;
         return ret_val;
-    }
-    else if (arch_name == tt::ARCH::WORMHOLE_B0) {
+    } else if (arch_name == tt::ARCH::WORMHOLE_B0) {
         int ret_val = 0;
-        PCIDevice *dev = m_pci_device_map.begin()->second.get();
+        PCIDevice* dev = m_pci_device_map.begin()->second.get();
 
-        uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset;
+        uint32_t mapped_reg = dev->set_dynamic_tlb(
+                                     dev->get_architecture_implementation()->get_reg_tlb(),
+                                     tt_xy_pair(1, 0),
+                                     0xffb20108,
+                                     harvested_coord_translation)
+                                  .bar_offset;
 
         uint32_t regval = 0;
         dev->read_regs(mapped_reg, 1, &regval);
         ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1;
         return ret_val;
-    }
-    else if (arch_name == tt::ARCH::BLACKHOLE) {
+    } else if (arch_name == tt::ARCH::BLACKHOLE) {
         // MT Inital BH - Try to enable this, but double check "regval == 33"
         // int ret_val = 0;
         // PCIDevice *dev = m_pci_device_map.begin()->second->hdev;
 
-        // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset;
+        // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second,
+        // dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108,
+        // harvested_coord_translation).bar_offset;
 
         // uint32_t regval = 0;
         // read_regs(dev, mapped_reg, 1, &regval);
         // ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1;
         // return ret_val;
         return 0;
-    }
-    else {
+    } else {
         throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name)));
     }
 }
 
-void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) {
-    PCIDevice *dev = get_pci_device(logical_device_id);
+void tt_SiliconDevice::bar_write32(int logical_device_id, uint32_t addr, uint32_t data) {
+    PCIDevice* dev = get_pci_device(logical_device_id);
 
     if (addr < dev->bar0_uc_offset) {
-        dev->write_block(addr, sizeof(data), reinterpret_cast<const uint8_t*>(&data)); // do we have to reinterpret_cast?
+        dev->write_block(
+            addr, sizeof(data), reinterpret_cast<const uint8_t*>(&data));  // do we have to reinterpret_cast?
     } else {
         dev->write_regs(addr, 1, &data);
     }
 }
 
-uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) {
+uint32_t tt_SiliconDevice::bar_read32(int logical_device_id, uint32_t addr) {
     PCIDevice* dev = get_pci_device(logical_device_id);
 
     uint32_t data;
@@ -1475,32 +1760,39 @@ uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) {
 }
 
 // Returns 0 if everything was OK
-int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) {
-
-
+int tt_SiliconDevice::pcie_arc_msg(
+    int logical_device_id,
+    uint32_t msg_code,
+    bool wait_for_done,
+    uint32_t arg0,
+    uint32_t arg1,
+    int timeout,
+    uint32_t* return_3,
+    uint32_t* return_4) {
     if ((msg_code & 0xff00) != 0xaa00) {
         log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code);
     }
-    log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed
+    log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args");  // Only 16 bits are allowed
 
-    PCIDevice *pci_device = get_pci_device(logical_device_id);
+    PCIDevice* pci_device = get_pci_device(logical_device_id);
     auto architecture_implementation = pci_device->get_architecture_implementation();
 
     // Exclusive access for a single process at a time. Based on physical pci interface id.
     std::string msg_type = "ARC_MSG";
     const scoped_lock<named_mutex> lock(*get_mutex(msg_type, pci_device->get_device_num()));
-    uint32_t fw_arg = arg0 | (arg1<<16);
+    uint32_t fw_arg = arg0 | (arg1 << 16);
     int exit_code = 0;
 
-    bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg);
-    bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code);
+    bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg);
+    bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code);
 
-    uint32_t misc = bar_read32 (logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset());
+    uint32_t misc = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset());
     if (misc & (1 << 16)) {
         log_error("trigger_fw_int failed on device {}", logical_device_id);
         return 1;
     } else {
-        bar_write32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16));
+        bar_write32(
+            logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16));
     }
 
     if (wait_for_done) {
@@ -1509,24 +1801,31 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo
         auto start = std::chrono::system_clock::now();
         while (true) {
             if (std::chrono::system_clock::now() - start > timeout_seconds) {
-                throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id));
+                throw std::runtime_error(fmt::format(
+                    "Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id));
             }
 
             status = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4);
 
             if ((status & 0xffff) == (msg_code & 0xff)) {
                 if (return_3 != nullptr) {
-                    *return_3 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4);
+                    *return_3 = bar_read32(
+                        logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4);
                 }
 
                 if (return_4 != nullptr) {
-                    *return_4 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4);
+                    *return_4 = bar_read32(
+                        logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4);
                 }
 
                 exit_code = (status & 0xffff0000) >> 16;
                 break;
             } else if (status == MSG_ERROR_REPLY) {
-                log_warning(LogSiliconDriver, "On device {}, message code 0x{:x} not recognized by FW", logical_device_id, msg_code);
+                log_warning(
+                    LogSiliconDriver,
+                    "On device {}, message code 0x{:x} not recognized by FW",
+                    logical_device_id,
+                    msg_code);
                 exit_code = MSG_ERROR_REPLY;
                 break;
             }
@@ -1537,12 +1836,16 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo
     return exit_code;
 }
 
-int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) {
+int tt_SiliconDevice::iatu_configure_peer_region(
+    int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) {
     uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff;
     uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff;
     std::uint32_t region_id_to_use = peer_region_id;
-    if(peer_region_id == 3) region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset
-    PCIDevice *pci_device = get_pci_device(logical_device_id);
+    if (peer_region_id == 3) {
+        region_id_to_use = 4;  // Hack: use region 4 for channel 3. This ensures that we have a smaller chan 3 address
+                               // space with the correct start offset.
+    }
+    PCIDevice* pci_device = get_pci_device(logical_device_id);
     auto architecture_implementation = pci_device->get_architecture_implementation();
 
     // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working.
@@ -1552,8 +1855,8 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_
         uint64_t base_size = (region_id_to_use + 1) * region_size;
         uint64_t limit_address = base_addr + base_size - 1;
 
-        uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1
-        uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1
+        uint32_t region_ctrl_1 = 1 << 13;  // INCREASE_REGION_SIZE = 1
+        uint32_t region_ctrl_2 = 1 << 31;  // REGION_EN = 1
         uint32_t region_ctrl_3 = 0;
         uint32_t base_addr_lo = base_addr & 0xffffffff;
         uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff;
@@ -1563,43 +1866,83 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_
         uint64_t iatu_index = 0;
         uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200;
 
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x00), &region_ctrl_1, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x1c), &region_ctrl_3, 1);
-        pci_device->write_regs(reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1);
-    }
-    else {
-        bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x00),
+            &region_ctrl_1,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x04),
+            &region_ctrl_2,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x08),
+            &base_addr_lo,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x0c),
+            &base_addr_hi,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x10),
+            &limit_address_lo,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x14),
+            &dest_bar_lo,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x18),
+            &dest_bar_hi,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x1c),
+            &region_ctrl_3,
+            1);
+        pci_device->write_regs(
+            reinterpret_cast<std::uint32_t*>(static_cast<uint8_t*>(pci_device->bar2_uc) + iatu_base + 0x20),
+            &limit_address_hi,
+            1);
+    } else {
+        bar_write32(
+            logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use);
         bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo);
         bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi);
         bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size);
-        arc_msg(logical_device_id, 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), true, 0, 0);
+        arc_msg(
+            logical_device_id,
+            0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(),
+            true,
+            0,
+            0);
     }
 
     // Print what just happened
-    uint32_t peer_region_start = region_id_to_use*region_size;
-    uint32_t peer_region_end = (region_id_to_use+1)*region_size - 1;
-    log_debug(LogSiliconDriver, "    [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", peer_region_id, peer_region_start, peer_region_end, bar_addr_64);
+    uint32_t peer_region_start = region_id_to_use * region_size;
+    uint32_t peer_region_end = (region_id_to_use + 1) * region_size - 1;
+    log_debug(
+        LogSiliconDriver,
+        "    [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}",
+        peer_region_id,
+        peer_region_start,
+        peer_region_end,
+        bar_addr_64);
     return 0;
 }
 
 // Returns broken rows as bits set to 1 in 'memory' and 'logic'
 uint32_t tt_SiliconDevice::get_harvested_noc_rows(uint32_t harvesting_mask) {
     auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
-    const std::vector<uint32_t> &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations();
+    const std::vector<uint32_t>& harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations();
     uint32_t harv_noc_rows = 0;
     std::string harv_noc_rows_str = "";
 
-    for (int pos=0; pos<harv_to_noc_loc.size(); ++pos) {
+    for (int pos = 0; pos < harv_to_noc_loc.size(); ++pos) {
         bool is_row_harvested = harvesting_mask & 0x1;
         if (is_row_harvested) {
             harv_noc_rows |= (1 << harv_to_noc_loc[pos]);
-            if (harv_noc_rows_str != "") harv_noc_rows_str += ", ";
+            if (harv_noc_rows_str != "") {
+                harv_noc_rows_str += ", ";
+            }
             harv_noc_rows_str += std::to_string(harv_to_noc_loc[pos]);
         }
         harvesting_mask = harvesting_mask >> 1;
@@ -1610,36 +1953,45 @@ uint32_t tt_SiliconDevice::get_harvested_noc_rows(uint32_t harvesting_mask) {
     return harv_noc_rows;
 }
 
-uint32_t tt_SiliconDevice::get_harvested_rows (int logical_device_id) {
+uint32_t tt_SiliconDevice::get_harvested_rows(int logical_device_id) {
     const char* harv_override = std::getenv("T6PY_HARVESTING_OVERRIDE");
     uint32_t harv = 0xffffffff;
     if (harv_override) {
         harv = std::stoul(harv_override, nullptr, 16);
     } else {
         auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id);
-        PCIDevice *pci_device = get_pci_device(mmio_capable_chip_logical);
-        int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv);
-        log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id);
+        PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical);
+        int harvesting_msg_code = arc_msg(
+            logical_device_id,
+            0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(),
+            true,
+            0,
+            0,
+            1,
+            &harv);
+        log_assert(
+            harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id);
     }
     log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!");
-    log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv==0) ? "DISABLED":"ENABLED", harv);
-    
+    log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv == 0) ? "DISABLED" : "ENABLED", harv);
+
     uint32_t memory = harv & 0x3ff;
     uint32_t logic = (harv >> 10) & 0x3ff;
-    return (memory|logic);
+    return (memory | logic);
 }
 
-uint32_t tt_SiliconDevice::get_harvested_noc_rows_for_chip (int logical_device_id) {
+uint32_t tt_SiliconDevice::get_harvested_noc_rows_for_chip(int logical_device_id) {
     return get_harvested_noc_rows(get_harvested_rows(logical_device_id));
 }
 
-void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, int timeout) {
+void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) {
     uint32_t msg_success = 0x0;
     auto timeout_seconds = std::chrono::seconds(timeout);
     auto start = std::chrono::system_clock::now();
     while (msg_success != 1) {
         if (std::chrono::system_clock::now() - start > timeout_seconds) {
-            throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout));
+            throw std::runtime_error(
+                fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout));
         }
 
         if (arc_msg(device_id, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success) == MSG_ERROR_REPLY) {
@@ -1648,7 +2000,7 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i
     }
 }
 
-void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
+void* tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
     if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) {
         return static_cast<std::byte*>(hugepage_mapping.at(src_device_id).at(channel)) + offset;
     } else {
@@ -1658,18 +2010,20 @@ void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_dev
 
 // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed.
 inline PCIDevice* tt_SiliconDevice::get_pci_device(int device_id) const {
-    if (!m_pci_device_map.count(device_id)){
+    if (!m_pci_device_map.count(device_id)) {
         throw std::runtime_error(fmt::format("device_id: {} attempted to be accessed, but is not enabled.", device_id));
     }
     return m_pci_device_map.at(device_id).get();
 }
 
-std::shared_ptr<boost::interprocess::named_mutex> tt_SiliconDevice::get_mutex(const std::string& tlb_name, int pci_interface_id) {
+std::shared_ptr<boost::interprocess::named_mutex> tt_SiliconDevice::get_mutex(
+    const std::string& tlb_name, int pci_interface_id) {
     std::string mutex_name = tlb_name + std::to_string(pci_interface_id);
     return hardware_resource_mutex_map.at(mutex_name);
 }
 
-uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
+uint64_t tt_SiliconDevice::get_sys_addr(
+    uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
     uint64_t result = chip_y;
     uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1;
     result <<= eth_interface_params.noc_addr_node_id_bits;
@@ -1692,7 +2046,8 @@ uint16_t tt_SiliconDevice::get_sys_rack(uint32_t rack_x, uint32_t rack_y) {
 }
 
 bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) {
-  return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == (curr_rptr & eth_interface_params.cmd_buf_size_mask));
+    return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) ==
+                                        (curr_rptr & eth_interface_params.cmd_buf_size_mask));
 }
 
 /*
@@ -1741,35 +2096,37 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_
  * Other schemes may be more performant.
  */
 
-
 /*
  * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the
  * ethernet core (host) command queue DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the
  * mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above
  */
 
-
 void tt_SiliconDevice::write_to_non_mmio_device(
-                        const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, 
-                        bool broadcast, std::vector<int> broadcast_header) {
-    
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    tt_cxy_pair core,
+    uint64_t address,
+    bool broadcast,
+    std::vector<int> broadcast_header) {
     chip_id_t mmio_capable_chip_logical;
-    
-    if(broadcast) {
+
+    if (broadcast) {
         mmio_capable_chip_logical = core.chip;
-    }
-    else {
+    } else {
         mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip);
     }
     flush_non_mmio_per_chip[ndesc->get_closest_mmio_capable_chip(core.chip)] = true;
 
     if (non_mmio_transfer_cores_customized) {
-        log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices.");
+        log_assert(
+            active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(),
+            "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices.");
     }
 
     using data_word_t = uint32_t;
     constexpr int DATA_WORD_SIZE = sizeof(data_word_t);
-    constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words
+    constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8;  // Broadcast header is 8 words
     const auto target_chip = ndesc->get_chip_locations().at(core.chip);
 
     std::string write_tlb = "LARGE_WRITE_TLB";
@@ -1778,14 +2135,15 @@ void tt_SiliconDevice::write_to_non_mmio_device(
     translate_to_noc_table_coords(core.chip, core.y, core.x);
     std::vector<std::uint32_t> erisc_command;
     std::vector<std::uint32_t> erisc_q_rptr = std::vector<uint32_t>(1);
-    std::vector<std::uint32_t> erisc_q_ptrs = std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t));
+    std::vector<std::uint32_t> erisc_q_ptrs =
+        std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t));
 
     std::vector<std::uint32_t> data_block;
 
-    routing_cmd_t *new_cmd;
+    routing_cmd_t* new_cmd;
 
     uint32_t buffer_id = 0;
-    uint32_t timestamp = 0; //CMD_TIMESTAMP;
+    uint32_t timestamp = 0;  // CMD_TIMESTAMP;
     bool use_dram;
     uint32_t max_block_size;
 
@@ -1797,14 +2155,22 @@ void tt_SiliconDevice::write_to_non_mmio_device(
     //                    MUTEX ACQUIRE (NON-MMIO)
     //  do not locate any ethernet core reads/writes before this acquire
     //
-    const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num()));
-
-    int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core;
-    tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn];
-
-    erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE);
-    new_cmd = (routing_cmd_t *)&erisc_command[0];
-    read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
+    const scoped_lock<named_mutex> lock(
+        *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num()));
+
+    int& active_core_for_txn =
+        non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core;
+    tt_cxy_pair remote_transfer_ethernet_core =
+        remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn];
+
+    erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE);
+    new_cmd = (routing_cmd_t*)&erisc_command[0];
+    read_device_memory(
+        erisc_q_ptrs.data(),
+        remote_transfer_ethernet_core,
+        eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+        eth_interface_params.remote_update_ptr_size_bytes * 2,
+        read_tlb);
     uint32_t full_count = 0;
     uint32_t offset = 0;
     uint32_t block_size;
@@ -1814,40 +2180,55 @@ void tt_SiliconDevice::write_to_non_mmio_device(
     erisc_q_rptr[0] = erisc_q_ptrs[4];
     while (offset < size_in_bytes) {
         while (full) {
-            read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb);
-            full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]);
+            read_device_memory(
+                erisc_q_rptr.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes +
+                    eth_interface_params.remote_update_ptr_size_bytes,
+                DATA_WORD_SIZE,
+                read_tlb);
+            full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]);
             full_count++;
         }
-        //full = true;
-        // set full only if this command will make the q full.
-        // otherwise full stays false so that we do not poll the rd pointer in next iteration.
-        // As long as current command push does not fill up the queue completely, we do not want
-        // to poll rd pointer in every iteration.
-        //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]);
+        // full = true;
+        //  set full only if this command will make the q full.
+        //  otherwise full stays false so that we do not poll the rd pointer in next iteration.
+        //  As long as current command push does not fill up the queue completely, we do not want
+        //  to poll rd pointer in every iteration.
+        // full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]);
 
         uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask;
-        if ((address + offset) & 0x1F) { // address not 32-byte aligned
-            block_size = DATA_WORD_SIZE; // 4 byte aligned
+        if ((address + offset) & 0x1F) {  // address not 32-byte aligned
+            block_size = DATA_WORD_SIZE;  // 4 byte aligned
         } else {
             // For broadcast we prepend a 32byte header. Decrease block size (size of payload) by this amount.
-            block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset : max_block_size - 32 * broadcast;
+            block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset
+                                                                                  : max_block_size - 32 * broadcast;
             // Explictly align block_size to 4 bytes, in case the input buffer is not uint32_t aligned
             uint32_t alignment_mask = sizeof(uint32_t) - 1;
             block_size = (block_size + alignment_mask) & ~alignment_mask;
         }
-        // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size in the last block
-        uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied
+        // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size
+        // in the last block
+        uint64_t transfer_size =
+            std::min(block_size, size_in_bytes - offset);  // Host side data size that needs to be copied
         // Use block mode for broadcast
-        uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req;
-        uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack;
+        uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE))
+                                 ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp)
+                                 : eth_interface_params.cmd_wr_req;
+        uint32_t resp_flags = block_size > DATA_WORD_SIZE
+                                  ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack)
+                                  : eth_interface_params.cmd_wr_ack;
         timestamp = 0;
-        
-        if(broadcast) {
+
+        if (broadcast) {
             req_flags |= eth_interface_params.cmd_broadcast;
         }
 
-        uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size;
-        uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0.
+        uint32_t host_dram_block_addr =
+            host_address_params.eth_routing_buffers_start +
+            (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size;
+        uint16_t host_dram_channel = 0;  // This needs to be 0, since WH can only map ETH buffers to chan 0.
 
         if (req_flags & eth_interface_params.cmd_data_block) {
             // Copy data to sysmem or device DRAM for Block mode
@@ -1856,46 +2237,61 @@ void tt_SiliconDevice::write_to_non_mmio_device(
                 resp_flags |= eth_interface_params.cmd_data_block_dram;
                 size_buffer_to_capacity(data_block, block_size);
                 memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size);
-                if(broadcast) {
+                if (broadcast) {
                     // Write broadcast header to sysmem
-                    write_to_sysmem(broadcast_header.data(), broadcast_header.size() * sizeof(uint32_t), host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical);
+                    write_to_sysmem(
+                        broadcast_header.data(),
+                        broadcast_header.size() * sizeof(uint32_t),
+                        host_dram_block_addr,
+                        host_dram_channel,
+                        mmio_capable_chip_logical);
                 }
                 // Write payload to sysmem
-                write_to_sysmem(data_block.data(), data_block.size() * DATA_WORD_SIZE, host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, host_dram_channel, mmio_capable_chip_logical);
+                write_to_sysmem(
+                    data_block.data(),
+                    data_block.size() * DATA_WORD_SIZE,
+                    host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast,
+                    host_dram_channel,
+                    mmio_capable_chip_logical);
 
             } else {
                 uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size;
                 size_buffer_to_capacity(data_block, block_size);
                 memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size);
-                write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb);
+                write_device_memory(
+                    data_block.data(),
+                    data_block.size() * DATA_WORD_SIZE,
+                    remote_transfer_ethernet_core,
+                    buf_address,
+                    write_tlb);
             }
             tt_driver_atomics::sfence();
         }
 
         // Send the read request
-        log_assert(broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned.
-        
-        if(broadcast) {
+        log_assert(
+            broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0),
+            "Block mode address must be 32-byte aligned.");  // Block mode address must be 32-byte aligned.
+
+        if (broadcast) {
             // Only specify endpoint local address for broadcast
             new_cmd->sys_addr = address + offset;
-        }
-        else {
-            new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset);
+        } else {
+            new_cmd->sys_addr =
+                get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset);
             new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip));
         }
-            
-        if(req_flags & eth_interface_params.cmd_data_block) {
+
+        if (req_flags & eth_interface_params.cmd_data_block) {
             // Block mode
             new_cmd->data = block_size + BROADCAST_HEADER_SIZE * broadcast;
-        }
-        else {
-            if(size_in_bytes - offset < sizeof(uint32_t)) {
+        } else {
+            if (size_in_bytes - offset < sizeof(uint32_t)) {
                 // Handle misalignment at the end of the buffer:
                 // Assemble a padded uint32_t from single bytes, in case we have less than 4 bytes remaining
                 memcpy(&new_cmd->data, static_cast<const uint8_t*>(mem_ptr) + offset, size_in_bytes - offset);
-            }
-            else {
-                new_cmd->data = *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE);
+            } else {
+                new_cmd->data = *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE);
             }
         }
 
@@ -1903,14 +2299,24 @@ void tt_SiliconDevice::write_to_non_mmio_device(
         if (use_dram) {
             new_cmd->src_addr_tag = host_dram_block_addr;
         }
-        write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);
+        write_device_memory(
+            erisc_command.data(),
+            erisc_command.size() * DATA_WORD_SIZE,
+            remote_transfer_ethernet_core,
+            eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr),
+            write_tlb);
         tt_driver_atomics::sfence();
 
         erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask;
         std::vector<std::uint32_t> erisc_q_wptr;
         erisc_q_wptr.resize(1);
         erisc_q_wptr[0] = erisc_q_ptrs[0];
-        write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
+        write_device_memory(
+            erisc_q_wptr.data(),
+            erisc_q_wptr.size() * DATA_WORD_SIZE,
+            remote_transfer_ethernet_core,
+            eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+            write_tlb);
         tt_driver_atomics::sfence();
 
         offset += transfer_size;
@@ -1923,10 +2329,19 @@ void tt_SiliconDevice::write_to_non_mmio_device(
         if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) {
             active_core_for_txn++;
             uint32_t update_mask_for_chip = remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1;
-            active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID);
+            active_core_for_txn =
+                non_mmio_transfer_cores_customized
+                    ? (active_core_for_txn & update_mask_for_chip)
+                    : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID);
             // active_core = (active_core & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID;
-            remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn];
-            read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
+            remote_transfer_ethernet_core =
+                remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn];
+            read_device_memory(
+                erisc_q_ptrs.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+                eth_interface_params.remote_update_ptr_size_bytes * 2,
+                read_tlb);
             full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]);
             erisc_q_rptr[0] = erisc_q_ptrs[4];
         }
@@ -1934,11 +2349,12 @@ void tt_SiliconDevice::write_to_non_mmio_device(
 }
 
 /*
- * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue
- * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above
+ * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core
+ * (host) command queue. DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring
+ * the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above.
  */
-void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) {
-
+void tt_SiliconDevice::read_from_non_mmio_device(
+    void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) {
     using data_word_t = uint32_t;
     constexpr int DATA_WORD_SIZE = sizeof(data_word_t);
     std::string write_tlb = "LARGE_WRITE_TLB";
@@ -1946,33 +2362,50 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
     std::string empty_tlb = "";
     translate_to_noc_table_coords(core.chip, core.y, core.x);
 
-    const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip);
+    const auto& mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip);
     const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip);
 
     std::vector<std::uint32_t> erisc_command;
     std::vector<std::uint32_t> erisc_q_rptr;
-    std::vector<std::uint32_t> erisc_q_ptrs = std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes*2 / DATA_WORD_SIZE);
+    std::vector<std::uint32_t> erisc_q_ptrs =
+        std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes * 2 / DATA_WORD_SIZE);
     std::vector<std::uint32_t> erisc_resp_q_wptr = std::vector<uint32_t>(1);
     std::vector<std::uint32_t> erisc_resp_q_rptr = std::vector<uint32_t>(1);
 
-
     std::vector<std::uint32_t> data_block;
 
-    routing_cmd_t *new_cmd;
+    routing_cmd_t* new_cmd;
 
-    erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE);
-    new_cmd = (routing_cmd_t *)&erisc_command[0];
+    erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE);
+    new_cmd = (routing_cmd_t*)&erisc_command[0];
 
     //
     //                    MUTEX ACQUIRE (NON-MMIO)
     //  do not locate any ethernet core reads/writes before this acquire
     //
-    const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num()));
+    const scoped_lock<named_mutex> lock(
+        *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num()));
     const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0);
 
-    read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
-    read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb);
-    read_device_memory(erisc_resp_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb);
+    read_device_memory(
+        erisc_q_ptrs.data(),
+        remote_transfer_ethernet_core,
+        eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+        eth_interface_params.remote_update_ptr_size_bytes * 2,
+        read_tlb);
+    read_device_memory(
+        erisc_resp_q_wptr.data(),
+        remote_transfer_ethernet_core,
+        eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+        DATA_WORD_SIZE,
+        read_tlb);
+    read_device_memory(
+        erisc_resp_q_rptr.data(),
+        remote_transfer_ethernet_core,
+        eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes +
+            eth_interface_params.remote_update_ptr_size_bytes,
+        DATA_WORD_SIZE,
+        read_tlb);
 
     bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]);
     erisc_q_rptr.resize(1);
@@ -1990,25 +2423,34 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
 
     while (offset < size_in_bytes) {
         while (full) {
-            read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb);
-            full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]);
+            read_device_memory(
+                erisc_q_rptr.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes +
+                    eth_interface_params.remote_update_ptr_size_bytes,
+                DATA_WORD_SIZE,
+                read_tlb);
+            full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]);
         }
 
         uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask;
-        if ((address + offset) & 0x1F) { // address not 32-byte aligned
-            block_size = DATA_WORD_SIZE; // 4 byte aligned block
+        if ((address + offset) & 0x1F) {  // address not 32-byte aligned
+            block_size = DATA_WORD_SIZE;  // 4 byte aligned block
         } else {
             block_size = offset + max_block_size > size_in_bytes ? size_in_bytes - offset : max_block_size;
             // Align up to 4 bytes.
             uint32_t alignment_mask = sizeof(uint32_t) - 1;
             block_size = (block_size + alignment_mask) & ~alignment_mask;
-
         }
-        uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) : eth_interface_params.cmd_rd_req;
-        uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) : eth_interface_params.cmd_rd_data;
+        uint32_t req_flags = block_size > DATA_WORD_SIZE
+                                 ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req)
+                                 : eth_interface_params.cmd_rd_req;
+        uint32_t resp_flags = block_size > DATA_WORD_SIZE
+                                  ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data)
+                                  : eth_interface_params.cmd_rd_data;
         uint32_t resp_rd_ptr = erisc_resp_q_rptr[0] & eth_interface_params.cmd_buf_size_mask;
         uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + resp_rd_ptr * max_block_size;
-        uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0.
+        uint16_t host_dram_channel = 0;  // This needs to be 0, since WH can only map ETH buffers to chan 0.
 
         if (use_dram && block_size > DATA_WORD_SIZE) {
             req_flags |= eth_interface_params.cmd_data_block_dram;
@@ -2016,22 +2458,36 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
         }
 
         // Send the read request
-        log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned.
-        new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset);
+        log_assert(
+            (req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0),
+            "Block mode offset must be 32-byte aligned.");  // Block mode offset must be 32-byte aligned.
+        new_cmd->sys_addr =
+            get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset);
         new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip));
         new_cmd->data = block_size;
         new_cmd->flags = req_flags;
         if (use_dram) {
             new_cmd->src_addr_tag = host_dram_block_addr;
         }
-        write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);;
+        write_device_memory(
+            erisc_command.data(),
+            erisc_command.size() * DATA_WORD_SIZE,
+            remote_transfer_ethernet_core,
+            eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr),
+            write_tlb);
+        ;
         tt_driver_atomics::sfence();
 
         erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask;
         std::vector<std::uint32_t> erisc_q_wptr;
         erisc_q_wptr.resize(1);
         erisc_q_wptr[0] = erisc_q_ptrs[0];
-        write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb);
+        write_device_memory(
+            erisc_q_wptr.data(),
+            erisc_q_wptr.size() * DATA_WORD_SIZE,
+            remote_transfer_ethernet_core,
+            eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+            write_tlb);
         tt_driver_atomics::sfence();
         // If there is more data to read and this command will make the q full, set full to 1.
         // otherwise full stays false so that we do not poll the rd pointer in next iteration.
@@ -2039,7 +2495,12 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
         // to poll rd pointer in every iteration.
 
         if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]), erisc_q_rptr[0])) {
-            read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
+            read_device_memory(
+                erisc_q_ptrs.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+                eth_interface_params.remote_update_ptr_size_bytes * 2,
+                read_tlb);
             full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]);
             erisc_q_rptr[0] = erisc_q_ptrs[4];
         }
@@ -2055,13 +2516,23 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
         // So we have to wait for wrptr to advance, then wait for flags to be nonzero, then read data.
 
         do {
-            read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb);
+            read_device_memory(
+                erisc_resp_q_wptr.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+                DATA_WORD_SIZE,
+                read_tlb);
         } while (erisc_resp_q_rptr[0] == erisc_resp_q_wptr[0]);
         tt_driver_atomics::lfence();
         uint32_t flags_offset = 12 + sizeof(routing_cmd_t) * resp_rd_ptr;
         std::vector<std::uint32_t> erisc_resp_flags = std::vector<uint32_t>(1);
         do {
-            read_device_memory(erisc_resp_flags.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + flags_offset, DATA_WORD_SIZE, read_tlb);
+            read_device_memory(
+                erisc_resp_flags.data(),
+                remote_transfer_ethernet_core,
+                eth_interface_params.response_routing_cmd_queue_base + flags_offset,
+                DATA_WORD_SIZE,
+                read_tlb);
         } while (erisc_resp_flags[0] == 0);
 
         if (erisc_resp_flags[0] == resp_flags) {
@@ -2069,27 +2540,40 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
             uint32_t data_offset = 8 + sizeof(routing_cmd_t) * resp_rd_ptr;
             if (block_size == DATA_WORD_SIZE) {
                 std::vector<std::uint32_t> erisc_resp_data = std::vector<uint32_t>(1);
-                read_device_memory(erisc_resp_data.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + data_offset, DATA_WORD_SIZE, read_tlb);
-                if(size_in_bytes - offset < 4)  {
+                read_device_memory(
+                    erisc_resp_data.data(),
+                    remote_transfer_ethernet_core,
+                    eth_interface_params.response_routing_cmd_queue_base + data_offset,
+                    DATA_WORD_SIZE,
+                    read_tlb);
+                if (size_in_bytes - offset < 4) {
                     // Handle misaligned (4 bytes) data at the end of the block.
                     // Only read remaining bytes into the host buffer, instead of reading the full uint32_t
                     std::memcpy((uint8_t*)mem_ptr + offset, erisc_resp_data.data(), size_in_bytes - offset);
-                }
-                else {
-                    *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE) = erisc_resp_data[0];
+                } else {
+                    *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE) = erisc_resp_data[0];
                 }
             } else {
                 // Read 4 byte aligned block from device/sysmem
                 if (use_dram) {
                     size_buffer_to_capacity(data_block, block_size);
-                    read_from_sysmem(data_block.data(), host_dram_block_addr, host_dram_channel, block_size, mmio_capable_chip_logical);
+                    read_from_sysmem(
+                        data_block.data(),
+                        host_dram_block_addr,
+                        host_dram_channel,
+                        block_size,
+                        mmio_capable_chip_logical);
                 } else {
-                    uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size;
+                    uint32_t buf_address =
+                        eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size;
                     size_buffer_to_capacity(data_block, block_size);
-                    read_device_memory(data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb);
+                    read_device_memory(
+                        data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb);
                 }
                 // assert(mem_ptr.size() - (offset/DATA_WORD_SIZE) >= (block_size * DATA_WORD_SIZE));
-                log_assert((data_block.size() * DATA_WORD_SIZE) >= block_size, "Incorrect data size read back from sysmem/device");
+                log_assert(
+                    (data_block.size() * DATA_WORD_SIZE) >= block_size,
+                    "Incorrect data size read back from sysmem/device");
                 // Account for misalignment by skipping any padding bytes in the copied data_block
                 memcpy((uint8_t*)mem_ptr + offset, data_block.data(), std::min(block_size, size_in_bytes - offset));
             }
@@ -2097,40 +2581,53 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core
 
         // Finally increment the rdptr for the response command q
         erisc_resp_q_rptr[0] = (erisc_resp_q_rptr[0] + 1) & eth_interface_params.cmd_buf_ptr_mask;
-        write_device_memory(erisc_resp_q_rptr.data(), erisc_resp_q_rptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + eth_interface_params.cmd_counters_size_bytes, write_tlb);
+        write_device_memory(
+            erisc_resp_q_rptr.data(),
+            erisc_resp_q_rptr.size() * DATA_WORD_SIZE,
+            remote_transfer_ethernet_core,
+            eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) +
+                eth_interface_params.cmd_counters_size_bytes,
+            write_tlb);
         tt_driver_atomics::sfence();
         log_assert(erisc_resp_flags[0] == resp_flags, "Unexpected ERISC Response Flags.");
 
         offset += block_size;
     }
-
 }
 
 void tt_SiliconDevice::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) {
-    if(flush_non_mmio_per_chip[chip_id]) {
+    if (flush_non_mmio_per_chip[chip_id]) {
         log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole");
         std::string read_tlb = "LARGE_READ_TLB";
         auto chips_with_mmio = this->get_target_mmio_device_ids();
 
         if (chips_with_mmio.find(chip_id) == chips_with_mmio.end()) {
-            log_debug(LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id);
+            log_debug(
+                LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id);
             return;
         }
 
         if (arch_name == tt::ARCH::WORMHOLE_B0) {
             std::vector<std::uint32_t> erisc_txn_counters = std::vector<uint32_t>(2);
-            std::vector<std::uint32_t> erisc_q_ptrs = std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t));
+            std::vector<std::uint32_t> erisc_q_ptrs =
+                std::vector<uint32_t>(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t));
 
-            //wait for all queues to be empty.
-            for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) {
+            // wait for all queues to be empty.
+            for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) {
                 do {
-                    read_device_memory(erisc_q_ptrs.data(), cxy, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb);
+                    read_device_memory(
+                        erisc_q_ptrs.data(),
+                        cxy,
+                        eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes,
+                        eth_interface_params.remote_update_ptr_size_bytes * 2,
+                        read_tlb);
                 } while (erisc_q_ptrs[0] != erisc_q_ptrs[4]);
             }
-            //wait for all write responses to come back.
-            for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) {
+            // wait for all write responses to come back.
+            for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) {
                 do {
-                    read_device_memory(erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb);
+                    read_device_memory(
+                        erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb);
                 } while (erisc_txn_counters[0] != erisc_txn_counters[1]);
             }
         }
@@ -2138,7 +2635,6 @@ void tt_SiliconDevice::wait_for_connected_non_mmio_flush(const chip_id_t chip_id
     }
 }
 
-
 void tt_SiliconDevice::wait_for_non_mmio_flush(const chip_id_t chip_id) {
     log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole");
     std::string read_tlb = "LARGE_READ_TLB";
@@ -2159,39 +2655,48 @@ void tt_SiliconDevice::wait_for_non_mmio_flush() {
 }
 
 // Broadcast Functions
-void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull(std::set<std::pair<tt_xy_pair, tt_xy_pair>>& broadcast_grids,  std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude) {
+void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull(
+    std::set<std::pair<tt_xy_pair, tt_xy_pair>>& broadcast_grids,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude) {
     // If row 0 is not explicitly excluded, exclude it here since its non-tensix
     rows_to_exclude.insert(0);
     // If row 11 is excluded, we can close the SOC grid. If not, exclude row 12 to close grid.
-    if(rows_to_exclude.find(11) == rows_to_exclude.end()) {
+    if (rows_to_exclude.find(11) == rows_to_exclude.end()) {
         rows_to_exclude.insert(12);
     }
     // If col 0 is not explicitly excluded, exclude it here since its non-tensix
     cols_to_exclude.insert(0);
     // If col 12 is excluded, we can close the SOC grid. If not, exclude col 13 to close grid.
-    if(cols_to_exclude.find(12) == cols_to_exclude.end()) {
+    if (cols_to_exclude.find(12) == cols_to_exclude.end()) {
         cols_to_exclude.insert(13);
     }
     std::vector<std::pair<int, int>> bb_x_coords = {};
     std::vector<std::pair<int, int>> bb_y_coords = {};
 
     // Generate starting and ending x coordinates of each bounding box/grid
-    for(auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) {
-        if(x_it == std::prev(cols_to_exclude.end(), 1)) continue;
-        if(cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) {
+    for (auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) {
+        if (x_it == std::prev(cols_to_exclude.end(), 1)) {
+            continue;
+        }
+        if (cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and
+            cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) {
             bb_x_coords.push_back({*(x_it) + 1, *(std::next(x_it, 1)) - 1});
         }
     }
 
-    for(auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) {
-        if(y_it == std::prev(rows_to_exclude.end(), 1)) continue;
-        if(rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) {
+    for (auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) {
+        if (y_it == std::prev(rows_to_exclude.end(), 1)) {
+            continue;
+        }
+        if (rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and
+            rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) {
             bb_y_coords.push_back({*(y_it) + 1, *(std::next(y_it, 1)) - 1});
         }
     }
     // Assemble x and y coordinates into bounding box vertices
-    for(const auto& x_pair : bb_x_coords) {
-        for(const auto& y_pair : bb_y_coords) {
+    for (const auto& x_pair : bb_x_coords) {
+        for (const auto& y_pair : bb_y_coords) {
             tt_xy_pair top_left = tt_xy_pair(x_pair.first, y_pair.first);
             tt_xy_pair bot_right = tt_xy_pair(x_pair.second, y_pair.second);
             broadcast_grids.insert({top_left, bot_right});
@@ -2199,81 +2704,94 @@ void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull(std::set<st
     }
 }
 
-std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& tt_SiliconDevice::get_ethernet_broadcast_headers(const std::set<chip_id_t>& chips_to_exclude) {
+std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& tt_SiliconDevice::get_ethernet_broadcast_headers(
+    const std::set<chip_id_t>& chips_to_exclude) {
     // Generate headers for Ethernet Broadcast (WH) only. Each header corresponds to a unique broadcast "grid".
-    if(bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) {
+    if (bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) {
         bcast_header_cache[chips_to_exclude] = {};
-        std::unordered_map<chip_id_t, std::unordered_map<chip_id_t, std::vector<int>>> broadcast_mask_for_target_chips_per_group = {};
+        std::unordered_map<chip_id_t, std::unordered_map<chip_id_t, std::vector<int>>>
+            broadcast_mask_for_target_chips_per_group = {};
         std::map<std::vector<int>, std::tuple<chip_id_t, std::vector<int>>> broadcast_header_union_per_group = {};
         chip_id_t first_mmio_chip = *(get_target_mmio_device_ids().begin());
-        for(const auto& chip : target_devices_in_cluster) {
-            if(chips_to_exclude.find(chip) == chips_to_exclude.end()) {
+        for (const auto& chip : target_devices_in_cluster) {
+            if (chips_to_exclude.find(chip) == chips_to_exclude.end()) {
                 // Get shelf local physical chip id included in broadcast
-                chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip);
-                eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip);
+                chip_id_t physical_chip_id = ndesc->get_shelf_local_physical_chip_coords(chip);
+                eth_coord_t eth_coords = ndesc->get_chip_locations().at(chip);
                 // Rack word to be set in header
                 uint32_t rack_word = std::get<2>(eth_coords) >> 2;
                 // Rack byte to be set in header
                 uint32_t rack_byte = std::get<2>(eth_coords) % 4;
                 // 1st level grouping: Group broadcasts based on the MMIO chip they must go through
-                // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip
-                // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip
-                // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list.
+                // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each
+                // set connected to host through its closest MMIO chip. For the first shelf, pass broadcasts to specific
+                // chips through their closest MMIO chip. All other shelves are fully connected galaxy grids. These are
+                // connected to all MMIO devices. Use any (or the first) MMIO device in the list.
                 chip_id_t closest_mmio_chip = 0;
                 if (std::get<2>(eth_coords) == 0 && std::get<3>(eth_coords) == 0) {
-                    // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart.
-                    closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip);
-                }
-                else {
-                    // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are connected.
+                    // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its
+                    // own MMIO counterpart.
+                    closest_mmio_chip = ndesc->get_closest_mmio_capable_chip(chip);
+                } else {
+                    // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are
+                    // connected.
                     closest_mmio_chip = first_mmio_chip;
                 }
-                if(broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == broadcast_mask_for_target_chips_per_group.end()) {
+                if (broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) ==
+                    broadcast_mask_for_target_chips_per_group.end()) {
                     broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}});
                 }
-                // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves that contain this physical id.
-                if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) {
+                // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves
+                // that contain this physical id.
+                if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) ==
+                    broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) {
                     // Target seen for the first time.
                     std::vector<int> broadcast_mask(8, 0);
                     broadcast_mask.at(rack_word) |= (1 << std::get<3>(eth_coords)) << rack_byte;
                     broadcast_mask.at(3) |= 1 << physical_chip_id;
-                    broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask});
+                    broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip)
+                        .insert({physical_chip_id, broadcast_mask});
 
-                }
-                else {
+                } else {
                     // Target was seen before -> include curr rack and shelf in header
-                    broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast<uint32_t>(1 << std::get<3>(eth_coords)) << rack_byte;
+                    broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip)
+                        .at(physical_chip_id)
+                        .at(rack_word) |= static_cast<uint32_t>(1 << std::get<3>(eth_coords)) << rack_byte;
                 }
             }
         }
-        // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The number of groups after this step represent the final set of broadcast grids.
-        for(auto& mmio_group : broadcast_mask_for_target_chips_per_group) {
-            for(auto& chip : mmio_group.second) {
+        // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The
+        // number of groups after this step represent the final set of broadcast grids.
+        for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) {
+            for (auto& chip : mmio_group.second) {
                 // Generate a hash for this MMIO Chip + Rack + Shelf group
-                std::vector<int> header_hash = {mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)};
-                if(broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) {
-                    broadcast_header_union_per_group.insert({header_hash, std::make_tuple(mmio_group.first, chip.second)});
-                }
-                else {
+                std::vector<int> header_hash = {
+                    mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)};
+                if (broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) {
+                    broadcast_header_union_per_group.insert(
+                        {header_hash, std::make_tuple(mmio_group.first, chip.second)});
+                } else {
                     // If group found, update chip header entry
                     std::get<1>(broadcast_header_union_per_group.at(header_hash)).at(3) |= chip.second.at(3);
                 }
             }
         }
         // Get all broadcast headers per MMIO group
-        for(const auto& header : broadcast_header_union_per_group) {
+        for (const auto& header : broadcast_header_union_per_group) {
             chip_id_t mmio_chip = std::get<0>(header.second);
-            if(bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) {
+            if (bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) {
                 bcast_header_cache[chips_to_exclude].insert({mmio_chip, {}});
             }
             bcast_header_cache[chips_to_exclude].at(mmio_chip).push_back(std::get<1>(header.second));
         }
         // Invert headers (FW convention)
-        for(auto& bcast_group : bcast_header_cache[chips_to_exclude]) {
-            for(auto& header : bcast_group.second) {
+        for (auto& bcast_group : bcast_header_cache[chips_to_exclude]) {
+            for (auto& header : bcast_group.second) {
                 int header_idx = 0;
-                for(auto& header_entry : header) {
-                    if(header_idx == 4) break;
+                for (auto& header_entry : header) {
+                    if (header_idx == 4) {
+                        break;
+                    }
                     header_entry = ~header_entry;
                     header_idx++;
                 }
@@ -2283,14 +2801,23 @@ std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& tt_SiliconDevice::
     return bcast_header_cache[chips_to_exclude];
 }
 
-void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) {
-    // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH.
-    PCIDevice *pci_device = get_pci_device(chip);
+void tt_SiliconDevice::pcie_broadcast_write(
+    chip_id_t chip,
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    std::uint32_t addr,
+    const tt_xy_pair& start,
+    const tt_xy_pair& end,
+    const std::string& fallback_tlb) {
+    // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet
+    // Broadcast for WH.
+    PCIDevice* pci_device = get_pci_device(chip);
     const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
     const uint8_t* buffer_addr = static_cast<const uint8_t*>(mem_ptr);
     const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, pci_device->get_device_num()));
-    while(size_in_bytes > 0) {
-        auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb));
+    while (size_in_bytes > 0) {
+        auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(
+            tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb));
         uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size);
         pci_device->write_block(mapped_address, transfer_size, buffer_addr);
 
@@ -2300,155 +2827,235 @@ void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr,
     }
 }
 
-inline bool tensix_or_eth_in_broadcast(const std::set<uint32_t>& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) {
+inline bool tensix_or_eth_in_broadcast(
+    const std::set<uint32_t>& cols_to_exclude,
+    const tt::umd::architecture_implementation* architecture_implementation) {
     bool found_tensix_or_eth = false;
-    for(const auto& col : architecture_implementation->get_t6_x_locations()) {
+    for (const auto& col : architecture_implementation->get_t6_x_locations()) {
         found_tensix_or_eth |= (cols_to_exclude.find(col) == cols_to_exclude.end());
     }
     return found_tensix_or_eth;
 }
 
-inline bool valid_tensix_broadcast_grid(const std::set<uint32_t>& rows_to_exclude, const std::set<uint32_t>& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) {
+inline bool valid_tensix_broadcast_grid(
+    const std::set<uint32_t>& rows_to_exclude,
+    const std::set<uint32_t>& cols_to_exclude,
+    const tt::umd::architecture_implementation* architecture_implementation) {
     bool t6_bcast_rows_complete = true;
     bool t6_bcast_rows_empty = true;
-    
-    for(const auto& row : architecture_implementation->get_t6_y_locations()) {
+
+    for (const auto& row : architecture_implementation->get_t6_y_locations()) {
         t6_bcast_rows_complete &= (rows_to_exclude.find(row) == rows_to_exclude.end());
         t6_bcast_rows_empty &= (rows_to_exclude.find(row) != rows_to_exclude.end());
     }
     return t6_bcast_rows_complete || t6_bcast_rows_empty;
 }
 
-
-void tt_SiliconDevice::ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address,
-                                                const std::set<chip_id_t>& chips_to_exclude, const std::set<uint32_t>& rows_to_exclude, 
-                                                std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords) {
-    if(use_ethernet_broadcast) {
+void tt_SiliconDevice::ethernet_broadcast_write(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    const std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb,
+    bool use_virtual_coords) {
+    if (use_ethernet_broadcast) {
         // Broadcast through ERISC core supported
-        std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& broadcast_headers = get_ethernet_broadcast_headers(chips_to_exclude);
-        // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level broadcast headers on future/
+        std::unordered_map<chip_id_t, std::vector<std::vector<int>>>& broadcast_headers =
+            get_ethernet_broadcast_headers(chips_to_exclude);
+        // Apply row and column exclusion mask explicitly. Placing this here if we want to cache the higher level
+        // broadcast headers in the future.
         std::uint32_t row_exclusion_mask = 0;
         std::uint32_t col_exclusion_mask = 0;
-        for(const auto& row : rows_to_exclude) {
+        for (const auto& row : rows_to_exclude) {
             row_exclusion_mask |= 1 << row;
         }
 
-        for(const auto& col : cols_to_exclude) {
+        for (const auto& col : cols_to_exclude) {
             col_exclusion_mask |= 1 << (16 + col);
         }
         // Write broadcast block to device.
-        for(auto& mmio_group : broadcast_headers) {
-            for(auto& header : mmio_group.second) {
-                header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks
+        for (auto& mmio_group : broadcast_headers) {
+            for (auto& header : mmio_group.second) {
+                header.at(4) = use_virtual_coords * 0x8000;  // Reset row/col exclusion masks
                 header.at(4) |= row_exclusion_mask;
                 header.at(4) |= col_exclusion_mask;
                 // Write Target: x-y endpoint is a don't care. Initialize to tt_xy_pair(1, 1)
-                write_to_non_mmio_device(mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header);
+                write_to_non_mmio_device(
+                    mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header);
             }
         }
-    }
-    else {
+    } else {
         // Broadcast not supported. Implement this at the software level as a for loop
         std::vector<tt_cxy_pair> cores_to_write = {};
-        for(const auto& chip : target_devices_in_cluster) {
-            if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue;
-            for(const auto& core : get_soc_descriptor(chip).cores) {
-                if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) {
-                    write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb);
+        for (const auto& chip : target_devices_in_cluster) {
+            if (chips_to_exclude.find(chip) != chips_to_exclude.end()) {
+                continue;
+            }
+            for (const auto& core : get_soc_descriptor(chip).cores) {
+                if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and
+                    rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and
+                    core.second.type != CoreType::HARVESTED) {
+                    write_to_device(
+                        mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb);
                 }
             }
         }
     }
 }
 
-void tt_SiliconDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address,
-                       const std::set<chip_id_t>& chips_to_exclude, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude, const std::string& fallback_tlb) {
+void tt_SiliconDevice::broadcast_write_to_cluster(
+    const void* mem_ptr,
+    uint32_t size_in_bytes,
+    uint64_t address,
+    const std::set<chip_id_t>& chips_to_exclude,
+    std::set<uint32_t>& rows_to_exclude,
+    std::set<uint32_t>& cols_to_exclude,
+    const std::string& fallback_tlb) {
     if (arch_name == tt::ARCH::GRAYSKULL) {
         // Device FW disables broadcasts to all non tensix cores.
         std::vector<tt_xy_pair> dram_cores_to_write = {};
         std::vector<uint32_t> dram_rows = {0, 6};
         std::vector<uint32_t> dram_cols = {1, 4, 7, 10};
 
-        for(const auto& row : dram_rows) {
-            for(const auto& col : dram_cols) {
-                if(rows_to_exclude.find(row) == rows_to_exclude.end() and cols_to_exclude.find(col) == cols_to_exclude.end()) {
+        for (const auto& row : dram_rows) {
+            for (const auto& col : dram_cols) {
+                if (rows_to_exclude.find(row) == rows_to_exclude.end() and
+                    cols_to_exclude.find(col) == cols_to_exclude.end()) {
                     dram_cores_to_write.push_back(tt_xy_pair(col, row));
                 }
             }
         }
-        
+
         std::set<std::pair<tt_xy_pair, tt_xy_pair>> broadcast_grids = {};
         generate_tensix_broadcast_grids_for_grayskull(broadcast_grids, rows_to_exclude, cols_to_exclude);
-        for(const auto& chip : target_devices_in_cluster) {
-            if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue;
-            for(const auto& dram : dram_cores_to_write) {
+        for (const auto& chip : target_devices_in_cluster) {
+            if (chips_to_exclude.find(chip) != chips_to_exclude.end()) {
+                continue;
+            }
+            for (const auto& dram : dram_cores_to_write) {
                 write_device_memory(mem_ptr, size_in_bytes, tt_cxy_pair(chip, dram), address, fallback_tlb);
             }
-            for(const auto& grid : broadcast_grids) {
+            for (const auto& grid : broadcast_grids) {
                 pcie_broadcast_write(chip, mem_ptr, size_in_bytes, address, grid.first, grid.second, fallback_tlb);
             }
-        } 
-    }
-    else if (arch_name == tt::ARCH::BLACKHOLE) {
+        }
+    } else if (arch_name == tt::ARCH::BLACKHOLE) {
         auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
-        if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) {
-            log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole.");
-            if(cols_to_exclude.find(0) == cols_to_exclude.end()) {
+        if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) {
+            log_assert(
+                !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()),
+                "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole.");
+            if (cols_to_exclude.find(0) == cols_to_exclude.end()) {
                 // When broadcast includes column zero do not exclude anything
                 std::set<uint32_t> unsafe_rows = {};
                 std::set<uint32_t> cols_to_exclude_for_col_0_bcast = cols_to_exclude;
                 std::set<uint32_t> rows_to_exclude_for_col_0_bcast = rows_to_exclude;
                 cols_to_exclude_for_col_0_bcast.insert(9);
                 rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end());
-                ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                        rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false);
+                ethernet_broadcast_write(
+                    mem_ptr,
+                    size_in_bytes,
+                    address,
+                    chips_to_exclude,
+                    rows_to_exclude_for_col_0_bcast,
+                    cols_to_exclude_for_col_0_bcast,
+                    fallback_tlb,
+                    false);
             }
-            if(cols_to_exclude.find(9) == cols_to_exclude.end()) {
+            if (cols_to_exclude.find(9) == cols_to_exclude.end()) {
                 std::set<uint32_t> cols_to_exclude_for_col_9_bcast = cols_to_exclude;
                 cols_to_exclude_for_col_9_bcast.insert(0);
-                ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                        rows_to_exclude, cols_to_exclude_for_col_9_bcast, fallback_tlb, false);
+                ethernet_broadcast_write(
+                    mem_ptr,
+                    size_in_bytes,
+                    address,
+                    chips_to_exclude,
+                    rows_to_exclude,
+                    cols_to_exclude_for_col_9_bcast,
+                    fallback_tlb,
+                    false);
             }
+        } else {
+            log_assert(
+                use_virtual_coords_for_eth_broadcast or
+                    valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()),
+                "Must broadcast to all tensix rows when ERISC FW is < 6.8.0.");
+            ethernet_broadcast_write(
+                mem_ptr,
+                size_in_bytes,
+                address,
+                chips_to_exclude,
+                rows_to_exclude,
+                cols_to_exclude,
+                fallback_tlb,
+                use_virtual_coords_for_eth_broadcast);
         }
-        else {
-            log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), 
-                        "Must broadcast to all tensix rows when ERISC FW is < 6.8.0.");
-            ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                    rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast);
-        }
-    }
-    else {
+    } else {
         auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
-        if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) {
-            log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole.");
-            if(cols_to_exclude.find(0) == cols_to_exclude.end()) {
-                // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, since writing to these is unsafe
-                // ERISC FW does not exclude these.
+        if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) {
+            log_assert(
+                !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()),
+                "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole.");
+            if (cols_to_exclude.find(0) == cols_to_exclude.end()) {
+                // When broadcast includes column zero, exclude PCIe, ARC and router cores from broadcast explicitly,
+                // since writing to these is unsafe. ERISC FW does not exclude these.
                 std::set<uint32_t> unsafe_rows = {2, 3, 4, 8, 9, 10};
                 std::set<uint32_t> cols_to_exclude_for_col_0_bcast = cols_to_exclude;
                 std::set<uint32_t> rows_to_exclude_for_col_0_bcast = rows_to_exclude;
                 cols_to_exclude_for_col_0_bcast.insert(5);
                 rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end());
-                ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                        rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false);
+                ethernet_broadcast_write(
+                    mem_ptr,
+                    size_in_bytes,
+                    address,
+                    chips_to_exclude,
+                    rows_to_exclude_for_col_0_bcast,
+                    cols_to_exclude_for_col_0_bcast,
+                    fallback_tlb,
+                    false);
             }
-            if(cols_to_exclude.find(5) == cols_to_exclude.end()) {
+            if (cols_to_exclude.find(5) == cols_to_exclude.end()) {
                 std::set<uint32_t> cols_to_exclude_for_col_5_bcast = cols_to_exclude;
                 cols_to_exclude_for_col_5_bcast.insert(0);
-                ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                        rows_to_exclude, cols_to_exclude_for_col_5_bcast, fallback_tlb, false);
+                ethernet_broadcast_write(
+                    mem_ptr,
+                    size_in_bytes,
+                    address,
+                    chips_to_exclude,
+                    rows_to_exclude,
+                    cols_to_exclude_for_col_5_bcast,
+                    fallback_tlb,
+                    false);
             }
-        }
-        else {
-            log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), 
-                        "Must broadcast to all tensix rows when ERISC FW is < 6.8.0.");
-            ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude,
-                                    rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast);
-        }
-    }
-}
-
-int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) {
+        } else {
+            log_assert(
+                use_virtual_coords_for_eth_broadcast or
+                    valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()),
+                "Must broadcast to all tensix rows when ERISC FW is < 6.8.0.");
+            ethernet_broadcast_write(
+                mem_ptr,
+                size_in_bytes,
+                address,
+                chips_to_exclude,
+                rows_to_exclude,
+                cols_to_exclude,
+                fallback_tlb,
+                use_virtual_coords_for_eth_broadcast);
+        }
+    }
+}
+
+int tt_SiliconDevice::remote_arc_msg(
+    int chip,
+    uint32_t msg_code,
+    bool wait_for_done,
+    uint32_t arg0,
+    uint32_t arg1,
+    int timeout,
+    uint32_t* return_3,
+    uint32_t* return_4) {
     constexpr uint64_t ARC_RESET_SCRATCH_ADDR = 0x880030060;
     constexpr uint64_t ARC_RESET_MISC_CNTL_ADDR = 0x880030100;
 
@@ -2457,18 +3064,14 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_
     if ((msg_code & 0xff00) != 0xaa00) {
         log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code);
     }
-    log_assert (arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed
+    log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args");  // Only 16 bits are allowed
 
-    uint32_t fw_arg = arg0 | (arg1<<16);
+    uint32_t fw_arg = arg0 | (arg1 << 16);
     int exit_code = 0;
 
-    {
-        write_to_non_mmio_device(&fw_arg, sizeof(fw_arg),  core, ARC_RESET_SCRATCH_ADDR + 3 * 4);
-    }
+    { write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); }
 
-    {
-        write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4);
-    }
+    { write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); }
 
     wait_for_non_mmio_flush();
     uint32_t misc = 0;
@@ -2490,7 +3093,11 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_
             if (std::chrono::system_clock::now() - start > timeout_seconds) {
                 std::stringstream ss;
                 ss << std::hex << msg_code;
-                throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", timeout, chip, ss.str()));
+                throw std::runtime_error(fmt::format(
+                    "Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}",
+                    timeout,
+                    chip,
+                    ss.str()));
             }
 
             uint32_t status = 0;
@@ -2516,66 +3123,96 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_
     return exit_code;
 }
 
-void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size,  uint64_t addr, uint16_t channel, chip_id_t src_device_id) {
+void tt_SiliconDevice::write_to_sysmem(
+    const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) {
     write_buffer(mem_ptr, size, addr, channel, src_device_id);
 }
 
-void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) {
+void tt_SiliconDevice::read_from_sysmem(
+    void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) {
     read_buffer(mem_ptr, addr, channel, size, src_device_id);
 }
 
-void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set<tt_xy_pair>& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) {
-    tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered
+void tt_SiliconDevice::set_membar_flag(
+    const chip_id_t chip,
+    const std::unordered_set<tt_xy_pair>& cores,
+    const uint32_t barrier_value,
+    const uint32_t barrier_addr,
+    const std::string& fallback_tlb) {
+    tt_driver_atomics::sfence();  // Ensure that writes before this do not get reordered
     std::unordered_set<tt_xy_pair> cores_synced = {};
     std::vector<uint32_t> barrier_val_vec = {barrier_value};
     for (const auto& core : cores) {
-        write_to_device(barrier_val_vec.data(), barrier_val_vec.size() * sizeof(uint32_t), tt_cxy_pair(chip, core), barrier_addr, fallback_tlb);
-    }
-    tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed
+        write_to_device(
+            barrier_val_vec.data(),
+            barrier_val_vec.size() * sizeof(uint32_t),
+            tt_cxy_pair(chip, core),
+            barrier_addr,
+            fallback_tlb);
+    }
+    tt_driver_atomics::sfence();  // Ensure that all writes in the Host WC buffer are flushed
     while (cores_synced.size() != cores.size()) {
-        for(const auto& core : cores) {
+        for (const auto& core : cores) {
             if (cores_synced.find(core) == cores_synced.end()) {
                 uint32_t readback_val;
-                read_from_device(&readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb);
+                read_from_device(
+                    &readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb);
                 if (readback_val == barrier_value) {
                     cores_synced.insert(core);
-                }
-                else {
-                    log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value);
+                } else {
+                    log_trace(
+                        LogSiliconDriver,
+                        "Waiting for core {} to recieve mem bar flag {} in function",
+                        core.str(),
+                        barrier_value);
                 }
             }
         }
     }
     // Ensure that reads or writes after this do not get reordered.
     // Reordering can cause races where data gets transferred before the barrier has returned
-    tt_driver_atomics::mfence(); 
+    tt_driver_atomics::mfence();
 }
 
-void tt_SiliconDevice::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set<tt_xy_pair>& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) {
+void tt_SiliconDevice::insert_host_to_device_barrier(
+    const chip_id_t chip,
+    const std::unordered_set<tt_xy_pair>& cores,
+    const uint32_t barrier_addr,
+    const std::string& fallback_tlb) {
     // Ensure that this memory barrier is atomic across processes/threads
-    const scoped_lock<named_mutex> lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num()));
+    const scoped_lock<named_mutex> lock(
+        *get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num()));
     set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb);
     set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb);
 }
 
 void tt_SiliconDevice::init_membars() {
-    for(const auto& chip :  target_devices_in_cluster) {
-        if (ndesc -> is_chip_mmio_capable(chip)) {
-            set_membar_flag(chip, workers_per_chip.at(chip), tt_MemBarFlag::RESET, l1_address_params.tensix_l1_barrier_base, "LARGE_WRITE_TLB");
-            set_membar_flag(chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB");
-            set_membar_flag(chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB");
+    for (const auto& chip : target_devices_in_cluster) {
+        if (ndesc->is_chip_mmio_capable(chip)) {
+            set_membar_flag(
+                chip,
+                workers_per_chip.at(chip),
+                tt_MemBarFlag::RESET,
+                l1_address_params.tensix_l1_barrier_base,
+                "LARGE_WRITE_TLB");
+            set_membar_flag(
+                chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB");
+            set_membar_flag(
+                chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB");
         }
     }
 }
-void tt_SiliconDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
-    if (ndesc -> is_chip_mmio_capable(chip)) {
+
+void tt_SiliconDevice::l1_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
+    if (ndesc->is_chip_mmio_capable(chip)) {
         const auto& all_workers = workers_per_chip.at(chip);
         const auto& all_eth = eth_cores;
         if (cores.size()) {
             // Insert barrier on specific cores with L1
             std::unordered_set<tt_xy_pair> workers_to_sync = {};
             std::unordered_set<tt_xy_pair> eth_to_sync = {};
-            
+
             for (const auto& core : cores) {
                 if (all_workers.find(core) != all_workers.end()) {
                     workers_to_sync.insert(core);
@@ -2585,59 +3222,60 @@ void tt_SiliconDevice::l1_membar(const chip_id_t chip, const std::string& fallba
                     log_fatal("Can only insert an L1 Memory barrier on Tensix or Ethernet cores.");
                 }
             }
-            insert_host_to_device_barrier(chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb);
+            insert_host_to_device_barrier(
+                chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb);
             insert_host_to_device_barrier(chip, eth_to_sync, l1_address_params.eth_l1_barrier_base, fallback_tlb);
         } else {
             // Insert barrier on all cores with L1
             insert_host_to_device_barrier(chip, all_workers, l1_address_params.tensix_l1_barrier_base, fallback_tlb);
             insert_host_to_device_barrier(chip, all_eth, l1_address_params.eth_l1_barrier_base, fallback_tlb);
         }
-    }
-    else {
+    } else {
         wait_for_non_mmio_flush();
     }
 }
 
-void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
-    if (ndesc -> is_chip_mmio_capable(chip)) {
+void tt_SiliconDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<tt_xy_pair>& cores) {
+    if (ndesc->is_chip_mmio_capable(chip)) {
         if (cores.size()) {
-            for(const auto& core : cores) {
-                log_assert(dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores.");
+            for (const auto& core : cores) {
+                log_assert(
+                    dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores.");
             }
             insert_host_to_device_barrier(chip, cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb);
-        }
-        else {
+        } else {
             // Insert Barrier on all DRAM Cores
             insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb);
         }
-    }
-    else {
+    } else {
         wait_for_non_mmio_flush();
     }
 }
 
-void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
-    if (ndesc -> is_chip_mmio_capable(chip)) {
+void tt_SiliconDevice::dram_membar(
+    const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set<uint32_t>& channels) {
+    if (ndesc->is_chip_mmio_capable(chip)) {
         if (channels.size()) {
             std::unordered_set<tt_xy_pair> dram_cores_to_sync = {};
-            for(const auto& chan : channels) {
+            for (const auto& chan : channels) {
                 dram_cores_to_sync.insert(get_soc_descriptor(chip).get_core_for_dram_channel(chan, 0));
             }
-            insert_host_to_device_barrier(chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb);
-        }
-        else {
+            insert_host_to_device_barrier(
+                chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb);
+        } else {
             // Insert Barrier on all DRAM Cores
             insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb);
         }
-    }
-    else {
+    } else {
         wait_for_non_mmio_flush();
     }
 }
 
-void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {
-    bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip);
-    if(target_is_mmio_capable) {
+void tt_SiliconDevice::write_to_device(
+    const void* mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {
+    bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip);
+    if (target_is_mmio_capable) {
         if (fallback_tlb == "REG_TLB") {
             write_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb);
         } else {
@@ -2645,100 +3283,119 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx
         }
     } else {
         log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole");
-        log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!");
+        log_assert(
+            (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1,
+            "Cannot issue ethernet writes to a single chip cluster!");
         write_to_non_mmio_device(mem_ptr, size, core, addr);
     }
 }
 
-void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
-    PCIDevice *pci_device = get_pci_device(core.chip);
+void tt_SiliconDevice::read_mmio_device_register(
+    void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
+    PCIDevice* pci_device = get_pci_device(core.chip);
 
     const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
     const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, pci_device->get_device_num()));
     log_debug(LogSiliconDriver, "  dynamic tlb_index: {}", tlb_index);
 
-    auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict);
-    // Align block to 4bytes if needed. 
+    auto [mapped_address, tlb_size] =
+        pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict);
+    // Align block to 4bytes if needed.
     auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size);
     pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage);
 
-    if(aligned_buf.input_size != aligned_buf.block_size) {
+    if (aligned_buf.input_size != aligned_buf.block_size) {
         // Copy value from aligned buffer to main buffer.
         std::memcpy(mem_ptr, aligned_buf.local_storage, size);
     }
 }
 
-
-void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
-    PCIDevice *pci_device = get_pci_device(core.chip);
+void tt_SiliconDevice::write_mmio_device_register(
+    const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
+    PCIDevice* pci_device = get_pci_device(core.chip);
 
     const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
     const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, pci_device->get_device_num()));
     log_debug(LogSiliconDriver, "  dynamic tlb_index: {}", tlb_index);
 
-    auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict);
-    // Align block to 4bytes if needed. 
+    auto [mapped_address, tlb_size] =
+        pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict);
+    // Align block to 4bytes if needed.
     auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size);
-    if(aligned_buf.input_size != aligned_buf.block_size) {
+    if (aligned_buf.input_size != aligned_buf.block_size) {
         // Copy value from main buffer to aligned buffer
         std::memcpy(aligned_buf.local_storage, mem_ptr, size);
     }
     pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage);
 }
 
-void tt_SiliconDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
-    bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip);
+void tt_SiliconDevice::read_from_device(
+    void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) {
+    bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip);
     if (target_is_mmio_capable) {
         if (fallback_tlb == "REG_TLB") {
             read_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb);
         } else {
             read_device_memory(mem_ptr, core, addr, size, fallback_tlb);
         }
-    }
-    else {
-        log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole");    // MT: Use only dynamic TLBs and never program static
-        log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 &&  get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet reads from a single chip cluster!");
+    } else {
+        log_assert(
+            arch_name != tt::ARCH::BLACKHOLE,
+            "Non-MMIO targets not supported in Blackhole");  // MT: Use only dynamic TLBs and never program static
+        log_assert(
+            (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1,
+            "Cannot issue ethernet reads from a single chip cluster!");
         read_from_non_mmio_device(mem_ptr, core, addr, size);
     }
 }
 
-int tt_SiliconDevice::arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) {
+int tt_SiliconDevice::arc_msg(
+    int logical_device_id,
+    uint32_t msg_code,
+    bool wait_for_done,
+    uint32_t arg0,
+    uint32_t arg1,
+    int timeout,
+    uint32_t* return_3,
+    uint32_t* return_4) {
     log_assert(arch_name != tt::ARCH::BLACKHOLE, "ARC messages not supported in Blackhole");
-    if(ndesc -> is_chip_mmio_capable(logical_device_id)) {
+    if (ndesc->is_chip_mmio_capable(logical_device_id)) {
         return pcie_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4);
-    }
-    else {
+    } else {
         return remote_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4);
     }
 }
 
-void tt_SiliconDevice::send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) {
+void tt_SiliconDevice::send_tensix_risc_reset_to_core(
+    const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) {
     auto valid = soft_resets & ALL_TENSIX_SOFT_RESET;
-    uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type) valid;
+    uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type)valid;
     write_to_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0, "REG_TLB");
     tt_driver_atomics::sfence();
 }
 
-void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) {
+void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core(
+    const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) {
     auto valid = soft_resets & ALL_TENSIX_SOFT_RESET;
-    uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type) valid;
+    uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type)valid;
     write_to_non_mmio_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0);
     tt_driver_atomics::sfence();
 }
 
-int tt_SiliconDevice::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) {
+int tt_SiliconDevice::set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state) {
     auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip);
-    return remote_arc_msg(chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL);
+    return remote_arc_msg(
+        chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL);
 }
 
-
 void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) {
     uint32_t msg_success = 0x0;
     auto timeout_seconds = std::chrono::seconds(timeout);
     auto start = std::chrono::system_clock::now();
     while (msg_success != 1) {
         if (std::chrono::system_clock::now() - start > timeout_seconds) {
-            throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout));
+            throw std::runtime_error(
+                fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout));
         }
         int msg_rt = remote_arc_msg(chip, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success, NULL);
         if (msg_rt == MSG_ERROR_REPLY) {
@@ -2747,16 +3404,14 @@ void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int t
     }
 }
 
-
-void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) {
-    if(arch_name == tt::ARCH::GRAYSKULL) {
-        for (auto &device_it : m_pci_device_map) {
+void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets) {
+    if (arch_name == tt::ARCH::GRAYSKULL) {
+        for (auto& device_it : m_pci_device_map) {
             broadcast_pcie_tensix_risc_reset(device_it.first, soft_resets);
         }
-    }
-    else {
+    } else {
         auto valid = soft_resets & ALL_TENSIX_SOFT_RESET;
-        uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type) valid;
+        uint32_t valid_val = (std::underlying_type<TensixSoftResetOptions>::type)valid;
         std::set<chip_id_t> chips_to_exclude = {};
         std::set<uint32_t> rows_to_exclude;
         std::set<uint32_t> columns_to_exclude;
@@ -2768,7 +3423,14 @@ void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftRe
             columns_to_exclude = {0, 5};
         }
         std::string fallback_tlb = "LARGE_WRITE_TLB";
-        broadcast_write_to_cluster(&valid_val, sizeof(uint32_t), 0xFFB121B0, chips_to_exclude, rows_to_exclude, columns_to_exclude, fallback_tlb);
+        broadcast_write_to_cluster(
+            &valid_val,
+            sizeof(uint32_t),
+            0xFFB121B0,
+            chips_to_exclude,
+            rows_to_exclude,
+            columns_to_exclude,
+            fallback_tlb);
         // Ensure that reset signal is globally visible
         wait_for_non_mmio_flush();
     }
@@ -2777,22 +3439,23 @@ void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftRe
 void tt_SiliconDevice::set_power_state(tt_DevicePowerState device_state) {
     // MT Initial BH - ARC messages not supported in Blackhole
     if (arch_name != tt::ARCH::BLACKHOLE) {
-        for(auto& chip : target_devices_in_cluster) {
-            if(ndesc -> is_chip_mmio_capable(chip)) {
+        for (auto& chip : target_devices_in_cluster) {
+            if (ndesc->is_chip_mmio_capable(chip)) {
                 set_pcie_power_state(device_state);
             } else {
                 int exit_code = set_remote_power_state(chip, device_state);
-                log_assert(exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code);
+                log_assert(
+                    exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code);
             }
         }
     }
 }
 
 void tt_SiliconDevice::enable_ethernet_queue(int timeout) {
-    for (const chip_id_t &chip : target_devices_in_cluster) {
+    for (const chip_id_t& chip : target_devices_in_cluster) {
         auto arch = get_soc_descriptor(chip).arch;
 
-         switch (arch) {
+        switch (arch) {
             case tt::ARCH::WORMHOLE_B0: {
                 if (ndesc->is_chip_mmio_capable(chip)) {
                     enable_local_ethernet_queue(chip, timeout);
@@ -2801,20 +3464,17 @@ void tt_SiliconDevice::enable_ethernet_queue(int timeout) {
                 }
 
                 break;
-            case tt::ARCH::BLACKHOLE:
-                log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet");
+                case tt::ARCH::BLACKHOLE:
+                    log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet");
             }
             default: {
                 break;
             }
         }
-
     }
 }
 
-std::set<chip_id_t> tt_SiliconDevice::get_target_remote_device_ids() {
-    return target_remote_chips;
-}
+std::set<chip_id_t> tt_SiliconDevice::get_target_remote_device_ids() { return target_remote_chips; }
 
 void tt_SiliconDevice::deassert_resets_and_set_power_state() {
     // Assert tensix resets on all chips in cluster
@@ -2823,15 +3483,29 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() {
     // MT Initial BH - ARC messages not supported in Blackhole
     if (arch_name != tt::ARCH::BLACKHOLE) {
         // Send ARC Messages to deassert RISCV resets
-        for (auto &device_it : m_pci_device_map){
-            arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0);
-        }
-        if(ndesc != nullptr) {
-            for(const chip_id_t& chip : target_devices_in_cluster) {
-                if(!ndesc -> is_chip_mmio_capable(chip)) {
+        for (auto& device_it : m_pci_device_map) {
+            arc_msg(
+                device_it.first,
+                0xaa00 |
+                    device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(),
+                true,
+                0,
+                0);
+        }
+        if (ndesc != nullptr) {
+            for (const chip_id_t& chip : target_devices_in_cluster) {
+                if (!ndesc->is_chip_mmio_capable(chip)) {
                     auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip);
                     auto pci_device = get_pci_device(mmio_capable_chip_logical);
-                    remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL);
+                    remote_arc_msg(
+                        chip,
+                        0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(),
+                        true,
+                        0x0,
+                        0x0,
+                        1,
+                        NULL,
+                        NULL);
                 }
             }
             enable_ethernet_queue(30);
@@ -2842,11 +3516,16 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() {
 }
 
 void tt_SiliconDevice::verify_eth_fw() {
-    for(const auto& chip : target_devices_in_cluster) {
+    for (const auto& chip : target_devices_in_cluster) {
         uint32_t fw_version;
         std::vector<uint32_t> fw_versions;
-        for (const tt_xy_pair &eth_core : get_soc_descriptor(chip).ethernet_cores) {
-            read_from_device(&fw_version, tt_cxy_pair(chip, eth_core), l1_address_params.fw_version_addr, sizeof(uint32_t), "LARGE_READ_TLB");
+        for (const tt_xy_pair& eth_core : get_soc_descriptor(chip).ethernet_cores) {
+            read_from_device(
+                &fw_version,
+                tt_cxy_pair(chip, eth_core),
+                l1_address_params.fw_version_addr,
+                sizeof(uint32_t),
+                "LARGE_READ_TLB");
             fw_versions.push_back(fw_version);
         }
         verify_sw_fw_versions(chip, SW_VERSION, fw_versions);
@@ -2854,7 +3533,8 @@ void tt_SiliconDevice::verify_eth_fw() {
     }
 }
 
-void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector<std::uint32_t> &fw_versions) {
+void tt_SiliconDevice::verify_sw_fw_versions(
+    int device_id, std::uint32_t sw_version, std::vector<std::uint32_t>& fw_versions) {
     tt_version sw(sw_version), fw_first_eth_core(fw_versions.at(0));
     log_info(
         LogSiliconDriver,
@@ -2862,7 +3542,7 @@ void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_ver
         sw.str(),
         fw_first_eth_core.str(),
         device_id);
-    for (std::uint32_t &fw_version : fw_versions) {
+    for (std::uint32_t& fw_version : fw_versions) {
         tt_version fw(fw_version);
         log_assert(fw == fw_first_eth_core, "FW versions are not the same across different ethernet cores");
         log_assert(sw.major == fw.major, "SW/FW major version number out of sync");
@@ -2875,14 +3555,16 @@ void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_ver
     use_ethernet_broadcast &= fw_first_eth_core >= tt_version(6, 5, 0);
     // Virtual coordinates can be used for broadcast headers if ERISC FW >= 6.8.0 and NOC translation is enabled
     // Temporarily enable this feature for 6.7.241 as well for testing.
-    use_virtual_coords_for_eth_broadcast &= (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && translation_tables_en;
+    use_virtual_coords_for_eth_broadcast &=
+        (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) &&
+        translation_tables_en;
 }
 
-void tt_SiliconDevice::start_device(const tt_device_params &device_params) {
-    if(device_params.init_device) {
+void tt_SiliconDevice::start_device(const tt_device_params& device_params) {
+    if (device_params.init_device) {
         initialize_pcie_devices();
         // MT Initial BH - Ethernet firmware not present in Blackhole
-        if(arch_name == tt::ARCH::WORMHOLE_B0) {
+        if (arch_name == tt::ARCH::WORMHOLE_B0) {
             verify_eth_fw();
         }
         deassert_resets_and_set_power_state();
@@ -2894,7 +3576,6 @@ void tt_SiliconDevice::close_device() {
     broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET);
 }
 
-
 void tt_SiliconDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {
     l1_address_params = l1_address_params_;
 }
@@ -2911,25 +3592,30 @@ void tt_SiliconDevice::set_driver_eth_interface_params(const tt_driver_eth_inter
     eth_interface_params = eth_interface_params_;
 }
 
-void tt_SiliconDevice::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
+void tt_SiliconDevice::setup_core_to_tlb_map(
+    const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
     map_core_to_tlb_per_chip[logical_device_id] = mapping_function;
     tlbs_init_per_chip[logical_device_id] = true;
 }
 
 std::uint32_t tt_SiliconDevice::get_num_dram_channels(std::uint32_t device_id) {
-    log_assert(target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), "Querying DRAM parameters for a device that does not exist.");
+    log_assert(
+        target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(),
+        "Querying DRAM parameters for a device that does not exist.");
     return get_soc_descriptor(device_id).get_num_dram_channels();
 }
 
 std::uint64_t tt_SiliconDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {
     log_assert(channel < get_num_dram_channels(device_id), "Querying size for a device channel that does not exist.");
-    return  get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now
+    return get_soc_descriptor(device_id).dram_bank_size;  // Space per channel is identical for now
 }
 
 std::uint32_t tt_SiliconDevice::get_num_host_channels(std::uint32_t device_id) {
     auto devices = get_target_mmio_device_ids();
-    log_assert(devices.find(device_id) != devices.end(), "Querying Host Address parameters for a non-mmio device or a device does not exist.");
-    return m_num_host_mem_channels; // Same number of host channels per device for now
+    log_assert(
+        devices.find(device_id) != devices.end(),
+        "Querying Host Address parameters for a non-mmio device or a device does not exist.");
+    return m_num_host_mem_channels;  // Same number of host channels per device for now
 }
 
 std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {
@@ -2945,20 +3631,20 @@ std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t devi
 std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const {
     // TODO: Should probably be lowered to TTDevice.
     tt::ARCH arch = get_soc_descriptor(chip_id).arch;
-    if(arch == tt::ARCH::WORMHOLE_B0) {
+    if (arch == tt::ARCH::WORMHOLE_B0) {
         return 0x800000000;
-    }
-    else if (arch == tt::ARCH::BLACKHOLE) {
+    } else if (arch == tt::ARCH::BLACKHOLE) {
         // Enable 4th ATU window.
         return 1ULL << 60;
-    }
-    else {
+    } else {
         return 0;
     }
 }
 
 tt_version tt_SiliconDevice::get_ethernet_fw_version() const {
     log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures.");
-    log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version.");
+    log_assert(
+        eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff,
+        "Device must be started before querying Ethernet FW version.");
     return eth_fw_version;
 }
diff --git a/device/tt_silicon_driver_common.cpp b/device/tt_silicon_driver_common.cpp
index a7429b8e..43905200 100644
--- a/device/tt_silicon_driver_common.cpp
+++ b/device/tt_silicon_driver_common.cpp
@@ -3,37 +3,37 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "device/tt_silicon_driver_common.hpp"
-#include "tt_xy_pair.h"
+
 #include "tt_device.h"
+#include "tt_xy_pair.h"
 
 std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value) {
     std::string output;
 
-    if((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) {
         output += "BRISC | ";
     }
-    if((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) {
         output += "TRISC0 | ";
     }
-    if((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) {
         output += "TRISC1 | ";
     }
-    if((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) {
         output += "TRISC2 | ";
     }
-    if((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) {
         output += "NCRISC | ";
     }
-    if((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) {
+    if ((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) {
         output += "STAGGERED_START | ";
     }
 
-  if(output.empty()) {
-    output = "UNKNOWN";
-  } else {
-    output.erase(output.end() - 3, output.end());
-  }
+    if (output.empty()) {
+        output = "UNKNOWN";
+    } else {
+        output.erase(output.end() - 3, output.end());
+    }
 
-  return output;
+    return output;
 }
-
diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp
index 9f275668..6dc6d7f4 100644
--- a/device/tt_silicon_driver_common.hpp
+++ b/device/tt_silicon_driver_common.hpp
@@ -9,53 +9,42 @@
 #include <cstdint>
 #include <string>
 
-enum class TensixSoftResetOptions: std::uint32_t {
+enum class TensixSoftResetOptions : std::uint32_t {
     NONE = 0,
-    BRISC = ((std::uint32_t) 1 << 11),
-    TRISC0 = ((std::uint32_t) 1 << 12),
-    TRISC1 = ((std::uint32_t) 1 << 13),
-    TRISC2 = ((std::uint32_t) 1 << 14),
-    NCRISC = ((std::uint32_t) 1 << 18),
-    STAGGERED_START = ((std::uint32_t) 1 << 31)
+    BRISC = ((std::uint32_t)1 << 11),
+    TRISC0 = ((std::uint32_t)1 << 12),
+    TRISC1 = ((std::uint32_t)1 << 13),
+    TRISC2 = ((std::uint32_t)1 << 14),
+    NCRISC = ((std::uint32_t)1 << 18),
+    STAGGERED_START = ((std::uint32_t)1 << 31)
 };
 
 std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value);
+
 constexpr TensixSoftResetOptions operator|(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) {
-    return static_cast<TensixSoftResetOptions>(
-        static_cast<uint32_t>(lhs) |
-        static_cast<uint32_t>(rhs)
-    );
+    return static_cast<TensixSoftResetOptions>(static_cast<uint32_t>(lhs) | static_cast<uint32_t>(rhs));
 }
 
 constexpr TensixSoftResetOptions operator&(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) {
-    return static_cast<TensixSoftResetOptions>(
-        static_cast<uint32_t>(lhs) &
-        static_cast<uint32_t>(rhs)
-    );
+    return static_cast<TensixSoftResetOptions>(static_cast<uint32_t>(lhs) & static_cast<uint32_t>(rhs));
 }
 
 constexpr bool operator!=(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) {
-    return
-        static_cast<uint32_t>(lhs) !=
-        static_cast<uint32_t>(rhs);
+    return static_cast<uint32_t>(lhs) != static_cast<uint32_t>(rhs);
 }
 
-static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = TensixSoftResetOptions::TRISC0 |
-                                                           TensixSoftResetOptions::TRISC1 |
-                                                           TensixSoftResetOptions::TRISC2;
+static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET =
+    TensixSoftResetOptions::TRISC0 | TensixSoftResetOptions::TRISC1 | TensixSoftResetOptions::TRISC2;
 
-static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = TensixSoftResetOptions::BRISC |
-                                                            TensixSoftResetOptions::NCRISC |
-                                                            TensixSoftResetOptions::STAGGERED_START |
-                                                            ALL_TRISC_SOFT_RESET;
+static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET =
+    TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | TensixSoftResetOptions::STAGGERED_START |
+    ALL_TRISC_SOFT_RESET;
 
-static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = TensixSoftResetOptions::BRISC |
-                                                               TensixSoftResetOptions::NCRISC |
-                                                               ALL_TRISC_SOFT_RESET;
+static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET =
+    TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET;
 
-static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = TensixSoftResetOptions::NCRISC |
-                                                                 ALL_TRISC_SOFT_RESET |
-                                                                 TensixSoftResetOptions::STAGGERED_START;
+static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET =
+    TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET | TensixSoftResetOptions::STAGGERED_START;
 
-static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = TensixSoftResetOptions::NCRISC |
-                                                                                 ALL_TRISC_SOFT_RESET;
+static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER =
+    TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET;
diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp
index 9a572420..ccf6dc02 100644
--- a/device/tt_soc_descriptor.cpp
+++ b/device/tt_soc_descriptor.cpp
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include "yaml-cpp/yaml.h"
 #include "tt_soc_descriptor.h"
 
 #include <assert.h>
+
 #include <fstream>
 #include <iostream>
 #include <regex>
@@ -13,53 +13,53 @@
 #include <unordered_set>
 
 #include "fmt/core.h"
+#include "yaml-cpp/yaml.h"
 
 // #include "l1_address_map.h"
 
 std::string format_node(tt_xy_pair xy) { return fmt::format("{}-{}", xy.x, xy.y); }
 
 tt_xy_pair format_node(std::string str) {
-  int x_coord;
-  int y_coord;
-  std::regex expr("([0-9]+)[-,xX]([0-9]+)");
-  std::smatch x_y_pair;
-
-  if (std::regex_search(str, x_y_pair, expr)) {
-    x_coord = std::stoi(x_y_pair[1]);
-    y_coord = std::stoi(x_y_pair[2]);
-  } else {
-    throw std::runtime_error(fmt::format("Could not parse the core id: {}", str));
-  }
+    int x_coord;
+    int y_coord;
+    std::regex expr("([0-9]+)[-,xX]([0-9]+)");
+    std::smatch x_y_pair;
+
+    if (std::regex_search(str, x_y_pair, expr)) {
+        x_coord = std::stoi(x_y_pair[1]);
+        y_coord = std::stoi(x_y_pair[2]);
+    } else {
+        throw std::runtime_error(fmt::format("Could not parse the core id: {}", str));
+    }
 
-  tt_xy_pair xy(x_coord, y_coord);
+    tt_xy_pair xy(x_coord, y_coord);
 
-  return xy;
+    return xy;
 }
-const char* ws = " \t\n\r\f\v";
+
+const char *ws = " \t\n\r\f\v";
 
 // trim from end of string (right)
-inline std::string& rtrim(std::string& s, const char* t = ws)
-{
+inline std::string &rtrim(std::string &s, const char *t = ws) {
     s.erase(s.find_last_not_of(t) + 1);
     return s;
 }
 
 // trim from beginning of string (left)
-inline std::string& ltrim(std::string& s, const char* t = ws)
-{
+inline std::string &ltrim(std::string &s, const char *t = ws) {
     s.erase(0, s.find_first_not_of(t));
     return s;
 }
 
 // trim from both ends of string (right then left)
-inline std::string& trim(std::string& s, const char* t = ws)
-{
-    return ltrim(rtrim(s, t), t);
-}
+inline std::string &trim(std::string &s, const char *t = ws) { return ltrim(rtrim(s, t), t); }
 
 void tt_SocDescriptor::load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml) {
     overlay_version = device_descriptor_yaml["features"]["overlay"]["version"].as<int>();
-    noc_translation_id_enabled = device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as<bool>() : false;
+    noc_translation_id_enabled =
+        device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"]
+            ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as<bool>()
+            : false;
     packer_version = device_descriptor_yaml["features"]["packer"]["version"].as<int>();
     unpacker_version = device_descriptor_yaml["features"]["unpacker"]["version"].as<int>();
     dst_size_alignment = device_descriptor_yaml["features"]["math"]["dst_size_alignment"].as<int>();
@@ -90,7 +90,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node &
     }
 
     int current_dram_channel = 0;
-    for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); ++channel_it) {
+    for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end();
+         ++channel_it) {
         dram_cores.push_back({});
         auto &soc_dram_cores = dram_cores.at(dram_cores.size() - 1);
         const auto &dram_cores = (*channel_it).as<std::vector<std::string>>();
@@ -121,8 +122,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node &
     std::vector<std::string> worker_cores = device_descriptor_yaml["functional_workers"].as<std::vector<std::string>>();
     std::set<int> worker_routing_coords_x;
     std::set<int> worker_routing_coords_y;
-    std::unordered_map<int,int> routing_coord_worker_x;
-    std::unordered_map<int,int> routing_coord_worker_y;
+    std::unordered_map<int, int> routing_coord_worker_x;
+    std::unordered_map<int, int> routing_coord_worker_y;
     for (const auto &core_string : worker_cores) {
         CoreDescriptor core_descriptor;
         core_descriptor.coord = format_node(core_string);
@@ -137,12 +138,12 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node &
     int func_x_start = 0;
     int func_y_start = 0;
     std::set<int>::iterator it;
-    for (it=worker_routing_coords_x.begin(); it!=worker_routing_coords_x.end(); ++it) {
+    for (it = worker_routing_coords_x.begin(); it != worker_routing_coords_x.end(); ++it) {
         worker_log_to_routing_x[func_x_start] = *it;
         routing_x_to_worker_x[*it] = func_x_start;
         func_x_start++;
     }
-    for (it=worker_routing_coords_y.begin(); it!=worker_routing_coords_y.end(); ++it) {
+    for (it = worker_routing_coords_y.begin(); it != worker_routing_coords_y.end(); ++it) {
         worker_log_to_routing_y[func_y_start] = *it;
         routing_y_to_worker_y[*it] = func_y_start;
         func_y_start++;
@@ -225,7 +226,8 @@ tt_virtual_coords tt_SocDescriptor::to_virtual_coords(tt_translated_coords trans
 tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask) {
     std::ifstream fdesc(device_descriptor_path);
     if (fdesc.fail()) {
-        throw std::runtime_error(fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path));
+        throw std::runtime_error(
+            fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path));
     }
     fdesc.close();
 
@@ -233,10 +235,12 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size
 
     auto grid_size_x = device_descriptor_yaml["grid"]["x_size"].as<int>();
     auto grid_size_y = device_descriptor_yaml["grid"]["y_size"].as<int>();
-    int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] ?
-                                device_descriptor_yaml["physical"]["x_size"].as<int>() : grid_size_x;
-    int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] ?
-                                device_descriptor_yaml["physical"]["y_size"].as<int>() : grid_size_y;
+    int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"]
+                                   ? device_descriptor_yaml["physical"]["x_size"].as<int>()
+                                   : grid_size_x;
+    int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"]
+                                   ? device_descriptor_yaml["physical"]["y_size"].as<int>()
+                                   : grid_size_y;
     load_core_descriptors_from_device_descriptor(device_descriptor_yaml);
     grid_size = tt_xy_pair(grid_size_x, grid_size_y);
     physical_grid_size = tt_xy_pair(physical_grid_size_x, physical_grid_size_y);
@@ -251,7 +255,7 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size
 
 int tt_SocDescriptor::get_num_dram_channels() const {
     int num_channels = 0;
-    for (auto& dram_core : dram_cores) {
+    for (auto &dram_core : dram_cores) {
         if (dram_core.size() > 0) {
             num_channels++;
         }
@@ -281,7 +285,7 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) {
     } else if (arch_name == tt::ARCH::WORMHOLE_B0) {
         out << "wormhole_b0";
     } else if (arch_name == tt::ARCH::BLACKHOLE) {
-        out << "blackhole"; //Just how many ARCH-to-string functions do we plan to have, anyway?
+        out << "blackhole";  // Just how many ARCH-to-string functions do we plan to have, anyway?
     } else {
         out << "ArchNameSerializationNotImplemented";
     }
diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h
index 372d0a29..de1511bf 100644
--- a/device/tt_soc_descriptor.h
+++ b/device/tt_soc_descriptor.h
@@ -7,29 +7,25 @@
 #pragma once
 
 #include <cstddef>
-#include <string>
+#include <cstdint>
+#include <iostream>
 #include <map>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
-#include <iostream>
-#include <string>
-#include <cstdint>
-
-#include "tt_xy_pair.h"
-#include "device/tt_arch_types.h"
-
 #include "device/coordinate_manager.h"
-
+#include "device/tt_arch_types.h"
 #include "fmt/core.h"
+#include "tt_xy_pair.h"
 
 namespace YAML {
-    class Node;
+class Node;
 }
 
 std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name);
 
-static inline std::string get_arch_str(const tt::ARCH arch_name){
+static inline std::string get_arch_str(const tt::ARCH arch_name) {
     std::string arch_name_str;
 
     if (arch_name == tt::ARCH::GRAYSKULL) {
@@ -45,16 +41,18 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){
     return arch_name_str;
 }
 
-static inline tt::ARCH get_arch_name(const std::string &arch_str){
+static inline tt::ARCH get_arch_name(const std::string &arch_str) {
     tt::ARCH arch;
 
     if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) {
         arch = tt::ARCH::GRAYSKULL;
-    } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){
+    } else if (
+        (arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") ||
+        (arch_str == "WORMHOLE_B0")) {
         arch = tt::ARCH::WORMHOLE_B0;
-    } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")){
+    } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) {
         arch = tt::ARCH::BLACKHOLE;
-    }else {
+    } else {
         throw std::runtime_error(
             fmt::format("At LoadSocDescriptorFromYaml: \"{}\" is not recognized as tt::ARCH.", arch_str));
     }
@@ -69,13 +67,13 @@ tt_xy_pair format_node(std::string str);
 //! SocCore type enumerations
 /*! Superset for all chip generations */
 enum class CoreType {
-  ARC,
-  DRAM,
-  ETH,
-  PCIE,
-  WORKER,
-  HARVESTED,
-  ROUTER_ONLY,
+    ARC,
+    DRAM,
+    ETH,
+    PCIE,
+    WORKER,
+    HARVESTED,
+    ROUTER_ONLY,
 
 };
 
@@ -84,10 +82,10 @@ enum class CoreType {
     Should only contain relevant configuration for SOC
 */
 struct CoreDescriptor {
-  tt_xy_pair coord = tt_xy_pair(0, 0);
-  CoreType type;
+    tt_xy_pair coord = tt_xy_pair(0, 0);
+    CoreType type;
 
-  std::size_t l1_size = 0;
+    std::size_t l1_size = 0;
 };
 
 //! tt_SocDescriptor contains information regarding the SOC configuration targetted.
@@ -95,7 +93,6 @@ struct CoreDescriptor {
     Should only contain relevant configuration for SOC
 */
 class tt_SocDescriptor {
-
 public:
     tt::ARCH arch;
     tt_xy_pair grid_size;
@@ -110,13 +107,15 @@ class tt_SocDescriptor {
     std::unordered_map<int, int> worker_log_to_routing_y;
     std::unordered_map<int, int> routing_x_to_worker_x;
     std::unordered_map<int, int> routing_y_to_worker_y;
-    std::vector<std::vector<tt_xy_pair>> dram_cores;  // per channel list of dram cores
+    std::vector<std::vector<tt_xy_pair>> dram_cores;                             // per channel list of dram cores
     std::unordered_map<tt_xy_pair, std::tuple<int, int>> dram_core_channel_map;  // map dram core to chan/subchan
-    std::vector<tt_xy_pair> ethernet_cores;  // ethernet cores (index == channel id)
-    std::unordered_map<tt_xy_pair,int> ethernet_core_channel_map;
+    std::vector<tt_xy_pair> ethernet_cores;                                      // ethernet cores (index == channel id)
+    std::unordered_map<tt_xy_pair, int> ethernet_core_channel_map;
     std::vector<std::size_t> trisc_sizes;  // Most of software stack assumes same trisc size for whole chip..
     std::string device_descriptor_file_path = std::string("");
+
     bool has(tt_xy_pair input) { return cores.find(input) != cores.end(); }
+
     int overlay_version;
     int unpacker_version;
     int dst_size_alignment;
@@ -129,15 +128,15 @@ class tt_SocDescriptor {
     int get_num_dram_channels() const;
     bool is_worker_core(const tt_xy_pair &core) const;
     tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const;
-    bool is_ethernet_core(const tt_xy_pair& core) const;
+    bool is_ethernet_core(const tt_xy_pair &core) const;
 
     // Default constructor. Creates uninitialized object with public access to all of its attributes.
     tt_SocDescriptor() = default;
-    // Constructor used to build object from device descriptor file.    
+    // Constructor used to build object from device descriptor file.
     tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask = 0);
 
     // Copy constructor
-    tt_SocDescriptor(const tt_SocDescriptor& other) :
+    tt_SocDescriptor(const tt_SocDescriptor &other) :
         arch(other.arch),
         grid_size(other.grid_size),
         physical_grid_size(other.physical_grid_size),
@@ -167,7 +166,7 @@ class tt_SocDescriptor {
         dram_bank_size(other.dram_bank_size) {
         coordinate_manager.reset(new CoordinateManager(*other.coordinate_manager));
     }
-    
+
     // Coordinate conversions.
 
     // Conversions from logical coordinates should be used just for worker cores.
diff --git a/device/tt_xy_pair.h b/device/tt_xy_pair.h
index 052b6130..74b54a23 100644
--- a/device/tt_xy_pair.h
+++ b/device/tt_xy_pair.h
@@ -15,44 +15,56 @@ using tt_cxy_pair = tt::umd::cxy_pair;
 
 struct tt_physical_coords : public tt_xy_pair {
     tt_physical_coords() : tt_xy_pair() {}
+
     tt_physical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {}
 };
 
 struct tt_chip_physical_coords : public tt_cxy_pair {
     tt_chip_physical_coords() : tt_cxy_pair() {}
+
     tt_chip_physical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {}
+
     tt_chip_physical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {}
 };
 
 struct tt_logical_coords : public tt_xy_pair {
     tt_logical_coords() : tt_xy_pair() {}
+
     tt_logical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {}
 };
 
 struct tt_chip_logical_coords : public tt_cxy_pair {
     tt_chip_logical_coords() : tt_cxy_pair() {}
+
     tt_chip_logical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {}
+
     tt_chip_logical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {}
 };
 
 struct tt_virtual_coords : public tt_xy_pair {
     tt_virtual_coords() : tt_xy_pair() {}
+
     tt_virtual_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {}
 };
 
 struct tt_chip_virtual_coords : public tt_cxy_pair {
     tt_chip_virtual_coords() : tt_cxy_pair() {}
+
     tt_chip_virtual_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {}
+
     tt_chip_virtual_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {}
 };
 
 struct tt_translated_coords : public tt_xy_pair {
     tt_translated_coords() : tt_xy_pair() {}
+
     tt_translated_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {}
 };
 
 struct tt_chip_translated_coords : public tt_cxy_pair {
     tt_chip_translated_coords() : tt_cxy_pair() {}
+
     tt_chip_translated_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {}
+
     tt_chip_translated_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {}
 };
\ No newline at end of file
diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp
index eccc0a70..a7b60368 100644
--- a/device/wormhole/wormhole_coordinate_manager.cpp
+++ b/device/wormhole/wormhole_coordinate_manager.cpp
@@ -14,9 +14,11 @@ std::set<std::size_t> WormholeCoordinateManager::get_y_coordinates_to_harvest(st
 }
 
 tt_translated_coords WormholeCoordinateManager::to_translated_coords(tt_logical_coords logical_coords) {
-    return tt_translated_coords(logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y);
+    return tt_translated_coords(
+        logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y);
 }
 
 tt_logical_coords WormholeCoordinateManager::to_logical_coords(tt_translated_coords translated_coords) {
-    return tt_logical_coords(translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y);
+    return tt_logical_coords(
+        translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y);
 }
\ No newline at end of file
diff --git a/device/wormhole/wormhole_coordinate_manager.h b/device/wormhole/wormhole_coordinate_manager.h
index 9eca9fd1..e3e35886 100644
--- a/device/wormhole/wormhole_coordinate_manager.h
+++ b/device/wormhole/wormhole_coordinate_manager.h
@@ -9,16 +9,16 @@
 #include "device/coordinate_manager.h"
 
 class WormholeCoordinateManager : public CoordinateManager {
-
 public:
-    WormholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask)
-        : CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
+    WormholeCoordinateManager(
+        const tt_xy_pair& worker_grid_size, const std::vector<tt_xy_pair>& workers, std::size_t harvesting_mask) :
+        CoordinateManager(worker_grid_size, workers, harvesting_mask) {}
 
     tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override;
 
     tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override;
 
-protected: 
+protected:
     std::set<std::size_t> get_y_coordinates_to_harvest(std::size_t harvesting_mask) override;
 
 private:
diff --git a/device/wormhole/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp
index 0dc9a205..25a304a3 100644
--- a/device/wormhole/wormhole_implementation.cpp
+++ b/device/wormhole/wormhole_implementation.cpp
@@ -4,9 +4,8 @@
 
 #include "wormhole_implementation.h"
 
-#include "src/firmware/riscv/wormhole/host_mem_address_map.h"
-
 #include "device/tt_device.h"
+#include "src/firmware/riscv/wormhole/host_mem_address_map.h"
 
 namespace tt::umd {
 
@@ -94,7 +93,9 @@ std::pair<std::uint64_t, std::uint64_t> wormhole_implementation::get_tlb_data(
 }
 
 tt_driver_host_address_params wormhole_implementation::get_host_address_params() const {
-    return {::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
+    return {
+        ::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE,
+        ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START};
 }
 
 }  // namespace tt::umd
diff --git a/device/wormhole/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h
index f6214d06..e16205f0 100644
--- a/device/wormhole/wormhole_implementation.h
+++ b/device/wormhole/wormhole_implementation.h
@@ -167,7 +167,8 @@ static constexpr uint32_t TLB_BASE_INDEX_16M = TLB_BASE_INDEX_2M + TLB_COUNT_2M;
 static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
 
 static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024;
-static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES);
+static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR =
+    STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES);
 static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M;
 
 static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024;
@@ -205,59 +206,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;
 }  // namespace wormhole
 
 class wormhole_implementation : public architecture_implementation {
-   public:
+public:
     tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; }
+
     uint32_t get_arc_message_arc_get_harvesting() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::ARC_GET_HARVESTING);
     }
+
     uint32_t get_arc_message_arc_go_busy() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::ARC_GO_BUSY);
     }
+
     uint32_t get_arc_message_arc_go_long_idle() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::ARC_GO_LONG_IDLE);
     }
+
     uint32_t get_arc_message_arc_go_short_idle() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::ARC_GO_SHORT_IDLE);
     }
+
     uint32_t get_arc_message_deassert_riscv_reset() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::DEASSERT_RISCV_RESET);
     }
+
     uint32_t get_arc_message_get_aiclk() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::GET_AICLK);
     }
+
     uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override {
         return static_cast<uint32_t>(wormhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER);
     }
+
     uint32_t get_arc_message_test() const override { return static_cast<uint32_t>(wormhole::arc_message_type::TEST); }
+
     uint32_t get_arc_csm_mailbox_offset() const override { return wormhole::ARC_CSM_MAILBOX_OFFSET; }
+
     uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return wormhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; }
+
     uint32_t get_arc_reset_scratch_offset() const override { return wormhole::ARC_RESET_SCRATCH_OFFSET; }
+
     uint32_t get_dram_channel_0_peer2peer_region_start() const override {
         return wormhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START;
     }
+
     uint32_t get_dram_channel_0_x() const override { return wormhole::DRAM_CHANNEL_0_X; }
+
     uint32_t get_dram_channel_0_y() const override { return wormhole::DRAM_CHANNEL_0_Y; }
+
     uint32_t get_broadcast_tlb_index() const override { return wormhole::BROADCAST_TLB_INDEX; }
+
     uint32_t get_dynamic_tlb_2m_base() const override { return wormhole::DYNAMIC_TLB_2M_BASE; }
+
     uint32_t get_dynamic_tlb_2m_size() const override { return wormhole::DYNAMIC_TLB_2M_SIZE; }
+
     uint32_t get_dynamic_tlb_16m_base() const override { return wormhole::DYNAMIC_TLB_16M_BASE; }
+
     uint32_t get_dynamic_tlb_16m_size() const override { return wormhole::DYNAMIC_TLB_16M_SIZE; }
+
     uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return wormhole::DYNAMIC_TLB_16M_CFG_ADDR; }
+
     uint32_t get_mem_large_read_tlb() const override { return wormhole::MEM_LARGE_READ_TLB; }
+
     uint32_t get_mem_large_write_tlb() const override { return wormhole::MEM_LARGE_WRITE_TLB; }
+
     uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; }
+
     uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; }
+
     uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; }
+
     uint32_t get_tlb_base_index_16m() const override { return wormhole::TLB_BASE_INDEX_16M; }
+
     uint32_t get_tensix_soft_reset_addr() const override { return wormhole::TENSIX_SOFT_RESET_ADDR; }
+
     uint32_t get_grid_size_x() const override { return wormhole::GRID_SIZE_X; }
+
     uint32_t get_grid_size_y() const override { return wormhole::GRID_SIZE_Y; }
+
     uint32_t get_tlb_cfg_reg_size_bytes() const override { return wormhole::TLB_CFG_REG_SIZE_BYTES; }
+
     uint32_t get_small_read_write_tlb() const override { return wormhole::MEM_SMALL_READ_WRITE_TLB; }
+
     const std::vector<uint32_t>& get_harvesting_noc_locations() const override {
         return wormhole::HARVESTING_NOC_LOCATIONS;
     }
+
     const std::vector<uint32_t>& get_t6_x_locations() const override { return wormhole::T6_X_LOCATIONS; }
+
     const std::vector<uint32_t>& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; }
 
     std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
@@ -266,7 +301,6 @@ class wormhole_implementation : public architecture_implementation {
     std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
 
     tt_driver_host_address_params get_host_address_params() const override;
-
 };
 
 }  // namespace tt::umd
diff --git a/device/xy_pair.h b/device/xy_pair.h
index 92d7c639..84884590 100644
--- a/device/xy_pair.h
+++ b/device/xy_pair.h
@@ -14,6 +14,7 @@ namespace tt::umd {
 
 struct xy_pair {
     constexpr xy_pair() : x{}, y{} {}
+
     constexpr xy_pair(std::size_t x, std::size_t y) : x(x), y(y) {}
 
     std::size_t x;
@@ -32,14 +33,14 @@ constexpr inline bool operator<(const xy_pair &left, const xy_pair &right) {
 
 struct cxy_pair : public xy_pair {
     cxy_pair() : xy_pair{}, chip{} {}
+
     cxy_pair(std::size_t ichip, xy_pair pair) : xy_pair(pair.x, pair.y), chip(ichip) {}
+
     cxy_pair(std::size_t ichip, std::size_t x, std::size_t y) : xy_pair(x, y), chip(ichip) {}
 
     std::size_t chip;
 
-    std::string str() const {
-        return fmt::format("(chip={},x={},y={})", chip, x, y);
-    }
+    std::string str() const { return fmt::format("(chip={},x={},y={})", chip, x, y); }
 };
 
 constexpr inline bool operator==(const cxy_pair &a, const cxy_pair &b) {