From af831359ccc9afd2523128b570a828d7259cf549 Mon Sep 17 00:00:00 2001
From: Stanislav Minakov
Date: Thu, 24 Oct 2024 09:39:56 -0700
Subject: [PATCH] #0: Unify `tt_metal::Buffer` and `ttnn::Buffer`, add support for 0-volume buffers (#14023)

* #0: Buffer refactoring wip

* #0: Buffer threading change

* #0: Buffer fixup

* #0: Buffer fixup

* #0: Add experimental asserts

* #0: Refactor

* #0: Test fixup

* #0: Review fixes

* #0: Review fixes

* #0: Review fixes

* #0: Handle 0 page size

* #0: Add a test for creating/reading/writing 0-sized tensor

* #0: Add support for more 0-sized shapes

* #0: Review fixes

* #0: Refactor allocation/deallocation threading to allow getting buffer address from another thread

* #0: Treat empty shape as scalar

* #0: Fix trailing comma

* #0: Add missing include

* #0: Allow 0 page size

* #0: Rewrite threading, no mutexes

* #0: Assert fix

* #0: Fixup

* #0: Remove unneeded include

* #0: Cleanup, fix typo

* #0: Cleanup

* #0: Trying to improve perf

* #0: Trying to improve perf

* #0: Build fix

* #0: Simpler threading, get rid of canceling allocation feature

* #0: Fixup

* #0: Avoid using std::atomic::wait, it seems to have perf implications

---------

Co-authored-by: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com>
---
 .../tensors/test_async_tensor_apis.cpp            |  46 ++-
 .../2_noc_rtor/test_noc_rtor.cpp                  |   4 +-
 .../3_pcie_transfer/test_rw_buffer.cpp            |   8 +-
 .../6_dram_offchip/test_dram_offchip.cpp          |   8 +-
 .../test_dram_read.cpp                            |   8 +-
 .../test_dram_read_l1_write.cpp                   |   8 +-
 .../old/noc/test_noc_read_global_l1.cpp           |  10 +-
 .../allocator/test_l1_banking_allocator.cpp       |   4 +-
 .../matmul/test_matmul_multi_core_X_dram.cpp      |   6 +-
 .../command_queue_test_utils.hpp                  |   6 +-
 .../command_queue/test_EnqueueProgram.cpp         |   4 +-
 .../command_queue/test_EnqueueTrace.cpp           |  76 ++--
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp     | 104 ++---
 .../command_queue/test_HostAsyncCQ.cpp            |  35 +-
 .../command_queue/test_events.cpp                 |   4 +-
 .../command_queue/test_EnqueueTrace.cpp           |  46 +--
 .../test_EnqueueWaitForEvent.cpp                  |   6 +-
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp     |   4 +-
 .../gtests/tensor/test_create_tensor.cpp          |   9 +-
 .../unit_tests/gtests/test_async_runtime.cpp      |  12 +-
 .../unit_tests/gtests/test_ccl_on_galaxy.cpp      |   4 +-
 .../gtests/test_multi_cq_multi_dev.cpp            |   6 +-
 .../gtests/test_multiprod_queue.cpp               |   6 +-
 tt_metal/detail/tt_metal.hpp                      |   2 +-
 tt_metal/detail/util.hpp                          |   6 +-
 tt_metal/graph/graph_tracking.cpp                 |   4 +-
 tt_metal/graph/graph_tracking.hpp                 |   8 +-
 tt_metal/impl/buffers/buffer.cpp                  | 358 +++++++++++-------
 tt_metal/impl/buffers/buffer.hpp                  | 181 ++++-----
 tt_metal/impl/device/device.cpp                   |   8 +-
 tt_metal/impl/device/device.hpp                   |   7 +-
 tt_metal/impl/dispatch/command_queue.cpp          |  70 +---
 tt_metal/impl/dispatch/command_queue.hpp          |   9 +-
 tt_metal/impl/dispatch/lock_free_queue.hpp        |   6 +-
 tt_metal/impl/dispatch/work_executor.hpp          |  38 +-
 tt_metal/impl/program/program.cpp                 |   2 +-
 tt_metal/impl/trace/trace.cpp                     |   2 +-
 tt_metal/tt_metal.cpp                             |  61 +--
 ttnn/cpp/ttnn/async_runtime.cpp                   |  61 ---
 ttnn/cpp/ttnn/async_runtime.hpp                   |   3 -
 ttnn/cpp/ttnn/graph/graph_processor.cpp           |   6 +-
 ttnn/cpp/ttnn/graph/graph_processor.hpp           |   6 +-
 .../ccl/ccl_host_datastructures.cpp               |   2 +-
 .../pad/device/pad_program_factory.cpp            |   6 +-
 ttnn/cpp/ttnn/run_operation_inl.hpp               |   4 +-
 ttnn/cpp/ttnn/tensor/tensor.cpp                   |   8 +-
 ttnn/cpp/ttnn/tensor/tensor_impl.cpp              |  11 +-
 ttnn/cpp/ttnn/tensor/tensor_ops.cpp               |   3 +-
 ttnn/cpp/ttnn/tensor/types.hpp                    |   4 +-
 ttnn/cpp/ttnn/types.hpp                           |  72 +---
 50 files changed, 593 insertions(+), 779 deletions(-)

diff --git
a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index 4803efbf2f1..488d4178edd 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp @@ -25,6 +25,18 @@ using namespace tt; using namespace tt_metal; using namespace constants; +namespace { +uint32_t get_device_buffer_address(const Tensor& tensor) { + TT_FATAL(std::holds_alternative(tensor.get_storage()), "Tensor storage is not DeviceStorage"); + auto buffer = std::get(tensor.get_storage()).buffer; + uint32_t result = 0; + buffer->device()->push_work([&]() { + result = buffer->address(); + }, true); + return result; +} +} + TEST_F(CommonFixture, TestTensorOwnershipSanity) { // Sanity test tensor read, write and update paths with synchronous // Ensure that tensor data is copied and owned as expected @@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) { Tensor output_tensor_host = output_tensor_device_2.cpu(); // Test tensor deallocation in async mode: deallocate tensors after using them if (i == 0) { - input_a_addr = std::get(input_tensor_a.get_storage()).buffer->address(); - input_b_addr = std::get(input_tensor_b.get_storage()).buffer->address(); - input_c_addr = std::get(input_tensor_c.get_storage()).buffer->address(); - output_1_addr = std::get(output_tensor_device.get_storage()).buffer->address(); - output_2_addr = std::get(output_tensor_device_2.get_storage()).buffer->address(); + input_a_addr = get_device_buffer_address(input_tensor_a); + input_b_addr = get_device_buffer_address(input_tensor_b); + input_c_addr = get_device_buffer_address(input_tensor_c); + output_1_addr = get_device_buffer_address(output_tensor_device); + output_2_addr = get_device_buffer_address(output_tensor_device_2); } else { - EXPECT_EQ(std::get(input_tensor_a.get_storage()).buffer->address(), input_a_addr); - EXPECT_EQ(std::get(input_tensor_b.get_storage()).buffer->address(), input_b_addr); - EXPECT_EQ(std::get(input_tensor_c.get_storage()).buffer->address(), input_c_addr); - EXPECT_EQ(std::get(output_tensor_device.get_storage()).buffer->address(), output_1_addr); - EXPECT_EQ(std::get(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr); + EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr); + EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr); } input_tensor_a.deallocate(); input_tensor_b.deallocate(); @@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); Tensor tensor2 = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address(); + uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2); // Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2 // deallocated tensor2 = tensor1; @@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { // prev addr for tensor2 Tensor tensor3 = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - 
EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr); - EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address()); + EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr); + EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2)); } log_info(LogTest, "Testing Device tensor self-assignment through function"); for (int i = 0; i < 5; i++) { Tensor device_tensor = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - uint32_t device_tensor_address = device_tensor.device_buffer()->address(); + uint32_t device_tensor_address = get_device_buffer_address(device_tensor); // This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor // Ensure ref count and address remain unchanged device_tensor = tensor_identity_copy_function(device_tensor); EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1); - EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address); + EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address); } log_info(LogTest, "Testing Device tensor move assignment"); @@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { log_info(LogTest, "Testing Device tensor self-assignment"); Tensor tensor_to_self_assign = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(0), DataType::BFLOAT16).to(device); - uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address(); + uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign); tensor_to_self_assign = tensor_to_self_assign; EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1); tensor_to_self_assign = std::move(tensor_to_self_assign); - EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address); + EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address); auto barrier_tensor = tensor_to_self_assign.cpu(); device->enable_async(false); } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp index a8f8f6773a6..3e58551a329 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp @@ -126,7 +126,7 @@ int main(int argc, char** argv) { // limit size of the L1 buffer to do not exceed global L1 size uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 
256 : num_tiles) * page_size; - auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1); + auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1); //////////////////////////////////////////////////////////////////////////// // Application Setup @@ -166,7 +166,7 @@ int main(int argc, char** argv) { for (int j = 0; j < num_cores_c; j++) { CoreCoord core = {(std::size_t)j, (std::size_t)i}; uint32_t core_index = i * num_cores_c + j; - uint32_t l1_buffer_addr = l1_buffer.address(); + uint32_t l1_buffer_addr = l1_buffer->address(); const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c}; SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp index 7f495e0729c..044d0888915 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp @@ -77,14 +77,14 @@ int main(int argc, char** argv) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); } - TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size); + TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size); // Device setup int device_id = 0; tt_metal::Device* device = tt_metal::CreateDevice(device_id); // Application setup - auto buffer = tt_metal::Buffer( + auto buffer = tt_metal::Buffer::create( device, transfer_size, page_size, buffer_type == 0 ? 
tt_metal::BufferType::DRAM : tt_metal::BufferType::L1); std::vector src_vec = create_random_vector_of_bfloat16( @@ -104,7 +104,7 @@ int main(int argc, char** argv) { // Execute application if (!skip_write) { auto t_begin = std::chrono::steady_clock::now(); - EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false); + EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false); Finish(device->command_queue()); auto t_end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(t_end - t_begin).count(); @@ -119,7 +119,7 @@ int main(int argc, char** argv) { if (!skip_read) { auto t_begin = std::chrono::steady_clock::now(); - EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true); + EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true); auto t_end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(t_end - t_begin).count(); d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0)); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp index 56491f550e4..8efa155c9ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp @@ -192,7 +192,7 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// std::vector input_vec = create_random_vector_of_bfloat16( input_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - tt_metal::Buffer input_buffer( + auto input_buffer = Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// @@ -212,7 +212,7 @@ int main(int argc, char **argv) { num_tiles_per_core_group_1, num_tiles_per_core_group_2, kernel, - input_buffer.address(), + input_buffer->address(), num_reqs_at_a_time, single_tile_size, tile_format); @@ -221,7 +221,7 @@ int main(int argc, char **argv) { // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// if (access_type == 0) { - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); } else { for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) { CoreCoord core = {i / num_cores_y, i % num_cores_y}; @@ -276,7 +276,7 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, num_cores_y, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index 773380ebee9..829fc585e20 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -673,18 +673,18 @@ int main(int argc, char **argv) { input_size, 100, 1234); } - tt_metal::Buffer input_buffer( + auto input_buffer = tt_metal::Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// // Application 
Setup //////////////////////////////////////////////////////////////////////////// - auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address()); + auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address()); //////////////////////////////////////////////////////////////////////////// // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -713,7 +713,7 @@ int main(int argc, char **argv) { pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, all_cores_list, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 814b28abe02..4587fa9f969 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -869,18 +869,18 @@ int main(int argc, char **argv) { input_size, 100, 1234); } - tt_metal::Buffer input_buffer( + auto input_buffer = tt_metal::Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address()); + auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address()); //////////////////////////////////////////////////////////////////////////// // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -909,7 +909,7 @@ int main(int argc, char **argv) { pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, all_l1_writer_cores_ordered, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index f276c67030b..23107937bb9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -189,7 +189,7 @@ int main(int argc, char **argv) { activations_addr, activations_addr / 1024, Nt); - std::vector l1_buffers; + std::vector> l1_buffers; int l1_buffers_size = 1; if (!(single_read || one_buffer_share)) { @@ -199,8 +199,8 @@ int main(int argc, char **argv) { l1_buffers.reserve(l1_buffers_size); for (int r = 0; r < num_cores_r; ++r) { for (int c = 0; c < num_cores_c; ++c) { - l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1); - tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]); + l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1)); + tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]); if (single_read || one_buffer_share) break; @@ -213,7 +213,7 @@ int main(int argc, char **argv) { for (int r = 0; r < num_cores_r; ++r) { for (int c = 0; c < num_cores_c; ++c) { std::vector result_vec; - tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec); + tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec); auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec); if (print_tensor) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { CoreCoord core = {(size_t)c, (size_t)r}; int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c); - auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address(); + auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address(); uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0); diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp index 23f80204fa7..6c68aeeeae7 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp @@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) { uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device); - std::vector> buffers; + std::vector> buffers; int alloc_size_idx = 0; uint32_t total_buffer_size = 0; while (total_size_bytes < alloc_limit) { @@ -44,7 +44,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) { if (total_buffer_size + buffer_size >= alloc_limit) { break; } - std::unique_ptr buffer = std::make_unique(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1); + auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1); buffers.emplace_back(std::move(buffer)); total_buffer_size += buffer_size; EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp index 02ae4935d8f..2eb2e6975a4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp @@ -430,17 +430,17 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi auto activations_tile_layout = convert_to_tile_layout(activations_tilized); 
auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activation_buffer = std::make_shared(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); + auto activation_buffer = Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); pass &= move_tiles_to_dram(device, activations, M, K, activation_buffer); auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); auto weights_tile_layout = convert_to_tile_layout(identity_tilized); auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); - auto weight_buffer = std::make_shared(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); + auto weight_buffer = Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); pass &= move_tiles_to_dram(device, weights, K, N, weight_buffer); log_debug(LogTest, "Copying inputs to dram complete"); - auto out_buffer = std::make_shared(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM); + auto out_buffer = Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM); uint32_t out_dram_addr = out_buffer->address(); log_debug(LogTest, "Writing kernel runtime args to device"); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp index 3b1a12c88ba..2d3c5a4b6c7 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp @@ -12,15 +12,15 @@ struct TestBufferConfig { tt::tt_metal::BufferType buftype; }; -inline std::pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { +inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { // This function just enqueues a buffer (which should be large in the config) // write as a precursor to testing the wrap mechanism size_t buf_size = config.num_pages * config.page_size; - tt::tt_metal::Buffer buffer(device, buf_size, config.page_size, config.buftype); + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); std::vector src = create_random_vector_of_bfloat16( buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - EnqueueWriteBuffer(cq, buffer, src, false); + EnqueueWriteBuffer(cq, *buffer, src, false); return std::make_pair(std::move(buffer), src); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index 461f07c2825..ec8238158d1 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -481,14 +481,14 @@ bool test_EnqueueWrap_on_EnqueueWriteBuffer(Device* device, CommandQueue& cq, co This just ensures we don't hang on the subsequent EnqueueWriteBuffer */ size_t buf_size = config.num_pages * config.page_size; - Buffer buffer(device, buf_size, config.page_size, config.buftype); + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); vector src(buf_size / 
sizeof(uint32_t), 0); for (uint32_t i = 0; i < src.size(); i++) { src.at(i) = i; } - EnqueueWriteBuffer(cq, buffer, src, false); + EnqueueWriteBuffer(cq, *buffer, src, false); Finish(cq); return true; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp index 5b9428c4ef5..65b6b20d57f 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp @@ -98,13 +98,13 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { Setup(2048); CommandQueue& command_queue = this->device_->command_queue(); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - vector input_data(input.size() / sizeof(uint32_t), 0); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); - auto simple_program = std::make_shared(create_simple_unary_program(input, output)); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto simple_program = std::make_shared(create_simple_unary_program(*input, *output)); EnqueueProgram(command_queue, *simple_program, true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, *simple_program, kNonBlocking); @@ -128,13 +128,13 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { Setup(2048); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -144,24 +144,24 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { vector trace_output_data; trace_output_data.resize(input_data.size()); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, simple_program, true); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, simple_program, false); EndTraceCapture(this->device_, command_queue.id(), tid); // Create and Enqueue a Program with a live trace to ensure that a warning is generated - Buffer input_temp(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output_temp(this->device_, 2048, 2048, BufferType::DRAM); - Program simple_program_temp = create_simple_unary_program(input_temp, output_temp); + auto input_temp = 
Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output_temp = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + Program simple_program_temp = create_simple_unary_program(*input_temp, *output_temp); EnqueueProgram(command_queue, simple_program_temp, true); // Run trace that can clobber the temporary buffers created above EnqueueProgram(command_queue, simple_program, false); EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done @@ -171,12 +171,12 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { Setup(2048); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(); - vector input_data(input.size() / sizeof(uint32_t), 0); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -190,17 +190,17 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { std::shared_ptr simple_program; // EAGER MODE EXECUTION if (has_eager) { - simple_program = std::make_shared(create_simple_unary_program(input, output)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + simple_program = std::make_shared(create_simple_unary_program(*input, *output)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, *simple_program, true); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); } // THIS->DEVICE_ CAPTURE AND REPLAY MODE bool has_trace = false; uint32_t tid = 0; for (int i = 0; i < 1; i++) { - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); if (!has_trace) { // Program must be cached first @@ -211,7 +211,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { } ReplayTrace(this->device_, command_queue.id(), tid, true); - EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); if (has_eager) EXPECT_TRUE(eager_output_data == trace_output_data); } @@ -225,13 +225,13 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { // Get command queue from device for this test, since its running in async mode CommandQueue& command_queue = this->device_->command_queue(); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer interm(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto interm = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - Program op0 = create_simple_unary_program(input, interm); - Program op1 = create_simple_unary_program(interm, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program op0 = 
create_simple_unary_program(*input, *interm); + Program op1 = create_simple_unary_program(*interm, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -252,20 +252,20 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { eager_output_data.resize(input_data.size()); // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, input, input_data.data(), kBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); EnqueueProgram(command_queue, op0, kBlocking); EnqueueProgram(command_queue, op1, kBlocking); - EnqueueReadBuffer(command_queue, output, expected_output_data.data(), kBlocking); + EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); Finish(command_queue); for (bool blocking : blocking_flags) { std::string mode = blocking ? "Eager-B" : "Eager-NB"; for (auto i = 0; i < num_loops; i++) { ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), blocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); EnqueueProgram(command_queue, op0, blocking); EnqueueProgram(command_queue, op1, blocking); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); } if (not blocking) { // (Optional) wait for the last non-blocking command to finish @@ -283,9 +283,9 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { // Trace mode execution for (auto i = 0; i < num_loops; i++) { ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); EnqueueTrace(command_queue, tid, kNonBlocking); - EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); ReleaseTrace(this->device_, tid); @@ -300,8 +300,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { Setup(6144); CommandQueue& command_queue = this->device_->command_queue(); - std::shared_ptr input = std::make_shared(this->device_, 2048, 2048, BufferType::DRAM); - std::shared_ptr output = std::make_shared(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); uint32_t num_loops = parse_env("TT_METAL_TRACE_LOOPS", 4); uint32_t num_programs = parse_env("TT_METAL_TRACE_PROGRAMS", 4); @@ -314,7 +314,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { } for (int i = 0; i < num_programs; i++) { - interm_buffers.push_back(std::make_shared(this->device_, 2048, 2048, BufferType::DRAM)); + interm_buffers.push_back(Buffer::create(this->device_, 2048, 2048, BufferType::DRAM)); if (i == 0) { programs.push_back(create_simple_unary_program(*input, *(interm_buffers[i]))); } else if (i == (num_programs - 1)) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 73f0a7cd052..fc6345f5e9a 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -114,14 +114,14 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue continue; } size_t buf_size = config.num_pages * config.page_size; - Buffer bufa(device, buf_size, config.page_size, config.buftype); + auto bufa = Buffer::create(device, buf_size, config.page_size, config.buftype); - vector src = generate_arange_vector(bufa.size()); + vector src = generate_arange_vector(bufa->size()); if (cq_write) { - EnqueueWriteBuffer(cq, bufa, src.data(), false); + EnqueueWriteBuffer(cq, *bufa, src.data(), false); } else { - ::detail::WriteToBuffer(bufa, src); + ::detail::WriteToBuffer(*bufa, src); if (config.buftype == BufferType::DRAM) { tt::Cluster::instance().dram_barrier(device->id()); } else { @@ -137,9 +137,9 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue } if (cq_read) { - EnqueueReadBuffer(cq, bufa, result.data(), true); + EnqueueReadBuffer(cq, *bufa, result.data(), true); } else { - ::detail::ReadFromBuffer(bufa, result); + ::detail::ReadFromBuffer(*bufa, result); } EXPECT_EQ(src, result); @@ -154,7 +154,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( bool pass = true; uint32_t num_pages_left = config.num_pages_total; - std::vector> buffers; + std::vector> buffers; std::vector> srcs; std::vector> dsts; while (num_pages_left) { @@ -173,9 +173,9 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( buftype = BufferType::L1; } - std::unique_ptr buf; + std::shared_ptr buf; try { - buf = std::make_unique(device, buf_size, config.page_size, buftype); + buf = Buffer::create(device, buf_size, config.page_size, buftype); } catch (...) 
{ Finish(cq); size_t i = 0; @@ -185,7 +185,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( srcs.clear(); dsts.clear(); buffers.clear(); - buf = std::make_unique(device, buf_size, config.page_size, buftype); + buf = Buffer::create(device, buf_size, config.page_size, buftype); } EnqueueWriteBuffer(cq, *buf, src, false); vector dst; @@ -238,12 +238,12 @@ void stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( src.at(i) = i; } - Buffer buf(device, buf_size, config.page_size(), buftype, config.mem_config, shard_spec); + auto buf = Buffer::create(device, buf_size, config.page_size(), buftype, config.mem_config, shard_spec); vector src2 = src; if (cq_write) { - EnqueueWriteBuffer(cq, buf, src2.data(), false); + EnqueueWriteBuffer(cq, *buf, src2.data(), false); } else { - ::detail::WriteToBuffer(buf, src); + ::detail::WriteToBuffer(*buf, src); if (buftype == BufferType::DRAM) { tt::Cluster::instance().dram_barrier(device->id()); } else { @@ -258,9 +258,9 @@ void stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( vector res; res.resize(buf_size / sizeof(uint32_t)); if (cq_read) { - EnqueueReadBuffer(cq, buf, res.data(), true); + EnqueueReadBuffer(cq, *buf, res.data(), true); } else { - ::detail::ReadFromBuffer(buf, res); + ::detail::ReadFromBuffer(*buf, res); } EXPECT_EQ(src, res); } @@ -477,25 +477,25 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { // leave only small_page_size * 2 B of space in the completion queue uint32_t num_pages_second_buffer = (space_after_first_buffer / small_page_size) - 2; - Buffer buff_1(device, first_buffer_size, large_page_size, BufferType::DRAM); - auto src_1 = local_test_functions::generate_arange_vector(buff_1.size()); - EnqueueWriteBuffer(device->command_queue(), buff_1, src_1, false); + auto buff_1 = Buffer::create(device, first_buffer_size, large_page_size, BufferType::DRAM); + auto src_1 = local_test_functions::generate_arange_vector(buff_1->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_1, src_1, false); vector result_1; - EnqueueReadBuffer(device->command_queue(), buff_1, result_1, true); + EnqueueReadBuffer(device->command_queue(), *buff_1, result_1, true); EXPECT_EQ(src_1, result_1); - Buffer buff_2(device, num_pages_second_buffer * small_page_size, small_page_size, BufferType::DRAM); - auto src_2 = local_test_functions::generate_arange_vector(buff_2.size()); - EnqueueWriteBuffer(device->command_queue(), buff_2, src_2, false); + auto buff_2 = Buffer::create(device, num_pages_second_buffer * small_page_size, small_page_size, BufferType::DRAM); + auto src_2 = local_test_functions::generate_arange_vector(buff_2->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_2, src_2, false); vector result_2; - EnqueueReadBuffer(device->command_queue(), buff_2, result_2, true); + EnqueueReadBuffer(device->command_queue(), *buff_2, result_2, true); EXPECT_EQ(src_2, result_2); - Buffer buff_3(device, 32 * large_page_size, large_page_size, BufferType::DRAM); - auto src_3 = local_test_functions::generate_arange_vector(buff_3.size()); - EnqueueWriteBuffer(device->command_queue(), buff_3, src_3, false); + auto buff_3 = Buffer::create(device, 32 * large_page_size, large_page_size, BufferType::DRAM); + auto src_3 = local_test_functions::generate_arange_vector(buff_3->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_3, src_3, false); vector result_3; - EnqueueReadBuffer(device->command_queue(), buff_3, result_3, true); + EnqueueReadBuffer(device->command_queue(), *buff_3, 
result_3, true); EXPECT_EQ(src_3, result_3); } } @@ -510,25 +510,25 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { uint32_t num_pages_buff_1 = 9; uint32_t page_size_buff_1 = 2048; - Buffer buff_1(device, num_pages_buff_1 * page_size_buff_1, page_size_buff_1, BufferType::DRAM); - uint32_t space_after_buff_1 = command_completion_region_size - buff_1.size(); + auto buff_1 = Buffer::create(device, num_pages_buff_1 * page_size_buff_1, page_size_buff_1, BufferType::DRAM); + uint32_t space_after_buff_1 = command_completion_region_size - buff_1->size(); uint32_t page_size = 8192; uint32_t desired_remaining_space_before_wrap = 6144; uint32_t avail_space_for_wrapping_buffer = space_after_buff_1 - desired_remaining_space_before_wrap; uint32_t num_pages_for_wrapping_buffer = (avail_space_for_wrapping_buffer / page_size) + 4; - auto src_1 = local_test_functions::generate_arange_vector(buff_1.size()); - EnqueueWriteBuffer(device->command_queue(), buff_1, src_1, false); + auto src_1 = local_test_functions::generate_arange_vector(buff_1->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_1, src_1, false); vector result_1; - EnqueueReadBuffer(device->command_queue(), buff_1, result_1, true); + EnqueueReadBuffer(device->command_queue(), *buff_1, result_1, true); EXPECT_EQ(src_1, result_1); - Buffer wrap_buff(device, num_pages_for_wrapping_buffer * page_size, page_size, BufferType::DRAM); - auto src_2 = local_test_functions::generate_arange_vector(wrap_buff.size()); - EnqueueWriteBuffer(device->command_queue(), wrap_buff, src_2, false); + auto wrap_buff = Buffer::create(device, num_pages_for_wrapping_buffer * page_size, page_size, BufferType::DRAM); + auto src_2 = local_test_functions::generate_arange_vector(wrap_buff->size()); + EnqueueWriteBuffer(device->command_queue(), *wrap_buff, src_2, false); vector result_2; - EnqueueReadBuffer(device->command_queue(), wrap_buff, result_2, true); + EnqueueReadBuffer(device->command_queue(), *wrap_buff, result_2, true); EXPECT_EQ(src_2, result_2); } } @@ -585,19 +585,19 @@ TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { - Buffer bufa(device, 125000, 100, buff_type); - auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(device->command_queue(), bufa, src_a, false); + auto bufa = Buffer::create(device, 125000, 100, buff_type); + auto src_a = local_test_functions::generate_arange_vector(bufa->size()); + EnqueueWriteBuffer(device->command_queue(), *bufa, src_a, false); - Buffer bufb(device, 152000, 152, buff_type); - auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(device->command_queue(), bufb, src_b, false); + auto bufb = Buffer::create(device, 152000, 152, buff_type); + auto src_b = local_test_functions::generate_arange_vector(bufb->size()); + EnqueueWriteBuffer(device->command_queue(), *bufb, src_b, false); vector result_a; - EnqueueReadBuffer(device->command_queue(), bufa, result_a, true); + EnqueueReadBuffer(device->command_queue(), *bufa, result_a, true); vector result_b; - EnqueueReadBuffer(device->command_queue(), bufb, result_b, true); + EnqueueReadBuffer(device->command_queue(), *bufb, result_b, true); EXPECT_EQ(src_a, result_a); EXPECT_EQ(src_b, result_b); @@ -621,19 +621,19 @@ TEST_F(CommandQueueSingleCardFixture, TestNonblockingReads) { constexpr BufferType buff_type = BufferType::L1; for (auto device : devices_) { - 
Buffer bufa(device, 2048, 2048, buff_type); - auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(device->command_queue(), bufa, src_a, false); + auto bufa = Buffer::create(device, 2048, 2048, buff_type); + auto src_a = local_test_functions::generate_arange_vector(bufa->size()); + EnqueueWriteBuffer(device->command_queue(), *bufa, src_a, false); - Buffer bufb(device, 2048, 2048, buff_type); - auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(device->command_queue(), bufb, src_b, false); + auto bufb = Buffer::create(device, 2048, 2048, buff_type); + auto src_b = local_test_functions::generate_arange_vector(bufb->size()); + EnqueueWriteBuffer(device->command_queue(), *bufb, src_b, false); vector result_a; - EnqueueReadBuffer(device->command_queue(), bufa, result_a, false); + EnqueueReadBuffer(device->command_queue(), *bufa, result_a, false); vector result_b; - EnqueueReadBuffer(device->command_queue(), bufb, result_b, false); + EnqueueReadBuffer(device->command_queue(), *bufb, result_b, false); Finish(device->command_queue()); EXPECT_EQ(src_a, result_a); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp index dd4b88ae85f..03e6eb1004c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp @@ -225,8 +225,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { auto& command_queue = this->device_->command_queue(); auto current_mode = CommandQueue::default_mode(); command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; /* Dummy program that helps keep track of buffers */ - std::vector buffer_objects; + Program program; for (int j = 0; j < 10; j++) { // Asynchronously initialize a buffer on device uint32_t first_buf_value = j + 1; @@ -234,26 +233,14 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { uint32_t first_buf_size = 4096; uint32_t second_buf_size = 2048; // Asynchronously allocate buffer on device - std::shared_ptr buffer = std::make_shared(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); - // Copy the host side buffer structure to an object (i.e. reallocate on device). After reallocation the addresses should not match - buffer_objects.push_back(*buffer); - // Confirm that the addresses do not match after copy/reallocation. 
Send async calls to get addrs, which will return post dealloc addr + std::shared_ptr buffer = Buffer::create(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); std::shared_ptr allocated_buffer_address = std::make_shared(); EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); - std::shared_ptr allocated_buffer_address_2 = std::make_shared(); - EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address_2.get(), &(buffer_objects.back()), true); - EXPECT_NE(*allocated_buffer_address_2, *allocated_buffer_address); - // Ensure returned addrs are correct + // Ensure returned addr is correct EXPECT_EQ((*allocated_buffer_address), buffer->address()); - EXPECT_EQ((*allocated_buffer_address_2), buffer_objects.back().address()); - // Deallocate the second device side buffer - detail::DeallocateBuffer(&(buffer_objects.back())); - // Make the buffer_object address and the buffer address identical with a blocking call. buffer_object and buffer are now the same device side buffer - buffer_objects.back().set_address(*allocated_buffer_address); std::shared_ptr> vec = std::make_shared>(first_buf_size / 4, first_buf_value); - std::vector readback_vec_1 = {}; - std::vector readback_vec_2 = {}; + std::vector readback_vec = {}; // Write first vector to existing on device buffer. EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access to this data) @@ -261,19 +248,15 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { // Simulate what tt-eager does: Share buffer ownership with program AssignGlobalBufferToProgram(buffer, program); // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be deallocated) - buffer = std::make_shared(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); + buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); // Write second vector to second buffer EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); // Have main thread give up ownership immediately after writing vec.reset(); // Read both buffer and ensure data is correct - EnqueueReadBuffer(this->device_->command_queue(), buffer_objects.back(), readback_vec_1, false); - EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec_2, true); - for (int i = 0; i < readback_vec_1.size(); i++) { - EXPECT_EQ(readback_vec_1[i], first_buf_value); - } - for (int i = 0; i < readback_vec_2.size(); i++) { - EXPECT_EQ(readback_vec_2[i], second_buf_value); + EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); + for (int i = 0; i < readback_vec.size(); i++) { + EXPECT_EQ(readback_vec[i], second_buf_value); } } command_queue.set_mode(current_mode); @@ -335,7 +318,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncAssertForDeprecatedAPI) { "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto src0 = std::make_shared(this->device_, buf_size, page_size, BufferType::DRAM); + auto src0 = Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); std::vector runtime_args = {src0->address()}; try { SetRuntimeArgs(program, dummy_kernel, core, runtime_args); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp index a3bd30b55fb..5a772063742 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp @@ -39,7 +39,7 @@ TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrde vector> buffers; for (size_t i = 0; i < num_buffers; i++) { - buffers.push_back(std::make_shared(this->device_, page_size, page_size, BufferType::DRAM)); + buffers.push_back(Buffer::create(this->device_, page_size, page_size, BufferType::DRAM)); if (data_movement_mode == DataMovementMode::WRITE) { EnqueueWriteBuffer(this->device_->command_queue(), buffers.back(), page, true); @@ -281,7 +281,7 @@ TEST_F(CommandQueueFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); EXPECT_EQ(event->event_id, events_issued_per_cq + 1); // Event ids start at 1 - std::shared_ptr buf = std::make_shared(this->device_, page_size, page_size, BufferType::DRAM); + std::shared_ptr buf = Buffer::create(this->device_, page_size, page_size, BufferType::DRAM); EnqueueWriteBuffer(this->device_->command_queue(), buf, page, true); EnqueueWaitForEvent(this->device_->command_queue(), event); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 24e3f9b9b79..cef5a8d0c18 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -77,14 +77,14 @@ namespace basic_tests { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { Setup(2048, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(0); CommandQueue& data_movement_queue = this->device_->command_queue(1); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -93,7 +93,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { vector eager_output_data; eager_output_data.resize(input_data.size()); - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, simple_program, true); EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); @@ -101,14 +101,14 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { vector trace_output_data; trace_output_data.resize(input_data.size()); - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, simple_program, false); 
EndTraceCapture(this->device_, command_queue.id(), tid); EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(data_movement_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done @@ -118,14 +118,14 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { Setup(4096, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(0); CommandQueue& data_movement_queue = this->device_->command_queue(1); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -146,7 +146,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { uint32_t trace_id = 0; bool trace_captured = false; for (auto i = 0; i < num_loops; i++) { - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); if (not trace_captured) { trace_id = BeginTraceCapture(this->device_, command_queue.id()); @@ -156,7 +156,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { } EnqueueTrace(command_queue, trace_id, false); - EnqueueReadBuffer(data_movement_queue, output, trace_outputs[i].data(), true); + EnqueueReadBuffer(data_movement_queue, *output, trace_outputs[i].data(), true); // Expect same output across all loops EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); @@ -169,8 +169,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { Setup(6144, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); constexpr bool kBlocking = true; constexpr bool kNonBlocking = false; @@ -180,8 +180,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { // Keep this queue in passthrough mode for now CommandQueue& command_queue = this->device_->command_queue(0); - auto simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + auto simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -202,18 +202,18 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { eager_output_data.resize(input_data.size()); // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, input, input_data.data(), kBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); EnqueueProgram(command_queue, simple_program, kBlocking); - EnqueueReadBuffer(command_queue, output, expected_output_data.data(), kBlocking); + 
EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); Finish(command_queue); for (bool blocking : blocking_flags) { std::string mode = blocking ? "Eager-B" : "Eager-NB"; for (auto i = 0; i < num_loops; i++) { tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), blocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); EnqueueProgram(command_queue, simple_program, blocking); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); } if (not blocking) { // (Optional) wait for the last non-blocking command to finish @@ -230,9 +230,9 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { // Trace mode execution for (auto i = 0; i < num_loops; i++) { tt::ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); EnqueueTrace(command_queue, tid, kNonBlocking); - EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp index 73baaa14267..b6c3e82791c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp @@ -218,7 +218,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent vector> srcs; for (uint i = 0; i < cqs.size(); i++) { uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); log_debug(tt::LogTest, "buf_idx: {} Doing Write to cq_id: {} of data: {}", buf_idx, i, srcs[i]); EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); @@ -280,7 +280,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent auto event = std::make_shared(); vector result; - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); // Blocking Read after Non-Blocking Write on alternate CQs, events ensure ordering. @@ -339,7 +339,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent vector> read_results; vector> buffers; - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); // Number of write-read combos per buffer. Fewer make RAW race without events easier to hit. 
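// Illustrative sketch, not part of this patch: the test changes above swap direct Buffer
// construction (std::make_shared / stack Buffer) for the Buffer::create factory, which returns
// a std::shared_ptr<Buffer>, so call sites dereference it when passing the buffer to
// EnqueueWriteBuffer/EnqueueReadBuffer. The function name and the Device*/CommandQueue&
// arguments below are assumptions for the example only.
inline void example_buffer_roundtrip(Device* device, CommandQueue& cq) {
    auto buffer = Buffer::create(device, /*size=*/2048, /*page_size=*/2048, BufferType::DRAM);
    std::vector<uint32_t> src(buffer->size() / sizeof(uint32_t), 0);
    for (uint32_t i = 0; i < src.size(); i++) {
        src[i] = i;
    }
    EnqueueWriteBuffer(cq, *buffer, src.data(), /*blocking=*/false);
    std::vector<uint32_t> dst(src.size());
    EnqueueReadBuffer(cq, *buffer, dst.data(), /*blocking=*/true);
}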
for (uint j = 0; j < num_wr_rd_per_buf; j++) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 94572201f06..f294316c6e4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -21,10 +21,10 @@ bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, v for (const bool use_void_star_api: {true, false}) { size_t buf_size = config.num_pages * config.page_size; - std::vector> buffers; + std::vector> buffers; std::vector> srcs; for (uint i = 0; i < cqs.size(); i++) { - buffers.push_back(std::make_unique(device, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size())); if (use_void_star_api) { EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp index 654e9bd5d54..0e43c63707b 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp @@ -32,12 +32,11 @@ void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape inpu host_data[i] = 1; } - auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, input_shape, dtype, Layout::TILE); - tt::log_debug("input_data: \n {}", input_tensor.write_to_string()); ttnn::write_buffer(io_cq, input_tensor, {host_data}); @@ -65,6 +64,10 @@ INSTANTIATE_TEST_SUITE_P( CreateTensorTestWithShape, CreateTensorTest, ::testing::Values( - CreateTensorParams{.shape=ttnn::SimpleShape({1, 1, 32, 32})} + CreateTensorParams{.shape=ttnn::SimpleShape({1, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({2, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0, 0, 0, 0})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0})} ) ); diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp index f3dc59b25a7..4d50192f234 100644 --- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp +++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp @@ -54,8 +54,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { auto workload_event = std::make_shared(); // Running sum-reduce with preallocated output // Preallocate Input and Output Tensors on Device - auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape.padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); - auto output_buffer = ttnn::allocate_buffer_on_device(output_buf_size_datums 
* datum_size_bytes, device, np_out.get_padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape.padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(output_buf_size_datums * datum_size_bytes, device, np_out.get_padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; auto output_storage = tt::tt_metal::DeviceStorage{output_buffer}; Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE); @@ -118,7 +118,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { auto write_event = std::make_shared(); auto workload_event = std::make_shared(); - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1 @@ -128,10 +128,10 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { // Run operation on cq 0 Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor); - auto dummy_buffer_0 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); output_tensor = ttnn::neg(workload_dispatch_cq, output_tensor); // Allocate this buffer to stress test async allocation across op execution and explicit allocation - auto dummy_buffer_1 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); // Record cq 0 prog execution ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event); // Wait until cq 0 prog execution is done @@ -164,7 +164,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) { // deallocate the buffer) in a loop for (int loop = 0; loop < 100000; loop++) { { - auto input_buffer_dummy = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); device->synchronize(); } } diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index b101c6a6065..fc97f0aad13 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -155,7 +155,7 @@ TEST(GalaxyTests, TestAllGatherDeadlock) { log_info(LogTest, "Running iteration {}", i); } for (auto& dev : devs) { - 
auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. @@ -250,7 +250,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { log_info(LogTest, "Running iteration {}", i); } for (auto& dev : ring_devices) { - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp index 75d6f1afd7b..a1ce8693233 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp @@ -59,7 +59,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); @@ -110,7 +110,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); @@ -161,7 +161,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); diff --git a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp index a7ace491790..98fa377529c 100644 --- 
a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp @@ -52,7 +52,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) { t0_host_data[i] = bfloat16(static_cast(2 + j)); } // Allocate and write buffer - auto t0_input_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto t0_input_storage = tt::tt_metal::DeviceStorage{t0_input_buffer}; Tensor t0_input_tensor = Tensor(t0_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); ttnn::write_buffer(t0_io_cq, t0_input_tensor, {t0_host_data}); @@ -71,7 +71,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) { for (int i = 0; i < tensor_buf_size; i++) { t1_host_data[i] = bfloat16(static_cast(4 + j)); } - auto t1_input_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto t1_input_storage = tt::tt_metal::DeviceStorage{t1_input_buffer}; Tensor t1_input_tensor = Tensor(t1_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); @@ -118,7 +118,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiAppThreadSync) { ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024}; auto host_data = std::shared_ptr(new bfloat16[tensor_buf_size]); - auto allocated_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto allocated_storage = tt::tt_metal::DeviceStorage{allocated_buffer}; auto allocated_tensor = Tensor(allocated_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); auto readback_data = std::shared_ptr(new bfloat16[tensor_buf_size]); diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index d362bee71a2..d6168102a5e 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -276,7 +276,7 @@ inline namespace v0 { void SetLazyCommandQueueMode(bool lazy); - void AllocateBuffer(Buffer* buffer, bool bottom_up); + DeviceAddr AllocateBuffer(const Buffer* buffer, bool bottom_up); void DeallocateBuffer(Buffer *buffer); } // namespace detail diff --git a/tt_metal/detail/util.hpp b/tt_metal/detail/util.hpp index 9938f01c82a..5418e0562df 100644 --- a/tt_metal/detail/util.hpp +++ b/tt_metal/detail/util.hpp @@ -26,9 +26,9 @@ namespace tt::tt_metal::detail{ } inline DeviceAddr SizeBytesPerBank(DeviceAddr size_bytes, DeviceAddr page_size_bytes, uint32_t num_banks, uint32_t alignment_bytes) { - TT_ASSERT(page_size_bytes > 0 and size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes); - DeviceAddr num_pages = size_bytes / page_size_bytes; - DeviceAddr num_equally_distributed_pages = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks); + TT_ASSERT(page_size_bytes == 0 ? 
size_bytes == 0 : size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes); + DeviceAddr num_pages = page_size_bytes == 0 ? 0 : size_bytes / page_size_bytes; + DeviceAddr num_equally_distributed_pages = num_pages == 0 ? 0 : 1 + ((num_pages - 1) / num_banks); return num_equally_distributed_pages * round_up(page_size_bytes, alignment_bytes); } diff --git a/tt_metal/graph/graph_tracking.cpp b/tt_metal/graph/graph_tracking.cpp index 04a9ae4240a..17a72ddd5ee 100644 --- a/tt_metal/graph/graph_tracking.cpp +++ b/tt_metal/graph/graph_tracking.cpp @@ -27,7 +27,7 @@ bool GraphTracker::add_hook(const std::shared_ptr& new_hook) { return true; } -void GraphTracker::track_allocate(Buffer* buffer, bool bottom_up) { +void GraphTracker::track_allocate(const Buffer* buffer, bool bottom_up) { if (processors.empty()) { return; } @@ -73,7 +73,7 @@ void GraphTracker::track_program(Program* program) { } } -bool GraphTracker::hook_allocate(Buffer* buffer, bool bottom_up) { +bool GraphTracker::hook_allocate(const Buffer* buffer, bool bottom_up) { if (hook == nullptr) return false; diff --git a/tt_metal/graph/graph_tracking.hpp b/tt_metal/graph/graph_tracking.hpp index a39cf3d4a56..54ee8eef41d 100644 --- a/tt_metal/graph/graph_tracking.hpp +++ b/tt_metal/graph/graph_tracking.hpp @@ -28,7 +28,7 @@ inline namespace v0 { IGraphProcessor() = default; - virtual void track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) {}; + virtual void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {}; virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {}; @@ -54,7 +54,7 @@ inline namespace v0 { class IGraphHooks { public: IGraphHooks() = default; - virtual bool hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) = 0; + virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) = 0; virtual bool hook_deallocate(tt::tt_metal::Buffer* buffer) = 0; @@ -77,7 +77,7 @@ inline namespace v0 { bool add_hook(const std::shared_ptr& hook); - void track_allocate(Buffer* buffer, bool bottom_up); + void track_allocate(const Buffer* buffer, bool bottom_up); void track_deallocate(Buffer* buffer); @@ -118,7 +118,7 @@ inline namespace v0 { } } - bool hook_allocate(Buffer* buffer, bool bottom_up); + bool hook_allocate(const Buffer* buffer, bool bottom_up); bool hook_deallocate(Buffer* buffer); diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 5d204dde8c1..0403a82af98 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -28,7 +28,10 @@ void validate_buffer_size_and_page_size( const BufferType &buffer_type, const TensorMemoryLayout &buffer_layout, const std::optional& shard_parameters) { - TT_FATAL(size != 0 and page_size != 0, "Buffer size and page size should be larger than 0 bytes!"); + if (size == 0) { + return; + } + bool valid_page_size = (size % page_size == 0); TT_FATAL( valid_page_size, @@ -55,7 +58,7 @@ inline std::tuple>, std::vector &page_shape, const std::array &shard_shape, const std::array &tensor2d_size) { - std::array shard_in_pages = {shard_shape[0] / page_shape[0], shard_shape[1] / page_shape[1]}; + std::array shard_in_pages = {page_shape[0] == 0 ? 0 : shard_shape[0] / page_shape[0], page_shape[1] == 0 ? 
0 : shard_shape[1] / page_shape[1]}; std::vector> ret_vec(num_shards); std::vector> ret_shard_shape(num_shards, shard_in_pages); @@ -80,7 +83,7 @@ inline std::tuple>, std::vector>, std::vector& shard_parameters, - const std::optional bottom_up, - bool allocate) : - device_(device), - size_(size), - page_size_(page_size), - buffer_type_(buffer_type), - buffer_layout_(buffer_layout), - shard_parameters_(shard_parameters), - bottom_up_(bottom_up), - buffer_page_mapping_(nullptr), - allocate_(allocate) { - TT_FATAL(this->device_ != nullptr and this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); - validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters); - if (allocate) { - this->allocate(); +BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { + BufferPageMapping buffer_page_mapping; + + if (buffer.size() == 0) { + return buffer_page_mapping; } -} + auto shard_spec = buffer.shard_spec(); -BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { - BufferPageMapping buffer_page_mapping; - bool row_major = buffer.shard_spec().orientation() == ShardOrientation::ROW_MAJOR; + bool row_major = shard_spec.orientation() == ShardOrientation::ROW_MAJOR; uint32_t num_cores = buffer.num_cores(); - buffer_page_mapping.all_cores_ = corerange_to_cores(buffer.shard_spec().grid(), num_cores, row_major); - TT_ASSERT(num_cores == buffer_page_mapping.all_cores_.size()); + buffer_page_mapping.all_cores_ = corerange_to_cores(shard_spec.grid(), num_cores, row_major); + TT_FATAL(num_cores == buffer_page_mapping.all_cores_.size(), "Buffer has {} cores, but page mapping expects {} cores", num_cores, buffer_page_mapping.all_cores_.size()); uint32_t core_id = 0; for (const auto &core : buffer_page_mapping.all_cores_) { buffer_page_mapping.core_to_core_id_.insert({core, core_id}); @@ -154,12 +138,12 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { uint32_t num_dev_pages = buffer.num_dev_pages(); auto [core_host_page_indices, shard_shape] = core_to_host_pages( num_dev_pages, - buffer.shard_spec().size(), + shard_spec.size(), num_cores, buffer.buffer_layout(), - buffer.shard_spec().page_shape, - buffer.shard_spec().shape(), - buffer.shard_spec().tensor2d_shape); + shard_spec.page_shape, + shard_spec.shape(), + shard_spec.tensor2d_shape); buffer_page_mapping.core_host_page_indices_ = std::vector>(num_cores); @@ -172,10 +156,10 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { buffer_page_mapping.core_shard_shape_ = std::move(shard_shape); uint32_t dev_page_index = 0; - auto shape_in_pages = buffer.shard_spec().shape_in_pages(); + auto shape_in_pages = shard_spec.shape_in_pages(); for (uint32_t core_index = 0; core_index < core_host_page_indices.size(); core_index++) { uint32_t valid_shard_page = 0; - buffer_page_mapping.core_host_page_indices_[core_index].reserve(buffer.shard_spec().size()); + buffer_page_mapping.core_host_page_indices_[core_index].reserve(shard_spec.size()); uint32_t shard_page_id = 0; for (uint32_t shard_page_x = 0; shard_page_x < shape_in_pages[0]; shard_page_x++) { for (uint32_t shard_page_y = 0; shard_page_y < shape_in_pages[1]; shard_page_y++) { @@ -198,89 +182,162 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { return buffer_page_mapping; } -Buffer::Buffer(const Buffer &other) : - device_(other.device_), - size_(other.size_), - page_size_(other.page_size_), - buffer_type_(other.buffer_type_), - 
buffer_layout_(other.buffer_layout_), - shard_parameters_(other.shard_parameters_), - bottom_up_(other.bottom_up_), - buffer_page_mapping_(other.buffer_page_mapping_), - allocate_(other.allocate_) { - if (this->allocate_) { - this->allocate(); +Buffer::Buffer( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + const BufferType buffer_type, + const TensorMemoryLayout buffer_layout, + const std::optional& shard_parameters, + const std::optional bottom_up) : + device_(device), + size_(size), + page_size_(page_size), + buffer_type_(buffer_type), + buffer_layout_(buffer_layout), + shard_parameters_(shard_parameters), + bottom_up_(bottom_up), + buffer_page_mapping_(nullptr) { + TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); + + if (size != 0) { + validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters); } } -Buffer &Buffer::operator=(const Buffer &other) { - if (this != &other) { - this->device_ = other.device_; - this->size_ = other.size_; - this->page_size_ = other.page_size_; - this->buffer_type_ = other.buffer_type_; - this->buffer_layout_ = other.buffer_layout_; - this->shard_parameters_ = other.shard_parameters_; - this->bottom_up_ = other.bottom_up_; - this->buffer_page_mapping_ = other.buffer_page_mapping_; - this->allocate_ = other.allocate_; - if (this->allocate_) { - this->allocate(); - } +std::shared_ptr Buffer::create( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + const BufferType buffer_type, + const TensorMemoryLayout buffer_layout, + const std::optional& shard_parameters, + const std::optional bottom_up) { + auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up); + auto buffer = std::shared_ptr(bufferPtr, deleter); + buffer->weak_self = buffer; + + if (buffer->size_ == 0) { + buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); + return buffer; } - return *this; -} - -Buffer::Buffer(Buffer &&other) : - device_(other.device_), - size_(other.size_), - address_(other.address_), - page_size_(other.page_size_), - buffer_type_(other.buffer_type_), - buffer_layout_(other.buffer_layout_), - shard_parameters_(std::move(other.shard_parameters_)), - bottom_up_(other.bottom_up_), - buffer_page_mapping_(std::move(other.buffer_page_mapping_)), - allocate_(other.allocate_) { - // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is - // transferred to `this` - other.device_ = nullptr; -} - -Buffer &Buffer::operator=(Buffer &&other) { - if (this != &other) { - this->device_ = other.device_; - this->size_ = other.size_; - this->address_ = other.address_; - this->page_size_ = other.page_size_; - this->buffer_type_ = other.buffer_type_; - this->buffer_layout_ = other.buffer_layout_; - this->shard_parameters_ = std::move(other.shard_parameters_); - this->bottom_up_ = other.bottom_up_; - this->buffer_page_mapping_ = std::move(other.buffer_page_mapping_); - this->allocate_ = other.allocate_; - // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is - // transferred to `this` - other.device_ = nullptr; + + buffer->device_->push_work([buffer] { + bool bottom_up = buffer->bottom_up_.value_or(buffer->is_dram()); + buffer->address_ = detail::AllocateBuffer(buffer.get(), bottom_up); + detail::BUFFER_MAP.insert({buffer->device_->id(), buffer->address_}, buffer.get()); + + 
std::unique_lock lock(buffer->allocation_mutex_); + buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); + lock.unlock(); + buffer->allocation_cv_.notify_all(); + }); + + return buffer; +} + +void Buffer::deallocate() { + deallocation_requested_.store(true, std::memory_order::relaxed); + device_->push_work([self = weak_self.lock()] { + self->deallocate_impl(); + }); +} + +void Buffer::deleter(Buffer* buffer) { + buffer->device_->push_work([buffer] { + std::unique_ptr unique_buffer = std::unique_ptr(buffer); + buffer->deallocate_impl(); + }); +} + +void Buffer::deallocate_impl() { + if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) { + return; + } + + if (device_->initialized_ && size_ != 0) { + // address_ is only modified from this thread, no sync required + detail::BUFFER_MAP.erase({device_->id(), address_}); + detail::DeallocateBuffer(this); + } + + allocation_status_.store(AllocationStatus::DEALLOCATED, std::memory_order::relaxed); +} + +bool Buffer::is_allocated() const { + auto allocation_status = allocation_status_.load(std::memory_order::relaxed); + + if (device_->can_use_passthrough_scheduling()) { + return allocation_status == AllocationStatus::ALLOCATED; + } + + // For calls from different threads we consider buffer to be allocated even if it's just ALLOCATION_REQUESTED, + // because once the caller will try to access it, the buffer will already be fully allocated. For the same reason we need to check deallocation_requested_ too. + bool deallocation_requested = deallocation_requested_.load(std::memory_order::relaxed); + return (allocation_status == AllocationStatus::ALLOCATION_REQUESTED || allocation_status == AllocationStatus::ALLOCATED) && !deallocation_requested; +} + +uint32_t Buffer::address() const { + if (device_->can_use_passthrough_scheduling()) { + return address_; + } + + std::unique_lock lock(allocation_mutex_); + allocation_cv_.wait(lock, [this] { return this->allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATION_REQUESTED; }); + return address_; +} + +DeviceAddr Buffer::page_size() const { + return page_size_; +} + +void Buffer::set_page_size(DeviceAddr page_size) { + TT_FATAL(page_size == 0 ? size_ == 0 : size_ % page_size == 0, "buffer size must be divisible by new page size"); + page_size_ = page_size; + this->buffer_page_mapping_ = nullptr; +} + +uint32_t Buffer::num_pages() const { + return page_size() == 0 ? 0 : size() / page_size(); +} + +uint32_t Buffer::num_dev_pages() const { + if (!is_sharded(this->buffer_layout_)) { + return this->num_pages(); + } + + return this->shard_spec().size() * this->num_cores(); +} + +CoreType Buffer::core_type() const { + switch (this->buffer_type_) { + case BufferType::DRAM: + return CoreType::DRAM; + case BufferType::L1: + case BufferType::L1_SMALL: + return CoreType::WORKER; + default: + TT_THROW("Unknown CoreType {} for buffer", this->buffer_type_); } - return *this; } -void Buffer::allocate() { - TT_ASSERT(this->device_ != nullptr); - // L1 and Trace buffers (which live in DRAM) are allocated top down! 
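// A minimal sketch (not taken verbatim from this diff) of the buffer lifecycle introduced here:
// Buffer::create queues the allocation on the device worker thread and returns a shared_ptr
// immediately; address() called from another thread waits until the allocation has completed;
// deallocate() may be called more than once and the final cleanup runs through Buffer::deleter
// on the worker thread. The function name and Device* argument are illustrative assumptions.
inline void example_buffer_lifetime(Device* device) {
    auto buffer = Buffer::create(device, /*size=*/4096, /*page_size=*/1024, BufferType::L1);
    uint32_t addr = buffer->address();  // blocks until the worker thread has run AllocateBuffer
    (void)addr;
    buffer->deallocate();               // safe to repeat; deallocate_impl is pushed to the worker
}   // dropping the last shared_ptr also deallocates, via Buffer::deleter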
- bool bottom_up = this->bottom_up_.value_or(this->is_dram()); - detail::AllocateBuffer(this, bottom_up); - detail::BUFFER_MAP.insert({this->device_->id(), this->address_}, this); +bool Buffer::is_l1() const { + return buffer_type() == BufferType::L1 or buffer_type() == BufferType::L1_SMALL; +} +bool Buffer::is_dram() const { + return buffer_type() == BufferType::DRAM || buffer_type() == BufferType::TRACE; +} +bool Buffer::is_trace() const { + return buffer_type() == BufferType::TRACE; } uint32_t Buffer::dram_channel_from_bank_id(uint32_t bank_id) const { - TT_ASSERT(this->is_dram(), "Expected DRAM buffer!"); + TT_FATAL(this->is_dram(), "Expected DRAM buffer!"); return this->device_->dram_channel_from_bank_id(bank_id); } CoreCoord Buffer::logical_core_from_bank_id(uint32_t bank_id) const { - TT_ASSERT(this->is_l1(), "Expected L1 buffer!"); + TT_FATAL(this->is_l1(), "Expected L1 buffer!"); return this->device_->logical_core_from_bank_id(bank_id); } @@ -307,69 +364,88 @@ CoreCoord Buffer::noc_coordinates() const { return this->noc_coordinates(0); } DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { auto num_banks = this->device_->num_banks(this->buffer_type_); - TT_ASSERT(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); + TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); int pages_offset_within_bank = (int)page_index / num_banks; - auto offset = (round_up(this->page_size_, this->alignment()) * pages_offset_within_bank); + auto offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); return translate_page_address(offset, bank_id); } -uint32_t Buffer::alignment() const { return this->device_->get_allocator_alignment(); } +uint32_t Buffer::alignment() const { + return this->device_->get_allocator_alignment(); +} +DeviceAddr Buffer::aligned_page_size() const { + return align(page_size(), this->alignment()); +} +DeviceAddr Buffer::aligned_size() const { + return this->num_dev_pages() * this->aligned_page_size(); +} DeviceAddr Buffer::sharded_page_address(uint32_t bank_id, uint32_t page_index) const { - TT_ASSERT(is_sharded(this->buffer_layout())); - int pages_offset_within_bank = page_index % shard_spec().size(); - auto offset = (round_up(this->page_size_, this->alignment()) * pages_offset_within_bank); + TT_FATAL(is_sharded(this->buffer_layout()), "Buffer not sharded"); + auto shard_spec = this->shard_spec(); + uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + auto offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); return translate_page_address(offset, bank_id); } +ShardSpecBuffer Buffer::shard_spec() const { + TT_FATAL(is_sharded(this->buffer_layout_), "Buffer not sharded"); + TT_FATAL(shard_parameters_.has_value(), "Buffer is sharded, but no shard parameters specified"); + return this->shard_parameters_.value(); +} + +void Buffer::set_shard_spec(const ShardSpecBuffer& shard_spec) { + this->shard_parameters_ = shard_spec; + this->buffer_page_mapping_ = nullptr; +} + +uint32_t Buffer::num_cores() const { + if (!is_sharded(this->buffer_layout_)) + return 1; + + return this->shard_spec().tensor_shard_spec.grid.num_cores(); +} + DeviceAddr Buffer::translate_page_address(uint64_t offset, uint32_t bank_id) const { - DeviceAddr base_page_address = this->address_ + this->device_->bank_offset(this->buffer_type_, bank_id); + DeviceAddr base_page_address = this->address() + 
this->device_->bank_offset(this->buffer_type_, bank_id); return base_page_address + offset; } const std::shared_ptr& Buffer::get_buffer_page_mapping() { - TT_ASSERT(is_sharded(this->buffer_layout_), "Buffer not sharded"); + TT_FATAL(is_sharded(this->buffer_layout_), "Buffer not sharded"); if (!this->buffer_page_mapping_) { this->buffer_page_mapping_ = std::make_shared(generate_buffer_page_mapping(*this)); } return this->buffer_page_mapping_; } -void Buffer::deallocate() { +bool ShardSpec::operator==(const ShardSpec&) const = default; +bool ShardSpec::operator!=(const ShardSpec&) const = default; - if (this->device_ == nullptr or not this->device_->initialized_ or this->size_ == 0 or not this->allocate_) { - return; - } - // Mark as deallocated - this->size_ = 0; - TT_ASSERT(this->device_->allocator_ != nullptr, "Expected allocator to be initialized!"); - detail::BUFFER_MAP.erase({this->device_->id(), this->address_}); - detail::DeallocateBuffer(this); +std::array ShardSpecBuffer::shape_in_pages() const { + auto width_in_pages = page_shape[0] == 0 ? 0 : tensor_shard_spec.shape[0] / page_shape[0]; + auto height_in_pages = page_shape[1] == 0 ? 0 : tensor_shard_spec.shape[1] / page_shape[1]; + return {width_in_pages, height_in_pages}; } -Buffer::~Buffer() { this->deallocate(); } - -bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b) { - if (spec_a.grid != spec_b.grid) { - return false; - } - if (spec_a.shape != spec_b.shape) { - return false; - } - if (spec_a.orientation != spec_b.orientation) { - return false; - } - if (spec_a.halo != spec_b.halo) { - return false; - } - return true; +DeviceAddr ShardSpecBuffer::size() const { + auto shape_in_pages_ = this->shape_in_pages(); + return shape_in_pages_[0] * shape_in_pages_[1]; } -bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b) { return not(spec_a == spec_b); } - namespace detail { buffer_map_t BUFFER_MAP = {}; } } // namespace tt_metal } // namespace tt + +namespace tt::stl::json { +tt_metal::ShardSpec from_json_t::operator()(const nlohmann::json &json_object) const { + return tt_metal::ShardSpec{ + from_json(json_object.at("grid")), + from_json>(json_object.at("shape")), + from_json(json_object.at("orientation")), + from_json(json_object.at("halo"))}; +} +} diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index 91dd0fd846a..8c4332de0cb 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "common/bfloat16.hpp" @@ -46,21 +47,20 @@ struct ShardSpec { const ShardOrientation &shard_orientation_ = ShardOrientation::ROW_MAJOR, const bool &halo_ = false) : grid(core_sets_), shape(shard_shape_), orientation(shard_orientation_), halo(halo_) { - ; } const uint32_t num_cores() const { return this->grid.num_cores(); } const uint32_t numel() const { return this->shape[0] * this->shape[1]; } + bool operator==(const ShardSpec& other) const; + bool operator!=(const ShardSpec& other) const; + static constexpr auto attribute_names = std::forward_as_tuple("grid", "shape", "orientation", "halo"); constexpr auto attribute_values() const { return std::forward_as_tuple(this->grid, this->shape, this->orientation, this->halo); } }; -bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b); -bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b); - struct ShardSpecBuffer { ShardSpec tensor_shard_spec; std::array page_shape; @@ -91,15 +91,8 @@ struct ShardSpecBuffer { void set_shard_spec(const 
ShardSpec& shard_spec) { tensor_shard_spec = shard_spec; }; /* Shape in pages of the full tensor, not per core */ - std::array shape_in_pages() const { - auto width_in_pages = tensor_shard_spec.shape[0] / page_shape[0]; - auto height_in_pages = tensor_shard_spec.shape[1] / page_shape[1]; - return {width_in_pages, height_in_pages}; - } - DeviceAddr size() const { - auto shape_in_pages_ = this->shape_in_pages(); - return shape_in_pages_[0] * shape_in_pages_[1]; - } + std::array shape_in_pages() const; + DeviceAddr size() const; }; inline namespace v0 { @@ -110,7 +103,6 @@ struct BufferConfig { DeviceAddr page_size; // Size of unit being interleaved. For non-interleaved buffers: size == page_size BufferType buffer_type; TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED; - bool allocate = true; }; typedef BufferConfig InterleavedBufferConfig; @@ -124,7 +116,6 @@ struct ShardedBufferConfig { BufferType buffer_type = BufferType::L1; TensorMemoryLayout buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED; ShardSpecBuffer shard_parameters; - bool allocate = true; }; } // namespace v0 @@ -147,77 +138,41 @@ struct BufferPageMapping { inline namespace v0 { -class Buffer { +class Buffer final { public: - Buffer() : - device_(nullptr), - buffer_type_(BufferType::DRAM), - buffer_layout_(TensorMemoryLayout::INTERLEAVED), - shard_parameters_(std::nullopt), - bottom_up_(std::nullopt), - allocate_(true) {} - - Buffer( + static std::shared_ptr create( Device *device, DeviceAddr size, DeviceAddr page_size, - const BufferType buffer_type, - const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, + BufferType buffer_type, + TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, const std::optional& shard_parameter = std::nullopt, - const std::optional bottom_up = std::nullopt, - bool allocate = true); - - Buffer(const Buffer &other); - Buffer &operator=(const Buffer &other); + std::optional bottom_up = std::nullopt); - Buffer(Buffer &&other); - Buffer &operator=(Buffer &&other); + Buffer(const Buffer &other) = delete; + Buffer &operator=(const Buffer &other) = delete; + Buffer(Buffer &&other) = delete; + Buffer &operator=(Buffer &&other) = delete; - virtual ~Buffer(); Device *device() const { return device_; } + DeviceAddr size() const { return size_; } + bool is_allocated() const; - DeviceAddr size() const { return static_cast(size_); } - - void set_size(DeviceAddr size) { size_ = size; this->buffer_page_mapping_ = nullptr; } // Returns address of buffer in the first bank - uint32_t address() const { return static_cast(address_); } - - void set_address(uint64_t addr) { address_ = addr; } + uint32_t address() const; - DeviceAddr page_size() const { return page_size_; } + DeviceAddr page_size() const; + void set_page_size(DeviceAddr page_size); - uint32_t num_pages() const { return this->size() / this->page_size(); } - - void set_page_size(DeviceAddr page_size) { - TT_FATAL(size_ % page_size == 0, "buffer size must be divisible by new page size"); - page_size_ = page_size; - this->buffer_page_mapping_ = nullptr; - } - - uint32_t num_dev_pages() const { - if (!is_sharded(this->buffer_layout_)) { - return this->num_pages(); - } else { - return this->shard_spec().size() * this->num_cores(); - } - } + uint32_t num_pages() const; + uint32_t num_dev_pages() const; BufferType buffer_type() const { return buffer_type_; } - CoreType core_type() const { - switch (this->buffer_type_) { - case BufferType::DRAM: - return CoreType::DRAM; - case BufferType::L1: - case 
BufferType::L1_SMALL: - return CoreType::WORKER; - default: - TT_THROW("Unknown CoreType for buffer"); - } - } + CoreType core_type() const; - bool is_l1() const { return buffer_type() == BufferType::L1 or buffer_type() == BufferType::L1_SMALL; } - bool is_dram() const { return buffer_type() == BufferType::DRAM || buffer_type() == BufferType::TRACE; } - bool is_trace() const { return buffer_type() == BufferType::TRACE; } + bool is_l1() const; + bool is_dram() const; + bool is_trace() const; TensorMemoryLayout buffer_layout() const { return buffer_layout_; } @@ -233,56 +188,64 @@ class Buffer { DeviceAddr page_address(uint32_t bank_id, uint32_t page_index) const; uint32_t alignment() const; - - DeviceAddr aligned_page_size() const { return align(page_size_, this->alignment());} - - DeviceAddr aligned_size() const { return this->num_dev_pages() * this->aligned_page_size(); } + DeviceAddr aligned_page_size() const; + DeviceAddr aligned_size() const; // SHARDED API STARTS HERE // TODO: WILL SEPARATE INTO SHARDED BUFFER CLASS DeviceAddr sharded_page_address(uint32_t bank_id, uint32_t page_index) const; - ShardSpecBuffer shard_spec() const { - TT_ASSERT(is_sharded(this->buffer_layout_), "Buffer not sharded"); - TT_ASSERT(shard_parameters_.has_value()); - return this->shard_parameters_.value(); - } + ShardSpecBuffer shard_spec() const; + void set_shard_spec(const ShardSpecBuffer& shard_spec); - void set_shard_spec(const ShardSpecBuffer& shard_spec) { - this->shard_parameters_ = shard_spec; - this->buffer_page_mapping_ = nullptr; - } - - uint32_t num_cores() const { - if (!is_sharded(this->buffer_layout_)) - return 1; - else { - return this->shard_spec().tensor_shard_spec.grid.num_cores(); - } - } + uint32_t num_cores() const; const std::shared_ptr& get_buffer_page_mapping(); private: - virtual void allocate(); - - virtual void deallocate(); + Buffer( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + BufferType buffer_type, + TensorMemoryLayout buffer_layout, + const std::optional& shard_parameter, + std::optional bottom_up); + + enum class AllocationStatus : uint8_t { + ALLOCATION_REQUESTED, + ALLOCATED, + DEALLOCATED, + }; + + // Deallocate is allowed to be called multiple times on the same buffer + void deallocate(); + static void deleter(Buffer* buffer); + void deallocate_impl(); friend void DeallocateBuffer(Buffer &buffer); DeviceAddr translate_page_address(uint64_t offset, uint32_t bank_id) const; - Device *device_; - DeviceAddr size_; // Size in bytes - DeviceAddr address_; // Address of buffer - DeviceAddr page_size_; // Size of unit being interleaved. For non-interleaved buffers: size == page_size - BufferType buffer_type_; - TensorMemoryLayout buffer_layout_; + Device * const device_; + const DeviceAddr size_; // Size in bytes + const BufferType buffer_type_; + const TensorMemoryLayout buffer_layout_; + const std::optional bottom_up_; + + std::atomic allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED; + DeviceAddr address_ = 0; + mutable std::mutex allocation_mutex_; + mutable std::condition_variable allocation_cv_; + // Used exclusively for is_allocated() method + std::atomic deallocation_requested_ = false; + + // These members must be only accessed on the device worker thread + DeviceAddr page_size_; // Size of unit being interleaved. 
For non-interleaved buffers: size == page_size std::optional shard_parameters_; std::shared_ptr buffer_page_mapping_; - bool allocate_ = true; - protected: - std::optional bottom_up_; + + std::weak_ptr weak_self; }; } // namespace v0 @@ -335,13 +298,7 @@ using HostDataType = std::variant< namespace tt::stl::json { template <> -struct from_json_t { - auto operator()(const nlohmann::json &json_object) const { - return tt::tt_metal::ShardSpec{ - from_json(json_object.at("grid")), - from_json>(json_object.at("shape")), - from_json(json_object.at("orientation")), - from_json(json_object.at("halo"))}; - } +struct from_json_t { + tt_metal::ShardSpec operator()(const nlohmann::json &json_object) const; }; } // namespace tt::stl::json diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 47ce59b1383..7ba127c7803 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -3266,12 +3266,8 @@ CommandQueue &Device::command_queue(size_t cq_id) { return *sw_command_queues_[cq_id]; } -void Device::push_work(std::function&& work, bool blocking) { - this->work_executor.push_work(work, blocking); -} - -void Device::push_work(std::shared_ptr> work, bool blocking) { - this->work_executor.push_work(work, blocking); +bool Device::can_use_passthrough_scheduling() const { + return this->work_executor.use_passthrough(); } void Device::synchronize() { diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 5ee266d37b7..7beb58f3ea8 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -257,8 +257,11 @@ class Device { friend bool CloseDevice(Device *device); // APIs to access this device's work executor - void push_work(std::function&& work, bool blocking = false); - void push_work(std::shared_ptr> work, bool blocking = false); + bool can_use_passthrough_scheduling() const; + template + void push_work(F&& work, bool blocking = false) { + this->work_executor.push_work(std::forward(work), blocking); + } void synchronize(); void set_worker_mode(const WorkExecutorMode& mode); void enable_async(bool enable); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 010ad9e4359..656c740ccee 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2789,62 +2789,6 @@ void EnqueueGetBufferAddr(CommandQueue& cq, uint32_t* dst_buf_addr, const Buffer .type = EnqueueCommandType::GET_BUF_ADDR, .blocking = blocking, .shadow_buffer = buffer, .dst = dst_buf_addr}); } -void EnqueueAllocateBufferImpl(AllocBufferMetadata alloc_md) { - Buffer* buffer = alloc_md.buffer; - uint32_t allocated_addr; - if (is_sharded(buffer->buffer_layout())) { - allocated_addr = allocator::allocate_buffer( - *(buffer->device()->allocator_), - buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(), - buffer->page_size(), - buffer->buffer_type(), - alloc_md.bottom_up, - buffer->num_cores()); - } else { - allocated_addr = allocator::allocate_buffer( - *(buffer->device()->allocator_), - buffer->size(), - buffer->page_size(), - buffer->buffer_type(), - alloc_md.bottom_up, - std::nullopt); - } - TT_ASSERT(allocated_addr <= std::numeric_limits::max()); - buffer->set_address(static_cast(allocated_addr)); -} - -void EnqueueAllocateBuffer(CommandQueue& cq, Buffer* buffer, bool bottom_up, bool blocking) { - auto alloc_md = AllocBufferMetadata{ - .buffer = buffer, - .allocator = *(buffer->device()->allocator_), - .bottom_up = bottom_up, - }; - 
cq.run_command(CommandInterface{ - .type = EnqueueCommandType::ALLOCATE_BUFFER, - .blocking = blocking, - .alloc_md = alloc_md, - }); -} - -void EnqueueDeallocateBufferImpl(AllocBufferMetadata alloc_md) { - allocator::deallocate_buffer(alloc_md.allocator, alloc_md.device_address, alloc_md.buffer_type); -} - -void EnqueueDeallocateBuffer( - CommandQueue& cq, Allocator& allocator, uint32_t device_address, BufferType buffer_type, bool blocking) { - // Need to explictly pass in relevant buffer attributes here, since the Buffer* ptr can be deallocated a this point - auto alloc_md = AllocBufferMetadata{ - .allocator = allocator, - .buffer_type = buffer_type, - .device_address = device_address, - }; - cq.run_command(CommandInterface{ - .type = EnqueueCommandType::DEALLOCATE_BUFFER, - .blocking = blocking, - .alloc_md = alloc_md, - }); -} - inline namespace v0 { void EnqueueReadBuffer( @@ -3173,13 +3117,13 @@ void CommandQueue::run_worker() { } } -void CommandQueue::run_command(const CommandInterface& command) { +void CommandQueue::run_command(CommandInterface&& command) { log_trace(LogDispatch, "{} received {} in {} mode", this->name(), command.type, this->mode); if (this->async_mode()) { if (std::hash{}(std::this_thread::get_id()) == parent_thread_id) { // In async mode when parent pushes cmd, feed worker through queue. - this->worker_queue.push(command); bool blocking = command.blocking.has_value() and *command.blocking; + this->worker_queue.push(std::move(command)); if (blocking) { TT_ASSERT(not this->trace_mode(), "Blocking commands cannot be traced!"); this->wait_until_empty(); @@ -3193,7 +3137,7 @@ void CommandQueue::run_command(const CommandInterface& command) { } } else if (this->trace_mode()) { // In trace mode push to the trace queue - this->worker_queue.push(command); + this->worker_queue.push(std::move(command)); } else if (this->passthrough_mode()) { this->run_command_impl(command); } else { @@ -3216,14 +3160,6 @@ void CommandQueue::run_command_impl(const CommandInterface& command) { TT_ASSERT(command.blocking.has_value(), "Must specify blocking value!"); EnqueueWriteBufferImpl(*this, command.buffer.value(), command.src.value(), command.blocking.value()); break; - case EnqueueCommandType::ALLOCATE_BUFFER: - TT_ASSERT(command.alloc_md.has_value(), "Must provide buffer allocation metdata!"); - EnqueueAllocateBufferImpl(command.alloc_md.value()); - break; - case EnqueueCommandType::DEALLOCATE_BUFFER: - TT_ASSERT(command.alloc_md.has_value(), "Must provide buffer allocation metdata!"); - EnqueueDeallocateBufferImpl(command.alloc_md.value()); - break; case EnqueueCommandType::GET_BUF_ADDR: TT_ASSERT(command.dst.has_value(), "Must provide a dst address!"); TT_ASSERT(command.shadow_buffer.has_value(), "Must provide a shadow buffer!"); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 821a806ff9c..64f6c5407b7 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -36,8 +36,6 @@ using RuntimeArgs = std::vector>; enum class EnqueueCommandType { ENQUEUE_READ_BUFFER, ENQUEUE_WRITE_BUFFER, - ALLOCATE_BUFFER, - DEALLOCATE_BUFFER, GET_BUF_ADDR, ADD_BUFFER_TO_PROGRAM, SET_RUNTIME_ARGS, @@ -577,8 +575,6 @@ class HWCommandQueue { std::variant, std::shared_ptr> buffer, HostDataType src, bool blocking); - friend void EnqueueAllocateBufferImpl(AllocBufferMetadata alloc_md); - friend void EnqueueDeallocateBufferImpl(AllocBufferMetadata alloc_md); friend void EnqueueGetBufferAddrImpl(void* dst_buf_addr, 
const Buffer* buffer); friend void EnqueueRecordEventImpl(CommandQueue& cq, const std::shared_ptr& event); friend void EnqueueWaitForEventImpl(CommandQueue& cq, const std::shared_ptr& event); @@ -637,7 +633,7 @@ class CommandQueue { // Schedule a command to be run on the device // Blocking if in passthrough mode. Non-blocking if in async mode - void run_command(const CommandInterface& command); + void run_command(CommandInterface&& command); // API for setting/getting the mode of the command queue void set_mode(const CommandQueueMode& mode); @@ -696,9 +692,6 @@ class CommandQueue { // Primitives used to place host only operations on the SW Command Queue. // These are used in functions exposed through tt_metal.hpp or host_api.hpp -void EnqueueAllocateBuffer(CommandQueue& cq, Buffer* buffer, bool bottom_up, bool blocking); -void EnqueueDeallocateBuffer( - CommandQueue& cq, Allocator& allocator, uint32_t device_address, BufferType buffer_type, bool blocking); void EnqueueGetBufferAddr(CommandQueue& cq, uint32_t* dst_buf_addr, const Buffer* buffer, bool blocking); void EnqueueSetRuntimeArgs( CommandQueue& cq, diff --git a/tt_metal/impl/dispatch/lock_free_queue.hpp b/tt_metal/impl/dispatch/lock_free_queue.hpp index 6c65b932606..1f99cafc427 100644 --- a/tt_metal/impl/dispatch/lock_free_queue.hpp +++ b/tt_metal/impl/dispatch/lock_free_queue.hpp @@ -64,7 +64,7 @@ class LockFreeQueue { parent_thread_id.store(other.parent_thread_id.load()); } - inline void push(const T& value) { + inline void push(T&& value) { // Legacy Push API allowing copy by value // for object T. @@ -74,7 +74,7 @@ class LockFreeQueue { // has progressed (data has been read). lock_func(); while(tail.load()->next == head.load()) {}; - tail.load()->data = std::make_shared(value); + tail.load()->data = std::make_shared(std::move(value)); tail.store(tail.load()->next); unlock_func(); } @@ -89,7 +89,7 @@ class LockFreeQueue { // has progressed (data has been read). lock_func(); while(tail.load()->next == head.load()) {}; - tail.load()->data = value; + tail.load()->data = std::move(value); tail.store(tail.load()->next); unlock_func(); } diff --git a/tt_metal/impl/dispatch/work_executor.hpp b/tt_metal/impl/dispatch/work_executor.hpp index d6dd8d3e4a1..c9389ac210f 100644 --- a/tt_metal/impl/dispatch/work_executor.hpp +++ b/tt_metal/impl/dispatch/work_executor.hpp @@ -138,37 +138,21 @@ class WorkExecutor { } } - inline void push_work(const std::function& work_executor, bool blocking = false) { - ZoneScopedN("PushWork"); - if (std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() or - not(this->worker_state == WorkerState::RUNNING)) { - // Worker is pushing to itself (nested work) or worker thread is not running. Execute work in current - // thread. - work_executor(); - } else { - // Push to worker queue. - this->worker_queue.push(work_executor); - { - std::lock_guard lock(this->cv_mutex); - cv.notify_one(); - } - if (blocking) { - this->synchronize(); - } - } + inline bool use_passthrough() const { + return std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() || + this->worker_state != WorkerState::RUNNING; } - inline void push_work(std::shared_ptr> work_executor, bool blocking = false) { - // Latest push API, passing ptrs around for work container. Usually faster, since no data-copies. 
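// A small usage sketch (illustrative, not from this diff) for the templated push_work path added
// below: any callable is forwarded into the worker queue, and it runs inline instead when
// use_passthrough()/can_use_passthrough_scheduling() is true (the caller is the worker thread
// itself, or the worker is not running). The helper name is hypothetical.
inline int example_push_work(Device* device) {
    int value = 0;
    device->push_work([&value] { value = 42; }, /*blocking=*/true);  // returns once the lambda ran
    return value;
}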
+ template + inline void push_work(F&& work_executor, bool blocking = false) { ZoneScopedN("PushWork"); - if (std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() or - not(this->worker_state == WorkerState::RUNNING)) { + if (use_passthrough()) { // Worker is pushing to itself (nested work) or worker thread is not running. Execute work in current // thread. - (*work_executor)(); + work_executor(); } else { // Push to worker queue. - this->worker_queue.push(work_executor); + this->worker_queue.push(std::forward(work_executor)); { std::lock_guard lock(this->cv_mutex); cv.notify_one(); @@ -220,11 +204,11 @@ class WorkExecutor { this->worker_queue_mode = mode; } - WorkerQueueMode get_worker_queue_mode() { return worker_queue_mode; } + WorkerQueueMode get_worker_queue_mode() const { return worker_queue_mode; } - inline std::thread::id get_parent_thread_id() { return this->worker_queue.parent_thread_id; } + inline std::thread::id get_parent_thread_id() const { return this->worker_queue.parent_thread_id; } - inline std::thread::id get_worker_thread_id() { return this->worker_queue.worker_thread_id; } + inline std::thread::id get_worker_thread_id() const { return this->worker_queue.worker_thread_id; } private: std::thread worker_thread; diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index ffb8fb6a499..fba32d32c69 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -761,7 +761,7 @@ void Program::populate_dispatch_data(Device *device) { if (binaries_data.size() > 0) { // We allocate program binaries top down to minimize fragmentation with other buffers in DRAM, which are typically allocated bottom up - this->kernels_buffer = std::make_shared( + this->kernels_buffer = Buffer::create( device, binaries_data.size() * sizeof(uint32_t), HostMemDeviceCommand::PROGRAM_PAGE_SIZE, BufferType::DRAM, TensorMemoryLayout::INTERLEAVED, std::nullopt, false); this->program_transfer_info.binary_data = binaries_data; diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index 4a84a4aa484..aaeea3d05b7 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -85,7 +85,7 @@ void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr tra cq.device()->trace_buffers_size <= cq.device()->allocator_->config.trace_region_size, "Creating trace buffers of size {}B on device {}, but only {}B is allocated for trace region.", cq.device()->trace_buffers_size, cq.device()->id(), cq.device()->allocator_->config.trace_region_size); // Commit trace to device DRAM - trace_buffer->buffer = std::make_shared( + trace_buffer->buffer = Buffer::create( cq.device(), padded_size, page_size, BufferType::TRACE, TensorMemoryLayout::INTERLEAVED); EnqueueWriteBuffer(cq, trace_buffer->buffer, trace_data, kBlocking); log_trace( diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index fa7b1972d51..fca037ad2a1 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -378,7 +378,7 @@ void WriteToDeviceSharded(Buffer &buffer, const std::vector &host_buff buffer.size()); uint32_t page_size = buffer.page_size(); - TT_ASSERT(buffer.size() % page_size == 0); + TT_ASSERT(page_size == 0 ? 
buffer.size() == 0 : buffer.size() % page_size == 0); static constexpr uint32_t bytes_per_page_entry = sizeof(uint32_t); TT_ASSERT(page_size % bytes_per_page_entry == 0); @@ -412,12 +412,7 @@ void WriteToDeviceInterleavedContiguous(const Buffer &buffer, const std::vector< buffer.size()); uint32_t page_size = buffer.page_size(); - TT_FATAL( - buffer.size() % page_size == 0, - "Invalid buffer size: {}. Buffer size must be a multiple of page size {}.", - buffer.size(), - page_size); - uint32_t num_pages = buffer.size() / page_size; + uint32_t num_pages = buffer.num_pages(); static constexpr uint32_t bytes_per_page_entry = sizeof(uint32_t); TT_FATAL(page_size % bytes_per_page_entry == 0, @@ -481,12 +476,7 @@ void WriteToBuffer(Buffer &buffer, const std::vector &host_buffer) { void ReadFromDeviceInterleavedContiguous(const Buffer &buffer, std::vector &host_buffer) { host_buffer.clear(); // overwrite the data uint32_t page_size = buffer.page_size(); - TT_FATAL( - buffer.size() % page_size == 0, - "Invalid buffer size: {}. Buffer size must be a multiple of page size {}.", - buffer.size(), - page_size); - uint32_t num_pages = buffer.size() / page_size; + uint32_t num_pages = buffer.num_pages(); auto device = buffer.device(); auto num_banks = device->num_banks(buffer.buffer_type()); @@ -811,13 +801,35 @@ void CompileProgram(Device *device, Program &program, bool fd_bootloader_mode) { program.compile(device, fd_bootloader_mode); } -void AllocateBuffer(Buffer *buffer, bool bottom_up) { +DeviceAddr AllocateBuffer(const Buffer *buffer, bool bottom_up) { if(GraphTracker::instance().hook_allocate(buffer, bottom_up)) { GraphTracker::instance().track_allocate(buffer, bottom_up); - return; + return 0; } - EnqueueAllocateBuffer(buffer->device()->command_queue(), buffer, bottom_up, false); + + uint32_t allocated_addr; + if (is_sharded(buffer->buffer_layout())) { + allocated_addr = allocator::allocate_buffer( + *(buffer->device()->allocator_), + buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(), + buffer->page_size(), + buffer->buffer_type(), + bottom_up, + buffer->num_cores()); + } else { + allocated_addr = allocator::allocate_buffer( + *(buffer->device()->allocator_), + buffer->size(), + buffer->page_size(), + buffer->buffer_type(), + bottom_up, + std::nullopt); + } + TT_ASSERT(allocated_addr <= std::numeric_limits::max()); + GraphTracker::instance().track_allocate(buffer, bottom_up); + + return allocated_addr; } void DeallocateBuffer(Buffer *buffer) { @@ -825,12 +837,8 @@ void DeallocateBuffer(Buffer *buffer) { if(GraphTracker::instance().hook_deallocate(buffer)) { return; } - EnqueueDeallocateBuffer( - buffer->device()->command_queue(), - *(buffer->device()->allocator_), - buffer->address(), - buffer->buffer_type(), - false); + + allocator::deallocate_buffer(*buffer->device()->allocator_, buffer->address(), buffer->buffer_type()); } } // namespace detail @@ -1045,20 +1053,19 @@ uint32_t CreateSemaphore( } std::shared_ptr CreateBuffer(const InterleavedBufferConfig &config) { - return std::make_shared( - config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt, config.allocate); + return Buffer::create( + config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt); } std::shared_ptr CreateBuffer(const ShardedBufferConfig &config) { - return std::make_shared( + return Buffer::create( config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, 
config.shard_parameters, - std::nullopt, - config.allocate); + std::nullopt); } void DeallocateBuffer(Buffer &buffer) { buffer.deallocate(); } diff --git a/ttnn/cpp/ttnn/async_runtime.cpp b/ttnn/cpp/ttnn/async_runtime.cpp index 29845f66316..814db64804b 100644 --- a/ttnn/cpp/ttnn/async_runtime.cpp +++ b/ttnn/cpp/ttnn/async_runtime.cpp @@ -8,67 +8,6 @@ #include "ttnn/tensor/tensor_impl_wrapper.hpp" namespace ttnn { -using DeviceBuffer = std::shared_ptr; -using queue_id = uint8_t; - -DeviceBuffer allocate_interleaved_buffer_on_device( - size_t buffer_size_bytes, - Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const MemoryConfig& memory_config, - const std::optional& tile) { - uint32_t page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); -} - -DeviceBuffer allocate_contiguous_buffer_on_device( - size_t buffer_size_bytes, Device* device, const MemoryConfig& memory_config) { - return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); -} - -DeviceBuffer allocate_sharded_buffer_on_device( - size_t buffer_size_bytes, - Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const ShardSpecBuffer& shard_params, - const MemoryConfig& memory_config, - const std::optional& tile) { - tt::tt_metal::tensor_impl::validate_sharded_buffer_allocation( - shape, layout, data_type, shard_params, memory_config, tile); - const auto& page_shape = shard_params.page_shape; - uint32_t size_of_element = tt::tt_metal::tensor_impl::element_size_bytes(data_type); - uint32_t page_size = page_shape[0] * page_shape[1] * size_of_element; - if (layout == Layout::TILE) { - page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - } - - return std::make_shared( - device, buffer_size_bytes, page_size, memory_config.buffer_type, memory_config.memory_layout, shard_params); -} - -DeviceBuffer allocate_buffer_on_device( - size_t buffer_size_bytes, - types::Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const MemoryConfig& memory_config, - const std::optional& shard_spec, - const std::optional& tile) { - if (memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { - return allocate_interleaved_buffer_on_device( - buffer_size_bytes, device, shape, data_type, layout, memory_config, tile); - } else if (memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::SINGLE_BANK) { - return allocate_contiguous_buffer_on_device(buffer_size_bytes, device, memory_config); - } else { - return allocate_sharded_buffer_on_device( - buffer_size_bytes, device, shape, data_type, layout, shard_spec.value(), memory_config, tile); - } -} void write_buffer( queue_id cq_id, diff --git a/ttnn/cpp/ttnn/async_runtime.hpp b/ttnn/cpp/ttnn/async_runtime.hpp index 2be1296bc56..5700f688a66 100644 --- a/ttnn/cpp/ttnn/async_runtime.hpp +++ b/ttnn/cpp/ttnn/async_runtime.hpp @@ -9,11 +9,8 @@ #include "types.hpp" namespace ttnn { - using DeviceBuffer = std::shared_ptr; using queue_id = uint8_t; - DeviceBuffer allocate_buffer_on_device(size_t buffer_size_bytes, types::Device* device, const ttnn::SimpleShape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config, const std::optional& shard_spec = std::nullopt, const std::optional& tile = std::nullopt); - void 
write_buffer(queue_id cq_id, Tensor& dst, std::vector> src, const std::optional transfer_size = std::nullopt); void read_buffer(queue_id cq_id, Tensor& src, std::vector> dst, const std::optional transfer_size = std::nullopt, size_t src_offset = 0, bool blocking = true); diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp index 76a2668a9dc..bebeadebd9d 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.cpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp @@ -90,7 +90,7 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) { end_function_any_map[typeid(std::reference_wrapper)] = [ptr = this] (const std::any& val) mutable {ptr->end_function_process_tensor(val);}; } -void GraphProcessor::track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) { +void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { const std::lock_guard lock(mutex); auto buf_id = add_buffer(buffer); @@ -299,7 +299,7 @@ int GraphProcessor::add_tensor(const Tensor& t) { return tensor_counter; } -int GraphProcessor::add_buffer(tt::tt_metal::Buffer* buffer) { +int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) { auto buffer_alloc_id = reinterpret_cast(buffer); auto counter = id_to_counter.count(buffer_alloc_id) > 0 ? id_to_counter[buffer_alloc_id] : graph.size(); if (id_to_counter.count(buffer_alloc_id) == 0) { @@ -478,7 +478,7 @@ nlohmann::json GraphProcessor::end_graph_capture() { return res; } -bool ProcessorHooks::hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) { +bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { return do_block; } diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp index fa614e5d63a..4f7d6f1b6e7 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.hpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp @@ -22,7 +22,7 @@ namespace ttnn::graph { public: ProcessorHooks() = default; - bool hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) override; + bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; bool hook_deallocate(tt::tt_metal::Buffer* buffer) override; @@ -40,7 +40,7 @@ namespace ttnn::graph { GraphProcessor(tt::tt_metal::IGraphProcessor::RunMode mode); ~GraphProcessor() override; - void track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) override; + void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; void track_deallocate(tt::tt_metal::Buffer* buffer) override; @@ -80,7 +80,7 @@ namespace ttnn::graph { std::unordered_map end_function_any_map; int add_tensor(const Tensor& t); - int add_buffer(tt::tt_metal::Buffer* buffer); + int add_buffer(const tt::tt_metal::Buffer* buffer); void begin_function_process_ref_vector(const std::any& any_val); void begin_function_process_ref_vector_optional(const std::any& any_val); diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp index 89cca953ca9..6330db7c537 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp @@ -48,7 +48,7 @@ uint32_t EriscDatamoverConfig::compute_buffer_size(std::size_t num_edm_channels, log_trace(tt::LogOp, "Buffer size: {}", buffer_size); - TT_ASSERT(buffer_size > 0 && buffer_size % page_size == 0); + TT_ASSERT(page_size == 0 ? 
buffer_size == 0 : buffer_size % page_size == 0); return buffer_size; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index 6d04e833e2d..b8d2756443d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -182,7 +182,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); Device *device = a.device(); - auto dst_buffer_l1 = Buffer(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); + auto dst_buffer_l1 = Buffer::create(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); // construct const buffer with the pad_value uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 @@ -243,7 +243,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); tt::log_debug("packed_pad_value: {}", packed_pad_value); - tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1.address()); + tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1->address()); } #endif @@ -263,7 +263,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, pad_value_const_tensor_addr, pad_value_const_buffer_nbytes, packed_pad_value, - dst_buffer_l1.address()}; + dst_buffer_l1->address()}; tt::tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, diff --git a/ttnn/cpp/ttnn/run_operation_inl.hpp b/ttnn/cpp/ttnn/run_operation_inl.hpp index df2025b17c5..db57a4e7d2f 100644 --- a/ttnn/cpp/ttnn/run_operation_inl.hpp +++ b/ttnn/cpp/ttnn/run_operation_inl.hpp @@ -247,8 +247,8 @@ void launch_op( }); for (auto target_device : workers) { - target_device->push_work(std::make_shared>( - [target_device, work_lambda]() mutable { (*work_lambda)(target_device); })); + target_device->push_work( + [target_device, work_lambda]() mutable { (*work_lambda)(target_device); }); } } diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index 2e57a99e1b6..d689237d5e9 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -261,7 +261,7 @@ void Tensor::deallocate(bool force) { : this->tensor_attributes->main_thread_ref_count; if ((force or ref_count_to_use == 1) and not this->tensor_attributes->deallocated) { this->tensor_attributes->deallocated = true; - this->workers.at(0)->push_work(std::make_shared>( + this->workers.at(0)->push_work( [force, attr = this->tensor_attributes]() mutable { // Cross worker synchronization: If the tensor being deallocated is shared across // workers (ex: all_gather op), wait until all workers are done with this tensor @@ -297,7 +297,7 @@ void Tensor::deallocate(bool force) { } }, attr->storage); - })); + }); } } else { TT_FATAL( @@ -339,8 +339,8 @@ void Tensor::deallocate(bool force) { }); for (auto worker : this->workers) { - worker->push_work(std::make_shared>( - [worker, dealloc_lambda]() mutable { (*dealloc_lambda)(worker); })); + worker->push_work( + [worker, dealloc_lambda]() mutable { (*dealloc_lambda)(worker); }); } } } else { diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index b3a013e1b38..bea72879c2d 100644 --- 
a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -45,7 +45,6 @@ uint32_t element_size_bytes(DataType dtype) { } uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, const ttnn::SimpleShape& shape, const std::optional& tile) { - uint32_t W = shape[-1]; uint32_t page_size = 0; const auto tile_HW = tile.has_value() ? tile->get_tile_hw() : constants::TILE_HW; const auto bfloat8b_tile_HW = tile.has_value() ? tile_HW + 64 : constants::BFLOAT8_B_TILE_HW; @@ -53,6 +52,7 @@ uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, switch (layout) { case Layout::ROW_MAJOR: { uint32_t size_of_element = element_size_bytes(dtype); + uint32_t W = shape.rank() == 0 ? 1 : shape[-1]; page_size = W * size_of_element; } break; case Layout::TILE: { @@ -82,11 +82,10 @@ uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, } break; default: TT_ASSERT(false && "Unsupported data type!"); } - TT_ASSERT(total_size_bytes % page_size == 0); + TT_ASSERT(page_size == 0 ? total_size_bytes == 0 : total_size_bytes % page_size == 0); } break; default: TT_ASSERT(false && "Unsupported layout to write to device"); } - TT_ASSERT(page_size != 0); return page_size; } @@ -191,12 +190,12 @@ DeviceBuffer allocate_interleaved_buffer_on_device( const MemoryConfig& memory_config, const std::optional& tile) { uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); + return Buffer::create(device, buffer_size_bytes, page_size, memory_config.buffer_type); } DeviceBuffer allocate_contiguous_buffer_on_device( size_t buffer_size_bytes, Device* device, const MemoryConfig& memory_config) { - return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); + return Buffer::create(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); } DeviceBuffer allocate_sharded_buffer_on_device( @@ -212,7 +211,7 @@ DeviceBuffer allocate_sharded_buffer_on_device( const auto& page_shape = ttnn::SimpleShape(shard_params.page_shape); uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, page_shape, tile); - return std::make_shared( + return Buffer::create( device, buffer_size_bytes, page_size, memory_config.buffer_type, memory_config.memory_layout, shard_params); } diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index c8c9e0bd782..e63a37fb791 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -26,9 +26,8 @@ namespace{ inline void SynchronizeWorkerThreads(const std::vector& workers) { // Push empty work to threads and ensure its been picked up - static auto empty_work = std::make_shared>([](){}); for (auto target_device : workers) { - target_device->work_executor.push_work(empty_work); + target_device->work_executor.push_work([](){}); } // Block until work has been picked up, to flush the queue for (auto target_device : workers) { diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index 77b01638c11..4364414bc76 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -400,7 +400,7 @@ struct DeviceStorage { static constexpr auto attribute_names = std::forward_as_tuple("memory_config"); const auto attribute_values() const { return std::make_tuple(this->memory_config()); } - inline bool is_allocated() const { return buffer && 
buffer->size() > 0; } + inline bool is_allocated() const { return buffer && buffer->is_allocated(); } }; using BorrowedBuffer = std::variant< @@ -844,7 +844,7 @@ struct MultiDeviceStorage { return std::all_of( ordered_device_ids.begin(), ordered_device_ids.end(), [&buffers = this->buffers](auto &&device_id) { const auto &buffer = buffers.at(device_id); - return buffer && buffer->size() > 0; + return buffer && buffer->is_allocated(); }); } }; diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp index 615af9aa442..72e212d1c21 100644 --- a/ttnn/cpp/ttnn/types.hpp +++ b/ttnn/cpp/ttnn/types.hpp @@ -60,77 +60,7 @@ struct CoreGrid { } }; -// Keep track of live buffers and the device addresses they were assigned. -// When a buffer is created, it is provided a buffer_id using get_buf_id(). -// The address for this buffer is assigned to buffer_id when the buffer is asynchronously allocated. -// When the buffer destructor is called, or the buffer is asynchronously deallocated, the worker thread -// will look up the address for buffer_id to free memory on device. -class buffer_address_map { - public: - void insert(uint32_t buf_id, uint32_t buf_addr) { - std::scoped_lock lock(this->map_mutex); - this->buf_id_to_address_map.insert({buf_id, buf_addr}); - } - void erase(uint32_t buf_id) { - std::scoped_lock lock(this->map_mutex); - this->buf_id_to_address_map.erase(buf_id); - } - uint32_t buffer_address(uint32_t buf_id) { - std::scoped_lock lock(this->map_mutex); - return this->buf_id_to_address_map.at(buf_id); - } - uint32_t get_buf_id() { - return buf_id++; - } - - private: - std::atomic buf_id = 0; - std::mutex map_mutex; - std::unordered_map buf_id_to_address_map = {}; -}; - -inline buffer_address_map GLOBAL_BUFFER_ADDRESS_MAP; - -// This buffer class is compatible with multithreaded runtime (which lives in tt_eager) -// It is derived from the tt_metal::Buffer class, but defines its own asynchronous allocation functions -class Buffer : public tt::tt_metal::Buffer { - public: - Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type, - const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, - std::optional shard_parameters = std::nullopt, std::optional bottom_up = std::nullopt - ) : tt::tt_metal::Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, false) { - this->buffer_id = GLOBAL_BUFFER_ADDRESS_MAP.get_buf_id(); // Each buffer has a unique ID - this->allocate(); - } - ~Buffer() { - this->deallocate(); - } - private: - uint32_t buffer_id = 0; - void allocate() { - TT_ASSERT(this->device()); - this->device()->push_work([this] () mutable { - bool bottom_up = this->bottom_up_.value_or(this->is_dram()); - tt::tt_metal::detail::AllocateBuffer(this, bottom_up); - // The address inserted here, will be used during asynchronous deallocate - GLOBAL_BUFFER_ADDRESS_MAP.insert(this->buffer_id, this->address()); - - }); - } - void deallocate() { - if (this->device() == nullptr or not this->device()->initialized_ or this->size() == 0) { - return; - } - this->set_size(0); - TT_ASSERT(this->device()->allocator_ != nullptr, "Expected allocator to be initialized!"); - // Extract the required buffer attributes from main thread (these are guaranteed to be correctly populated) and send to worker - this->device()->push_work([dev = this->device(), id = this->buffer_id, type = this->buffer_type()] () mutable { - // At this point, the address for this buffer has made it to GLOBAL_BUFFER_ADDRESS_MAP, since the worker has allocated 
the buffer.
- tt::tt_metal::allocator::deallocate_buffer(*(dev->allocator_), GLOBAL_BUFFER_ADDRESS_MAP.buffer_address(id), type);
- GLOBAL_BUFFER_ADDRESS_MAP.erase(id);
- });
- }
-};
+using Buffer = tt::tt_metal::Buffer;
 static std::ostream &operator<<(std::ostream &os, const CoreGrid &core_grid) {
 os << "ttnn.CoreGrid(x=" <
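A minimal usage sketch of the post-patch Buffer interface follows. Only Buffer::create, address(), deallocate(), BufferType, TensorMemoryLayout, and the plain-lambda overload of push_work are taken from the hunks above; the header paths and the CreateDevice/CloseDevice bring-up calls are assumptions added for illustration, not part of this patch.

#include <memory>

#include "tt_metal/host_api.hpp"              // assumed location of CreateDevice/CloseDevice
#include "tt_metal/impl/buffers/buffer.hpp"   // Buffer::create, per the hunks above

using namespace tt::tt_metal;

int main() {
    Device* device = CreateDevice(0);  // assumed bring-up call

    // Buffers are created through the factory and held via shared_ptr; the
    // separate ttnn::Buffer subclass and GLOBAL_BUFFER_ADDRESS_MAP are gone.
    std::shared_ptr<Buffer> buffer = Buffer::create(
        device,
        /*size=*/4096,
        /*page_size=*/1024,
        BufferType::DRAM,
        TensorMemoryLayout::INTERLEAVED);

    // Work is pushed to the device worker as a plain callable; no
    // std::shared_ptr<std::function<void()>> wrapper is needed anymore.
    device->push_work([buffer]() {
        // e.g. read the allocated address from the worker thread
        (void)buffer->address();
    });

    buffer->deallocate();   // releases the device allocation
    CloseDevice(device);    // assumed teardown call
    return 0;
}

Note also that the relaxed asserts in WriteToDeviceSharded, get_page_size, and EriscDatamoverConfig::compute_buffer_size (page_size == 0 ? size == 0 : size % page_size == 0) are what allow these paths to accept the new zero-volume buffers without a special case.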