Commit af83135
#0: Unify tt_metal::Buffer and ttnn::Buffer, add support for 0-volume buffers (#14023)

* #0: Buffer refactoring wip

* #0: Buffer threading change

* #0: Buffer fixup

* #0: Buffer fixup

* #0: Add experimental asserts

* #0: Refactor

* #0: Test fixup

* #0: Review fixes

* #0: Review fixes

* #0: Review fixes

* #0: Handle 0 page size

* #0: Add a test for creating/reading/writing 0-sized tensor

* #0: Add support for more 0-sized shapes

* #0: Review fixes

* #0: Refactor allocation/deallocation threading to allow getting buffer address from another thread

* #0: Treat empty shape as scalar

* #0: Fix trailing comma

* #0: Add missing include

* #0: Allow 0 page size

* #0: Rewrite threading, no mutexes

* #0: Assert fix

* #0: Fixup

* #0: Remove unneeded include

* #0: Cleanup, fix typo

* #0: Cleanup

* #0: Trying to improve perf

* #0: Trying to improve perf

* #0: Build fix

* #0: Simpler threading, get rid of canceling allocation feature

* #0: Fixup

* #0: Avoid using std::atomic::wait, it seems to have perf implications

---------

Co-authored-by: Artem Yerofieiev <[email protected]>
sminakov-tt and ayerofieiev-tt authored Oct 24, 2024
1 parent 9e8d1b5 commit af83135
Showing 50 changed files with 593 additions and 779 deletions.
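The diffs below migrate call sites from constructing tt_metal::Buffer directly (by value, or through std::make_unique/std::make_shared) to the new Buffer::create factory, which returns a std::shared_ptr<Buffer>; callers then use -> for member access and dereference the pointer where an API still takes a Buffer&. A minimal before/after sketch, not taken verbatim from any one file: the sizes, the src_vec vector, and the device pointer are illustrative, and the tt_metal headers used by the tests below are assumed.

// Before this commit: Buffer is constructed by value and passed by reference.
tt_metal::Buffer l1_buffer(device, /*size=*/2048, /*page_size=*/2048, tt_metal::BufferType::L1);
uint32_t addr = l1_buffer.address();
tt_metal::detail::WriteToBuffer(l1_buffer, src_vec);

// After this commit: Buffer::create returns std::shared_ptr<Buffer>.
auto l1_buffer = tt_metal::Buffer::create(device, /*size=*/2048, /*page_size=*/2048, tt_metal::BufferType::L1);
uint32_t addr = l1_buffer->address();                   // -> instead of .
tt_metal::detail::WriteToBuffer(*l1_buffer, src_vec);   // dereference where a Buffer& is expected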
46 changes: 29 additions & 17 deletions tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -25,6 +25,18 @@ using namespace tt;
using namespace tt_metal;
using namespace constants;

namespace {
uint32_t get_device_buffer_address(const Tensor& tensor) {
TT_FATAL(std::holds_alternative<DeviceStorage>(tensor.get_storage()), "Tensor storage is not DeviceStorage");
auto buffer = std::get<DeviceStorage>(tensor.get_storage()).buffer;
uint32_t result = 0;
buffer->device()->push_work([&]() {
result = buffer->address();
}, true);
return result;
}
}

TEST_F(CommonFixture, TestTensorOwnershipSanity) {
// Sanity test tensor read, write and update paths with synchronous
// Ensure that tensor data is copied and owned as expected
@@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) {
Tensor output_tensor_host = output_tensor_device_2.cpu();
// Test tensor deallocation in async mode: deallocate tensors after using them
if (i == 0) {
input_a_addr = std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address();
input_b_addr = std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address();
input_c_addr = std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address();
output_1_addr = std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address();
output_2_addr = std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address();
input_a_addr = get_device_buffer_address(input_tensor_a);
input_b_addr = get_device_buffer_address(input_tensor_b);
input_c_addr = get_device_buffer_address(input_tensor_c);
output_1_addr = get_device_buffer_address(output_tensor_device);
output_2_addr = get_device_buffer_address(output_tensor_device_2);
} else {
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address(), input_a_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address(), input_b_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address(), input_c_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address(), output_1_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr);
}
input_tensor_a.deallocate();
input_tensor_b.deallocate();
@@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
Tensor tensor2 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address();
uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2);
// Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2
// deallocated
tensor2 = tensor1;
@@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
// prev addr for tensor2
Tensor tensor3 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr);
EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address());
EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr);
EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2));
}
log_info(LogTest, "Testing Device tensor self-assignment through function");
for (int i = 0; i < 5; i++) {
Tensor device_tensor =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t device_tensor_address = device_tensor.device_buffer()->address();
uint32_t device_tensor_address = get_device_buffer_address(device_tensor);
// This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor
// Ensure ref count and address remain unchanged
device_tensor = tensor_identity_copy_function(device_tensor);
EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1);
EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address);
EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address);
}

log_info(LogTest, "Testing Device tensor move assignment");
@@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
log_info(LogTest, "Testing Device tensor self-assignment");
Tensor tensor_to_self_assign =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(0), DataType::BFLOAT16).to(device);
uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address();
uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign);
tensor_to_self_assign = tensor_to_self_assign;
EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1);
tensor_to_self_assign = std::move(tensor_to_self_assign);
EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address);
EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address);
auto barrier_tensor = tensor_to_self_assign.cpu();
device->enable_async(false);
}
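The get_device_buffer_address helper added at the top of this test file reflects the threading refactor called out in the commit message ("allow getting buffer address from another thread"): instead of reading buffer->address() on the calling thread, the read is pushed to the buffer's device worker thread, and the second argument (true) blocks until it has run. A hedged sketch of the same pattern for a plain buffer handle, not part of this commit, assuming the push_work(callable, blocking) overload used above:

// Hypothetical generalization of the test helper above: read a buffer's
// address on its device's worker thread and wait for the result.
uint32_t get_buffer_address(const std::shared_ptr<tt_metal::Buffer>& buffer) {
    uint32_t address = 0;
    buffer->device()->push_work([&]() { address = buffer->address(); }, /*blocking=*/true);
    return address;
}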
@@ -126,7 +126,7 @@ int main(int argc, char** argv) {

// limit size of the L1 buffer to do not exceed global L1 size
uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 256 : num_tiles) * page_size;
auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);
auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);

////////////////////////////////////////////////////////////////////////////
// Application Setup
@@ -166,7 +166,7 @@ int main(int argc, char** argv) {
for (int j = 0; j < num_cores_c; j++) {
CoreCoord core = {(std::size_t)j, (std::size_t)i};
uint32_t core_index = i * num_cores_c + j;
uint32_t l1_buffer_addr = l1_buffer.address();
uint32_t l1_buffer_addr = l1_buffer->address();

const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c};
SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args);
@@ -77,14 +77,14 @@ int main(int argc, char** argv) {
log_error(tt::LogTest, "Command line arguments found exception", e.what());
}

TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);
TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);

// Device setup
int device_id = 0;
tt_metal::Device* device = tt_metal::CreateDevice(device_id);

// Application setup
auto buffer = tt_metal::Buffer(
auto buffer = tt_metal::Buffer::create(
device, transfer_size, page_size, buffer_type == 0 ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1);

std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
// Execute application
if (!skip_write) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false);
EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false);
Finish(device->command_queue());
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
@@ -119,7 +119,7 @@ int main(int argc, char** argv) {

if (!skip_read) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true);
EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true);
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));
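The relaxed TT_ASSERT earlier in this file encodes the zero-volume rule this commit introduces: a page size of 0 is only valid for a 0-byte transfer, while any non-zero page size must still evenly divide the transfer size. Restated as a standalone predicate for illustration only (the helper name is hypothetical, not part of the commit):

// Mirrors the assert condition above: zero-volume transfers require page_size == 0,
// otherwise the transfer must be a whole number of pages.
bool transfer_size_is_valid(uint32_t transfer_size, uint32_t page_size) {
    return page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0;
}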
@@ -192,7 +192,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
input_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
tt_metal::Buffer input_buffer(
auto input_buffer = Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
num_tiles_per_core_group_1,
num_tiles_per_core_group_2,
kernel,
input_buffer.address(),
input_buffer->address(),
num_reqs_at_a_time,
single_tile_size,
tile_format);
@@ -221,7 +221,7 @@ int main(int argc, char **argv) {
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
if (access_type == 0) {
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
} else {
for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) {
CoreCoord core = {i / num_cores_y, i % num_cores_y};
@@ -276,7 +276,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
num_cores_y,
@@ -673,18 +673,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address());
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
@@ -713,7 +713,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_cores_list,
@@ -869,18 +869,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address());
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
@@ -909,7 +909,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_l1_writer_cores_ordered,
@@ -189,7 +189,7 @@ int main(int argc, char **argv) {
activations_addr,
activations_addr / 1024,
Nt);
std::vector<tt_metal::Buffer> l1_buffers;
std::vector<std::shared_ptr<tt_metal::Buffer>> l1_buffers;

int l1_buffers_size = 1;
if (!(single_read || one_buffer_share)) {
@@ -199,8 +199,8 @@ int main(int argc, char **argv) {
l1_buffers.reserve(l1_buffers_size);
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1);
tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);
l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1));
tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);

if (single_read || one_buffer_share)
break;
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
std::vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec);
tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec);
auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);

if (print_tensor) {
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
CoreCoord core = {(size_t)c, (size_t)r};

int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c);
auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address();
auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address();

uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0);

@@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {

uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device);

std::vector<std::unique_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<Buffer>> buffers;
int alloc_size_idx = 0;
uint32_t total_buffer_size = 0;
while (total_size_bytes < alloc_limit) {
@@ -44,7 +44,7 @@
if (total_buffer_size + buffer_size >= alloc_limit) {
break;
}
std::unique_ptr<tt::tt_metal::Buffer> buffer = std::make_unique<tt::tt_metal::Buffer>(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
buffers.emplace_back(std::move(buffer));
total_buffer_size += buffer_size;
EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size);
@@ -430,17 +430,17 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi
auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);

auto activation_buffer = std::make_shared<Buffer>(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
auto activation_buffer = Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
pass &= move_tiles_to_dram(device, activations, M, K, activation_buffer);
auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);

auto weight_buffer = std::make_shared<Buffer>(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
auto weight_buffer = Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
pass &= move_tiles_to_dram(device, weights, K, N, weight_buffer);
log_debug(LogTest, "Copying inputs to dram complete");

auto out_buffer = std::make_shared<Buffer>(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM);
auto out_buffer = Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM);
uint32_t out_dram_addr = out_buffer->address();

log_debug(LogTest, "Writing kernel runtime args to device");
(Remaining file diffs not loaded in this view.)