From af831359ccc9afd2523128b570a828d7259cf549 Mon Sep 17 00:00:00 2001
From: Stanislav Minakov
Date: Thu, 24 Oct 2024 09:39:56 -0700
Subject: [PATCH] #0: Unify `tt_metal::Buffer` and `ttnn::Buffer`, add support for 0-volume buffers (#14023)

* #0: Buffer refactoring wip

* #0: Buffer threading change

* #0: Buffer fixup

* #0: Buffer fixup

* #0: Add experimental asserts

* #0: Refactor

* #0: Test fixup

* #0: Review fixes

* #0: Review fixes

* #0: Review fixes

* #0: Handle 0 page size

* #0: Add a test for creating/reading/writing 0-sized tensor

* #0: Add support for more 0-sized shapes

* #0: Review fixes

* #0: Refactor allocation/deallocation threading to allow getting buffer address from another thread

* #0: Treat empty shape as scalar

* #0: Fix trailing comma

* #0: Add missing include

* #0: Allow 0 page size

* #0: Rewrite threading, no mutexes

* #0: Assert fix

* #0: Fixup

* #0: Remove unneeded include

* #0: Cleanup, fix typo

* #0: Cleanup

* #0: Trying to improve perf

* #0: Trying to improve perf

* #0: Build fix

* #0: Simpler threading, get rid of canceling allocation feature

* #0: Fixup

* #0: Avoid using std::atomic::wait, it seems to have perf implications

---------

Co-authored-by: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com>
---
 .../tensors/test_async_tensor_apis.cpp            |  46 ++-
 .../2_noc_rtor/test_noc_rtor.cpp                  |   4 +-
 .../3_pcie_transfer/test_rw_buffer.cpp            |   8 +-
 .../6_dram_offchip/test_dram_offchip.cpp          |   8 +-
 .../test_dram_read.cpp                            |   8 +-
 .../test_dram_read_l1_write.cpp                   |   8 +-
 .../old/noc/test_noc_read_global_l1.cpp           |  10 +-
 .../allocator/test_l1_banking_allocator.cpp       |   4 +-
 .../matmul/test_matmul_multi_core_X_dram.cpp      |   6 +-
 .../command_queue_test_utils.hpp                  |   6 +-
 .../command_queue/test_EnqueueProgram.cpp         |   4 +-
 .../command_queue/test_EnqueueTrace.cpp           |  76 ++--
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp     | 104 ++---
 .../command_queue/test_HostAsyncCQ.cpp            |  35 +-
 .../command_queue/test_events.cpp                 |   4 +-
 .../command_queue/test_EnqueueTrace.cpp           |  46 +--
 .../test_EnqueueWaitForEvent.cpp                  |   6 +-
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp     |   4 +-
 .../gtests/tensor/test_create_tensor.cpp          |   9 +-
 .../unit_tests/gtests/test_async_runtime.cpp      |  12 +-
 .../unit_tests/gtests/test_ccl_on_galaxy.cpp      |   4 +-
 .../gtests/test_multi_cq_multi_dev.cpp            |   6 +-
 .../gtests/test_multiprod_queue.cpp               |   6 +-
 tt_metal/detail/tt_metal.hpp                      |   2 +-
 tt_metal/detail/util.hpp                          |   6 +-
 tt_metal/graph/graph_tracking.cpp                 |   4 +-
 tt_metal/graph/graph_tracking.hpp                 |   8 +-
 tt_metal/impl/buffers/buffer.cpp                  | 358 +++++++++++-------
 tt_metal/impl/buffers/buffer.hpp                  | 181 ++++-----
 tt_metal/impl/device/device.cpp                   |   8 +-
 tt_metal/impl/device/device.hpp                   |   7 +-
 tt_metal/impl/dispatch/command_queue.cpp          |  70 +---
 tt_metal/impl/dispatch/command_queue.hpp          |   9 +-
 tt_metal/impl/dispatch/lock_free_queue.hpp        |   6 +-
 tt_metal/impl/dispatch/work_executor.hpp          |  38 +-
 tt_metal/impl/program/program.cpp                 |   2 +-
 tt_metal/impl/trace/trace.cpp                     |   2 +-
 tt_metal/tt_metal.cpp                             |  61 +--
 ttnn/cpp/ttnn/async_runtime.cpp                   |  61 ---
 ttnn/cpp/ttnn/async_runtime.hpp                   |   3 -
 ttnn/cpp/ttnn/graph/graph_processor.cpp           |   6 +-
 ttnn/cpp/ttnn/graph/graph_processor.hpp           |   6 +-
 .../ccl/ccl_host_datastructures.cpp               |   2 +-
 .../pad/device/pad_program_factory.cpp            |   6 +-
 ttnn/cpp/ttnn/run_operation_inl.hpp               |   4 +-
 ttnn/cpp/ttnn/tensor/tensor.cpp                   |   8 +-
 ttnn/cpp/ttnn/tensor/tensor_impl.cpp              |  11 +-
 ttnn/cpp/ttnn/tensor/tensor_ops.cpp               |   3 +-
 ttnn/cpp/ttnn/tensor/types.hpp                    |   4 +-
 ttnn/cpp/ttnn/types.hpp                           |  72 +---
 50 files changed, 593 insertions(+), 779 deletions(-)

diff --git
a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index 4803efbf2f1..488d4178edd 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp @@ -25,6 +25,18 @@ using namespace tt; using namespace tt_metal; using namespace constants; +namespace { +uint32_t get_device_buffer_address(const Tensor& tensor) { + TT_FATAL(std::holds_alternative(tensor.get_storage()), "Tensor storage is not DeviceStorage"); + auto buffer = std::get(tensor.get_storage()).buffer; + uint32_t result = 0; + buffer->device()->push_work([&]() { + result = buffer->address(); + }, true); + return result; +} +} + TEST_F(CommonFixture, TestTensorOwnershipSanity) { // Sanity test tensor read, write and update paths with synchronous // Ensure that tensor data is copied and owned as expected @@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) { Tensor output_tensor_host = output_tensor_device_2.cpu(); // Test tensor deallocation in async mode: deallocate tensors after using them if (i == 0) { - input_a_addr = std::get(input_tensor_a.get_storage()).buffer->address(); - input_b_addr = std::get(input_tensor_b.get_storage()).buffer->address(); - input_c_addr = std::get(input_tensor_c.get_storage()).buffer->address(); - output_1_addr = std::get(output_tensor_device.get_storage()).buffer->address(); - output_2_addr = std::get(output_tensor_device_2.get_storage()).buffer->address(); + input_a_addr = get_device_buffer_address(input_tensor_a); + input_b_addr = get_device_buffer_address(input_tensor_b); + input_c_addr = get_device_buffer_address(input_tensor_c); + output_1_addr = get_device_buffer_address(output_tensor_device); + output_2_addr = get_device_buffer_address(output_tensor_device_2); } else { - EXPECT_EQ(std::get(input_tensor_a.get_storage()).buffer->address(), input_a_addr); - EXPECT_EQ(std::get(input_tensor_b.get_storage()).buffer->address(), input_b_addr); - EXPECT_EQ(std::get(input_tensor_c.get_storage()).buffer->address(), input_c_addr); - EXPECT_EQ(std::get(output_tensor_device.get_storage()).buffer->address(), output_1_addr); - EXPECT_EQ(std::get(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr); + EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr); + EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr); + EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr); } input_tensor_a.deallocate(); input_tensor_b.deallocate(); @@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); Tensor tensor2 = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address(); + uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2); // Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2 // deallocated tensor2 = tensor1; @@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { // prev addr for tensor2 Tensor tensor3 = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - 
EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr); - EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address()); + EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr); + EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2)); } log_info(LogTest, "Testing Device tensor self-assignment through function"); for (int i = 0; i < 5; i++) { Tensor device_tensor = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16).to(device); - uint32_t device_tensor_address = device_tensor.device_buffer()->address(); + uint32_t device_tensor_address = get_device_buffer_address(device_tensor); // This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor // Ensure ref count and address remain unchanged device_tensor = tensor_identity_copy_function(device_tensor); EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1); - EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address); + EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address); } log_info(LogTest, "Testing Device tensor move assignment"); @@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { log_info(LogTest, "Testing Device tensor self-assignment"); Tensor tensor_to_self_assign = ttnn::numpy::full(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(0), DataType::BFLOAT16).to(device); - uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address(); + uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign); tensor_to_self_assign = tensor_to_self_assign; EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1); tensor_to_self_assign = std::move(tensor_to_self_assign); - EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address); + EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address); auto barrier_tensor = tensor_to_self_assign.cpu(); device->enable_async(false); } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp index a8f8f6773a6..3e58551a329 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp @@ -126,7 +126,7 @@ int main(int argc, char** argv) { // limit size of the L1 buffer to do not exceed global L1 size uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 
256 : num_tiles) * page_size; - auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1); + auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1); //////////////////////////////////////////////////////////////////////////// // Application Setup @@ -166,7 +166,7 @@ int main(int argc, char** argv) { for (int j = 0; j < num_cores_c; j++) { CoreCoord core = {(std::size_t)j, (std::size_t)i}; uint32_t core_index = i * num_cores_c + j; - uint32_t l1_buffer_addr = l1_buffer.address(); + uint32_t l1_buffer_addr = l1_buffer->address(); const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c}; SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp index 7f495e0729c..044d0888915 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp @@ -77,14 +77,14 @@ int main(int argc, char** argv) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); } - TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size); + TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size); // Device setup int device_id = 0; tt_metal::Device* device = tt_metal::CreateDevice(device_id); // Application setup - auto buffer = tt_metal::Buffer( + auto buffer = tt_metal::Buffer::create( device, transfer_size, page_size, buffer_type == 0 ? 
tt_metal::BufferType::DRAM : tt_metal::BufferType::L1); std::vector src_vec = create_random_vector_of_bfloat16( @@ -104,7 +104,7 @@ int main(int argc, char** argv) { // Execute application if (!skip_write) { auto t_begin = std::chrono::steady_clock::now(); - EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false); + EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false); Finish(device->command_queue()); auto t_end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(t_end - t_begin).count(); @@ -119,7 +119,7 @@ int main(int argc, char** argv) { if (!skip_read) { auto t_begin = std::chrono::steady_clock::now(); - EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true); + EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true); auto t_end = std::chrono::steady_clock::now(); auto elapsed_us = duration_cast(t_end - t_begin).count(); d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0)); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp index 56491f550e4..8efa155c9ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp @@ -192,7 +192,7 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// std::vector input_vec = create_random_vector_of_bfloat16( input_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - tt_metal::Buffer input_buffer( + auto input_buffer = Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// @@ -212,7 +212,7 @@ int main(int argc, char **argv) { num_tiles_per_core_group_1, num_tiles_per_core_group_2, kernel, - input_buffer.address(), + input_buffer->address(), num_reqs_at_a_time, single_tile_size, tile_format); @@ -221,7 +221,7 @@ int main(int argc, char **argv) { // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// if (access_type == 0) { - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); } else { for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) { CoreCoord core = {i / num_cores_y, i % num_cores_y}; @@ -276,7 +276,7 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, num_cores_y, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index 773380ebee9..829fc585e20 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -673,18 +673,18 @@ int main(int argc, char **argv) { input_size, 100, 1234); } - tt_metal::Buffer input_buffer( + auto input_buffer = tt_metal::Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// // Application 
Setup //////////////////////////////////////////////////////////////////////////// - auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address()); + auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address()); //////////////////////////////////////////////////////////////////////////// // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -713,7 +713,7 @@ int main(int argc, char **argv) { pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, all_cores_list, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 814b28abe02..4587fa9f969 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -869,18 +869,18 @@ int main(int argc, char **argv) { input_size, 100, 1234); } - tt_metal::Buffer input_buffer( + auto input_buffer = tt_metal::Buffer::create( device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM); //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address()); + auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address()); //////////////////////////////////////////////////////////////////////////// // Copy Input To DRAM or L1 //////////////////////////////////////////////////////////////////////////// - tt_metal::detail::WriteToBuffer(input_buffer, input_vec); + tt_metal::detail::WriteToBuffer(*input_buffer, input_vec); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -909,7 +909,7 @@ int main(int argc, char **argv) { pass = validation( device, - input_buffer, + *input_buffer, input_vec, num_cores, all_l1_writer_cores_ordered, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index f276c67030b..23107937bb9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -189,7 +189,7 @@ int main(int argc, char **argv) { activations_addr, activations_addr / 1024, Nt); - std::vector l1_buffers; + std::vector> l1_buffers; int l1_buffers_size = 1; if (!(single_read || one_buffer_share)) { @@ -199,8 +199,8 @@ int main(int argc, char **argv) { l1_buffers.reserve(l1_buffers_size); for (int r = 0; r < num_cores_r; ++r) { for (int c = 0; c < num_cores_c; ++c) { - l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1); - tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]); + l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1)); + tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]); if (single_read || one_buffer_share) break; @@ -213,7 +213,7 @@ int main(int argc, char **argv) { for (int r = 0; r < num_cores_r; ++r) { for (int c = 0; c < num_cores_c; ++c) { std::vector result_vec; - tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec); + tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec); auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec); if (print_tensor) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { CoreCoord core = {(size_t)c, (size_t)r}; int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c); - auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address(); + auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address(); uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0); diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp index 23f80204fa7..6c68aeeeae7 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp @@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) { uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device); - std::vector> buffers; + std::vector> buffers; int alloc_size_idx = 0; uint32_t total_buffer_size = 0; while (total_size_bytes < alloc_limit) { @@ -44,7 +44,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) { if (total_buffer_size + buffer_size >= alloc_limit) { break; } - std::unique_ptr buffer = std::make_unique(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1); + auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1); buffers.emplace_back(std::move(buffer)); total_buffer_size += buffer_size; EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp index 02ae4935d8f..2eb2e6975a4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp @@ -430,17 +430,17 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi auto activations_tile_layout = convert_to_tile_layout(activations_tilized); 
auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activation_buffer = std::make_shared(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); + auto activation_buffer = Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); pass &= move_tiles_to_dram(device, activations, M, K, activation_buffer); auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); auto weights_tile_layout = convert_to_tile_layout(identity_tilized); auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); - auto weight_buffer = std::make_shared(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); + auto weight_buffer = Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); pass &= move_tiles_to_dram(device, weights, K, N, weight_buffer); log_debug(LogTest, "Copying inputs to dram complete"); - auto out_buffer = std::make_shared(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM); + auto out_buffer = Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM); uint32_t out_dram_addr = out_buffer->address(); log_debug(LogTest, "Writing kernel runtime args to device"); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp index 3b1a12c88ba..2d3c5a4b6c7 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp @@ -12,15 +12,15 @@ struct TestBufferConfig { tt::tt_metal::BufferType buftype; }; -inline std::pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { +inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { // This function just enqueues a buffer (which should be large in the config) // write as a precursor to testing the wrap mechanism size_t buf_size = config.num_pages * config.page_size; - tt::tt_metal::Buffer buffer(device, buf_size, config.page_size, config.buftype); + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); std::vector src = create_random_vector_of_bfloat16( buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - EnqueueWriteBuffer(cq, buffer, src, false); + EnqueueWriteBuffer(cq, *buffer, src, false); return std::make_pair(std::move(buffer), src); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index 461f07c2825..ec8238158d1 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -481,14 +481,14 @@ bool test_EnqueueWrap_on_EnqueueWriteBuffer(Device* device, CommandQueue& cq, co This just ensures we don't hang on the subsequent EnqueueWriteBuffer */ size_t buf_size = config.num_pages * config.page_size; - Buffer buffer(device, buf_size, config.page_size, config.buftype); + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); vector src(buf_size / 
sizeof(uint32_t), 0); for (uint32_t i = 0; i < src.size(); i++) { src.at(i) = i; } - EnqueueWriteBuffer(cq, buffer, src, false); + EnqueueWriteBuffer(cq, *buffer, src, false); Finish(cq); return true; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp index 5b9428c4ef5..65b6b20d57f 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp @@ -98,13 +98,13 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { Setup(2048); CommandQueue& command_queue = this->device_->command_queue(); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - vector input_data(input.size() / sizeof(uint32_t), 0); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); - auto simple_program = std::make_shared(create_simple_unary_program(input, output)); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto simple_program = std::make_shared(create_simple_unary_program(*input, *output)); EnqueueProgram(command_queue, *simple_program, true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, *simple_program, kNonBlocking); @@ -128,13 +128,13 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { Setup(2048); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -144,24 +144,24 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { vector trace_output_data; trace_output_data.resize(input_data.size()); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, simple_program, true); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, simple_program, false); EndTraceCapture(this->device_, command_queue.id(), tid); // Create and Enqueue a Program with a live trace to ensure that a warning is generated - Buffer input_temp(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output_temp(this->device_, 2048, 2048, BufferType::DRAM); - Program simple_program_temp = create_simple_unary_program(input_temp, output_temp); + auto input_temp = 
Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output_temp = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + Program simple_program_temp = create_simple_unary_program(*input_temp, *output_temp); EnqueueProgram(command_queue, simple_program_temp, true); // Run trace that can clobber the temporary buffers created above EnqueueProgram(command_queue, simple_program, false); EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done @@ -171,12 +171,12 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { Setup(2048); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(); - vector input_data(input.size() / sizeof(uint32_t), 0); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -190,17 +190,17 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { std::shared_ptr simple_program; // EAGER MODE EXECUTION if (has_eager) { - simple_program = std::make_shared(create_simple_unary_program(input, output)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + simple_program = std::make_shared(create_simple_unary_program(*input, *output)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, *simple_program, true); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); } // THIS->DEVICE_ CAPTURE AND REPLAY MODE bool has_trace = false; uint32_t tid = 0; for (int i = 0; i < 1; i++) { - EnqueueWriteBuffer(command_queue, input, input_data.data(), true); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); if (!has_trace) { // Program must be cached first @@ -211,7 +211,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { } ReplayTrace(this->device_, command_queue.id(), tid, true); - EnqueueReadBuffer(command_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); if (has_eager) EXPECT_TRUE(eager_output_data == trace_output_data); } @@ -225,13 +225,13 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { // Get command queue from device for this test, since its running in async mode CommandQueue& command_queue = this->device_->command_queue(); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer interm(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto interm = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - Program op0 = create_simple_unary_program(input, interm); - Program op1 = create_simple_unary_program(interm, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program op0 = 
create_simple_unary_program(*input, *interm); + Program op1 = create_simple_unary_program(*interm, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -252,20 +252,20 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { eager_output_data.resize(input_data.size()); // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, input, input_data.data(), kBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); EnqueueProgram(command_queue, op0, kBlocking); EnqueueProgram(command_queue, op1, kBlocking); - EnqueueReadBuffer(command_queue, output, expected_output_data.data(), kBlocking); + EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); Finish(command_queue); for (bool blocking : blocking_flags) { std::string mode = blocking ? "Eager-B" : "Eager-NB"; for (auto i = 0; i < num_loops; i++) { ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), blocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); EnqueueProgram(command_queue, op0, blocking); EnqueueProgram(command_queue, op1, blocking); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); } if (not blocking) { // (Optional) wait for the last non-blocking command to finish @@ -283,9 +283,9 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { // Trace mode execution for (auto i = 0; i < num_loops; i++) { ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); EnqueueTrace(command_queue, tid, kNonBlocking); - EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); ReleaseTrace(this->device_, tid); @@ -300,8 +300,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { Setup(6144); CommandQueue& command_queue = this->device_->command_queue(); - std::shared_ptr input = std::make_shared(this->device_, 2048, 2048, BufferType::DRAM); - std::shared_ptr output = std::make_shared(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); uint32_t num_loops = parse_env("TT_METAL_TRACE_LOOPS", 4); uint32_t num_programs = parse_env("TT_METAL_TRACE_PROGRAMS", 4); @@ -314,7 +314,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { } for (int i = 0; i < num_programs; i++) { - interm_buffers.push_back(std::make_shared(this->device_, 2048, 2048, BufferType::DRAM)); + interm_buffers.push_back(Buffer::create(this->device_, 2048, 2048, BufferType::DRAM)); if (i == 0) { programs.push_back(create_simple_unary_program(*input, *(interm_buffers[i]))); } else if (i == (num_programs - 1)) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 73f0a7cd052..fc6345f5e9a 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -114,14 +114,14 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue continue; } size_t buf_size = config.num_pages * config.page_size; - Buffer bufa(device, buf_size, config.page_size, config.buftype); + auto bufa = Buffer::create(device, buf_size, config.page_size, config.buftype); - vector src = generate_arange_vector(bufa.size()); + vector src = generate_arange_vector(bufa->size()); if (cq_write) { - EnqueueWriteBuffer(cq, bufa, src.data(), false); + EnqueueWriteBuffer(cq, *bufa, src.data(), false); } else { - ::detail::WriteToBuffer(bufa, src); + ::detail::WriteToBuffer(*bufa, src); if (config.buftype == BufferType::DRAM) { tt::Cluster::instance().dram_barrier(device->id()); } else { @@ -137,9 +137,9 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue } if (cq_read) { - EnqueueReadBuffer(cq, bufa, result.data(), true); + EnqueueReadBuffer(cq, *bufa, result.data(), true); } else { - ::detail::ReadFromBuffer(bufa, result); + ::detail::ReadFromBuffer(*bufa, result); } EXPECT_EQ(src, result); @@ -154,7 +154,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( bool pass = true; uint32_t num_pages_left = config.num_pages_total; - std::vector> buffers; + std::vector> buffers; std::vector> srcs; std::vector> dsts; while (num_pages_left) { @@ -173,9 +173,9 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( buftype = BufferType::L1; } - std::unique_ptr buf; + std::shared_ptr buf; try { - buf = std::make_unique(device, buf_size, config.page_size, buftype); + buf = Buffer::create(device, buf_size, config.page_size, buftype); } catch (...) 
{ Finish(cq); size_t i = 0; @@ -185,7 +185,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer( srcs.clear(); dsts.clear(); buffers.clear(); - buf = std::make_unique(device, buf_size, config.page_size, buftype); + buf = Buffer::create(device, buf_size, config.page_size, buftype); } EnqueueWriteBuffer(cq, *buf, src, false); vector dst; @@ -238,12 +238,12 @@ void stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( src.at(i) = i; } - Buffer buf(device, buf_size, config.page_size(), buftype, config.mem_config, shard_spec); + auto buf = Buffer::create(device, buf_size, config.page_size(), buftype, config.mem_config, shard_spec); vector src2 = src; if (cq_write) { - EnqueueWriteBuffer(cq, buf, src2.data(), false); + EnqueueWriteBuffer(cq, *buf, src2.data(), false); } else { - ::detail::WriteToBuffer(buf, src); + ::detail::WriteToBuffer(*buf, src); if (buftype == BufferType::DRAM) { tt::Cluster::instance().dram_barrier(device->id()); } else { @@ -258,9 +258,9 @@ void stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded( vector res; res.resize(buf_size / sizeof(uint32_t)); if (cq_read) { - EnqueueReadBuffer(cq, buf, res.data(), true); + EnqueueReadBuffer(cq, *buf, res.data(), true); } else { - ::detail::ReadFromBuffer(buf, res); + ::detail::ReadFromBuffer(*buf, res); } EXPECT_EQ(src, res); } @@ -477,25 +477,25 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { // leave only small_page_size * 2 B of space in the completion queue uint32_t num_pages_second_buffer = (space_after_first_buffer / small_page_size) - 2; - Buffer buff_1(device, first_buffer_size, large_page_size, BufferType::DRAM); - auto src_1 = local_test_functions::generate_arange_vector(buff_1.size()); - EnqueueWriteBuffer(device->command_queue(), buff_1, src_1, false); + auto buff_1 = Buffer::create(device, first_buffer_size, large_page_size, BufferType::DRAM); + auto src_1 = local_test_functions::generate_arange_vector(buff_1->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_1, src_1, false); vector result_1; - EnqueueReadBuffer(device->command_queue(), buff_1, result_1, true); + EnqueueReadBuffer(device->command_queue(), *buff_1, result_1, true); EXPECT_EQ(src_1, result_1); - Buffer buff_2(device, num_pages_second_buffer * small_page_size, small_page_size, BufferType::DRAM); - auto src_2 = local_test_functions::generate_arange_vector(buff_2.size()); - EnqueueWriteBuffer(device->command_queue(), buff_2, src_2, false); + auto buff_2 = Buffer::create(device, num_pages_second_buffer * small_page_size, small_page_size, BufferType::DRAM); + auto src_2 = local_test_functions::generate_arange_vector(buff_2->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_2, src_2, false); vector result_2; - EnqueueReadBuffer(device->command_queue(), buff_2, result_2, true); + EnqueueReadBuffer(device->command_queue(), *buff_2, result_2, true); EXPECT_EQ(src_2, result_2); - Buffer buff_3(device, 32 * large_page_size, large_page_size, BufferType::DRAM); - auto src_3 = local_test_functions::generate_arange_vector(buff_3.size()); - EnqueueWriteBuffer(device->command_queue(), buff_3, src_3, false); + auto buff_3 = Buffer::create(device, 32 * large_page_size, large_page_size, BufferType::DRAM); + auto src_3 = local_test_functions::generate_arange_vector(buff_3->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_3, src_3, false); vector result_3; - EnqueueReadBuffer(device->command_queue(), buff_3, result_3, true); + EnqueueReadBuffer(device->command_queue(), *buff_3, 
result_3, true); EXPECT_EQ(src_3, result_3); } } @@ -510,25 +510,25 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { uint32_t num_pages_buff_1 = 9; uint32_t page_size_buff_1 = 2048; - Buffer buff_1(device, num_pages_buff_1 * page_size_buff_1, page_size_buff_1, BufferType::DRAM); - uint32_t space_after_buff_1 = command_completion_region_size - buff_1.size(); + auto buff_1 = Buffer::create(device, num_pages_buff_1 * page_size_buff_1, page_size_buff_1, BufferType::DRAM); + uint32_t space_after_buff_1 = command_completion_region_size - buff_1->size(); uint32_t page_size = 8192; uint32_t desired_remaining_space_before_wrap = 6144; uint32_t avail_space_for_wrapping_buffer = space_after_buff_1 - desired_remaining_space_before_wrap; uint32_t num_pages_for_wrapping_buffer = (avail_space_for_wrapping_buffer / page_size) + 4; - auto src_1 = local_test_functions::generate_arange_vector(buff_1.size()); - EnqueueWriteBuffer(device->command_queue(), buff_1, src_1, false); + auto src_1 = local_test_functions::generate_arange_vector(buff_1->size()); + EnqueueWriteBuffer(device->command_queue(), *buff_1, src_1, false); vector result_1; - EnqueueReadBuffer(device->command_queue(), buff_1, result_1, true); + EnqueueReadBuffer(device->command_queue(), *buff_1, result_1, true); EXPECT_EQ(src_1, result_1); - Buffer wrap_buff(device, num_pages_for_wrapping_buffer * page_size, page_size, BufferType::DRAM); - auto src_2 = local_test_functions::generate_arange_vector(wrap_buff.size()); - EnqueueWriteBuffer(device->command_queue(), wrap_buff, src_2, false); + auto wrap_buff = Buffer::create(device, num_pages_for_wrapping_buffer * page_size, page_size, BufferType::DRAM); + auto src_2 = local_test_functions::generate_arange_vector(wrap_buff->size()); + EnqueueWriteBuffer(device->command_queue(), *wrap_buff, src_2, false); vector result_2; - EnqueueReadBuffer(device->command_queue(), wrap_buff, result_2, true); + EnqueueReadBuffer(device->command_queue(), *wrap_buff, result_2, true); EXPECT_EQ(src_2, result_2); } } @@ -585,19 +585,19 @@ TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { - Buffer bufa(device, 125000, 100, buff_type); - auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(device->command_queue(), bufa, src_a, false); + auto bufa = Buffer::create(device, 125000, 100, buff_type); + auto src_a = local_test_functions::generate_arange_vector(bufa->size()); + EnqueueWriteBuffer(device->command_queue(), *bufa, src_a, false); - Buffer bufb(device, 152000, 152, buff_type); - auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(device->command_queue(), bufb, src_b, false); + auto bufb = Buffer::create(device, 152000, 152, buff_type); + auto src_b = local_test_functions::generate_arange_vector(bufb->size()); + EnqueueWriteBuffer(device->command_queue(), *bufb, src_b, false); vector result_a; - EnqueueReadBuffer(device->command_queue(), bufa, result_a, true); + EnqueueReadBuffer(device->command_queue(), *bufa, result_a, true); vector result_b; - EnqueueReadBuffer(device->command_queue(), bufb, result_b, true); + EnqueueReadBuffer(device->command_queue(), *bufb, result_b, true); EXPECT_EQ(src_a, result_a); EXPECT_EQ(src_b, result_b); @@ -621,19 +621,19 @@ TEST_F(CommandQueueSingleCardFixture, TestNonblockingReads) { constexpr BufferType buff_type = BufferType::L1; for (auto device : devices_) { - 
Buffer bufa(device, 2048, 2048, buff_type); - auto src_a = local_test_functions::generate_arange_vector(bufa.size()); - EnqueueWriteBuffer(device->command_queue(), bufa, src_a, false); + auto bufa = Buffer::create(device, 2048, 2048, buff_type); + auto src_a = local_test_functions::generate_arange_vector(bufa->size()); + EnqueueWriteBuffer(device->command_queue(), *bufa, src_a, false); - Buffer bufb(device, 2048, 2048, buff_type); - auto src_b = local_test_functions::generate_arange_vector(bufb.size()); - EnqueueWriteBuffer(device->command_queue(), bufb, src_b, false); + auto bufb = Buffer::create(device, 2048, 2048, buff_type); + auto src_b = local_test_functions::generate_arange_vector(bufb->size()); + EnqueueWriteBuffer(device->command_queue(), *bufb, src_b, false); vector result_a; - EnqueueReadBuffer(device->command_queue(), bufa, result_a, false); + EnqueueReadBuffer(device->command_queue(), *bufa, result_a, false); vector result_b; - EnqueueReadBuffer(device->command_queue(), bufb, result_b, false); + EnqueueReadBuffer(device->command_queue(), *bufb, result_b, false); Finish(device->command_queue()); EXPECT_EQ(src_a, result_a); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp index dd4b88ae85f..03e6eb1004c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp @@ -225,8 +225,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { auto& command_queue = this->device_->command_queue(); auto current_mode = CommandQueue::default_mode(); command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; /* Dummy program that helps keep track of buffers */ - std::vector buffer_objects; + Program program; for (int j = 0; j < 10; j++) { // Asynchronously initialize a buffer on device uint32_t first_buf_value = j + 1; @@ -234,26 +233,14 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { uint32_t first_buf_size = 4096; uint32_t second_buf_size = 2048; // Asynchronously allocate buffer on device - std::shared_ptr buffer = std::make_shared(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); - // Copy the host side buffer structure to an object (i.e. reallocate on device). After reallocation the addresses should not match - buffer_objects.push_back(*buffer); - // Confirm that the addresses do not match after copy/reallocation. 
Send async calls to get addrs, which will return post dealloc addr + std::shared_ptr buffer = Buffer::create(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); std::shared_ptr allocated_buffer_address = std::make_shared(); EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); - std::shared_ptr allocated_buffer_address_2 = std::make_shared(); - EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address_2.get(), &(buffer_objects.back()), true); - EXPECT_NE(*allocated_buffer_address_2, *allocated_buffer_address); - // Ensure returned addrs are correct + // Ensure returned addr is correct EXPECT_EQ((*allocated_buffer_address), buffer->address()); - EXPECT_EQ((*allocated_buffer_address_2), buffer_objects.back().address()); - // Deallocate the second device side buffer - detail::DeallocateBuffer(&(buffer_objects.back())); - // Make the buffer_object address and the buffer address identical with a blocking call. buffer_object and buffer are now the same device side buffer - buffer_objects.back().set_address(*allocated_buffer_address); std::shared_ptr> vec = std::make_shared>(first_buf_size / 4, first_buf_value); - std::vector readback_vec_1 = {}; - std::vector readback_vec_2 = {}; + std::vector readback_vec = {}; // Write first vector to existing on device buffer. EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access to this data) @@ -261,19 +248,15 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { // Simulate what tt-eager does: Share buffer ownership with program AssignGlobalBufferToProgram(buffer, program); // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be deallocated) - buffer = std::make_shared(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); + buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); // Write second vector to second buffer EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); // Have main thread give up ownership immediately after writing vec.reset(); // Read both buffer and ensure data is correct - EnqueueReadBuffer(this->device_->command_queue(), buffer_objects.back(), readback_vec_1, false); - EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec_2, true); - for (int i = 0; i < readback_vec_1.size(); i++) { - EXPECT_EQ(readback_vec_1[i], first_buf_value); - } - for (int i = 0; i < readback_vec_2.size(); i++) { - EXPECT_EQ(readback_vec_2[i], second_buf_value); + EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); + for (int i = 0; i < readback_vec.size(); i++) { + EXPECT_EQ(readback_vec[i], second_buf_value); } } command_queue.set_mode(current_mode); @@ -335,7 +318,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAsyncAssertForDeprecatedAPI) { "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto src0 = std::make_shared(this->device_, buf_size, page_size, BufferType::DRAM); + auto src0 = Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); std::vector runtime_args = {src0->address()}; try { SetRuntimeArgs(program, dummy_kernel, core, runtime_args); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp index a3bd30b55fb..5a772063742 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp @@ -39,7 +39,7 @@ TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrde vector> buffers; for (size_t i = 0; i < num_buffers; i++) { - buffers.push_back(std::make_shared(this->device_, page_size, page_size, BufferType::DRAM)); + buffers.push_back(Buffer::create(this->device_, page_size, page_size, BufferType::DRAM)); if (data_movement_mode == DataMovementMode::WRITE) { EnqueueWriteBuffer(this->device_->command_queue(), buffers.back(), page, true); @@ -281,7 +281,7 @@ TEST_F(CommandQueueFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); EXPECT_EQ(event->event_id, events_issued_per_cq + 1); // Event ids start at 1 - std::shared_ptr buf = std::make_shared(this->device_, page_size, page_size, BufferType::DRAM); + std::shared_ptr buf = Buffer::create(this->device_, page_size, page_size, BufferType::DRAM); EnqueueWriteBuffer(this->device_->command_queue(), buf, page, true); EnqueueWaitForEvent(this->device_->command_queue(), event); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 24e3f9b9b79..cef5a8d0c18 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -77,14 +77,14 @@ namespace basic_tests { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { Setup(2048, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(0); CommandQueue& data_movement_queue = this->device_->command_queue(1); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -93,7 +93,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { vector eager_output_data; eager_output_data.resize(input_data.size()); - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); EnqueueProgram(command_queue, simple_program, true); EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); @@ -101,14 +101,14 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { vector trace_output_data; trace_output_data.resize(input_data.size()); - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); EnqueueProgram(command_queue, simple_program, false); 
EndTraceCapture(this->device_, command_queue.id(), tid); EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(data_movement_queue, output, trace_output_data.data(), true); + EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); EXPECT_TRUE(eager_output_data == trace_output_data); // Done @@ -118,14 +118,14 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { Setup(4096, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); CommandQueue& command_queue = this->device_->command_queue(0); CommandQueue& data_movement_queue = this->device_->command_queue(1); - Program simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -146,7 +146,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { uint32_t trace_id = 0; bool trace_captured = false; for (auto i = 0; i < num_loops; i++) { - EnqueueWriteBuffer(data_movement_queue, input, input_data.data(), true); + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); if (not trace_captured) { trace_id = BeginTraceCapture(this->device_, command_queue.id()); @@ -156,7 +156,7 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { } EnqueueTrace(command_queue, trace_id, false); - EnqueueReadBuffer(data_movement_queue, output, trace_outputs[i].data(), true); + EnqueueReadBuffer(data_movement_queue, *output, trace_outputs[i].data(), true); // Expect same output across all loops EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); @@ -169,8 +169,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { Setup(6144, 2); - Buffer input(this->device_, 2048, 2048, BufferType::DRAM); - Buffer output(this->device_, 2048, 2048, BufferType::DRAM); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); constexpr bool kBlocking = true; constexpr bool kNonBlocking = false; @@ -180,8 +180,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { // Keep this queue in passthrough mode for now CommandQueue& command_queue = this->device_->command_queue(0); - auto simple_program = create_simple_unary_program(input, output); - vector input_data(input.size() / sizeof(uint32_t), 0); + auto simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; } @@ -202,18 +202,18 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { eager_output_data.resize(input_data.size()); // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, input, input_data.data(), kBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); EnqueueProgram(command_queue, simple_program, kBlocking); - EnqueueReadBuffer(command_queue, output, expected_output_data.data(), kBlocking); + 
EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); Finish(command_queue); for (bool blocking : blocking_flags) { std::string mode = blocking ? "Eager-B" : "Eager-NB"; for (auto i = 0; i < num_loops; i++) { tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), blocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); EnqueueProgram(command_queue, simple_program, blocking); - EnqueueReadBuffer(command_queue, output, eager_output_data.data(), blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); } if (not blocking) { // (Optional) wait for the last non-blocking command to finish @@ -230,9 +230,9 @@ TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { // Trace mode execution for (auto i = 0; i < num_loops; i++) { tt::ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, input, input_data.data(), kNonBlocking); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); EnqueueTrace(command_queue, tid, kNonBlocking); - EnqueueReadBuffer(command_queue, output, trace_outputs[i].data(), kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); } Finish(command_queue); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp index 73baaa14267..b6c3e82791c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp @@ -218,7 +218,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent vector> srcs; for (uint i = 0; i < cqs.size(); i++) { uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); log_debug(tt::LogTest, "buf_idx: {} Doing Write to cq_id: {} of data: {}", buf_idx, i, srcs[i]); EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); @@ -280,7 +280,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent auto event = std::make_shared(); vector result; - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); // Blocking Read after Non-Blocking Write on alternate CQs, events ensure ordering. @@ -339,7 +339,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent vector> read_results; vector> buffers; - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(this->device_, buf_size, config.page_size, config.buftype)); // Number of write-read combos per buffer. Fewer make RAW race without events easier to hit. 
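// Illustrative sketch, not part of this patch: the test changes above swap direct Buffer
// construction (std::make_shared / stack Buffer) for the Buffer::create factory, which returns
// a std::shared_ptr<Buffer>, so call sites dereference it when passing the buffer to
// EnqueueWriteBuffer/EnqueueReadBuffer. The function name and the Device*/CommandQueue&
// arguments below are assumptions for the example only.
inline void example_buffer_roundtrip(Device* device, CommandQueue& cq) {
    auto buffer = Buffer::create(device, /*size=*/2048, /*page_size=*/2048, BufferType::DRAM);
    std::vector<uint32_t> src(buffer->size() / sizeof(uint32_t), 0);
    for (uint32_t i = 0; i < src.size(); i++) {
        src[i] = i;
    }
    EnqueueWriteBuffer(cq, *buffer, src.data(), /*blocking=*/false);
    std::vector<uint32_t> dst(src.size());
    EnqueueReadBuffer(cq, *buffer, dst.data(), /*blocking=*/true);
}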
for (uint j = 0; j < num_wr_rd_per_buf; j++) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 94572201f06..f294316c6e4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -21,10 +21,10 @@ bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, v for (const bool use_void_star_api: {true, false}) { size_t buf_size = config.num_pages * config.page_size; - std::vector> buffers; + std::vector> buffers; std::vector> srcs; for (uint i = 0; i < cqs.size(); i++) { - buffers.push_back(std::make_unique(device, buf_size, config.page_size, config.buftype)); + buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); srcs.push_back(generate_arange_vector(buffers[i]->size())); if (use_void_star_api) { EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp index 654e9bd5d54..0e43c63707b 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp @@ -32,12 +32,11 @@ void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape inpu host_data[i] = 1; } - auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, input_shape, dtype, Layout::TILE); - tt::log_debug("input_data: \n {}", input_tensor.write_to_string()); ttnn::write_buffer(io_cq, input_tensor, {host_data}); @@ -65,6 +64,10 @@ INSTANTIATE_TEST_SUITE_P( CreateTensorTestWithShape, CreateTensorTest, ::testing::Values( - CreateTensorParams{.shape=ttnn::SimpleShape({1, 1, 32, 32})} + CreateTensorParams{.shape=ttnn::SimpleShape({1, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({2, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0, 0, 0, 0})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0, 1, 32, 32})}, + CreateTensorParams{.shape=ttnn::SimpleShape({0})} ) ); diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp index f3dc59b25a7..4d50192f234 100644 --- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp +++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp @@ -54,8 +54,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { auto workload_event = std::make_shared(); // Running sum-reduce with preallocated output // Preallocate Input and Output Tensors on Device - auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape.padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); - auto output_buffer = ttnn::allocate_buffer_on_device(output_buf_size_datums 
* datum_size_bytes, device, np_out.get_padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape.padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(output_buf_size_datums * datum_size_bytes, device, np_out.get_padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; auto output_storage = tt::tt_metal::DeviceStorage{output_buffer}; Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE); @@ -118,7 +118,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { auto write_event = std::make_shared(); auto workload_event = std::make_shared(); - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1 @@ -128,10 +128,10 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { // Run operation on cq 0 Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor); - auto dummy_buffer_0 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); output_tensor = ttnn::neg(workload_dispatch_cq, output_tensor); // Allocate this buffer to stress test async allocation across op execution and explicit allocation - auto dummy_buffer_1 = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); // Record cq 0 prog execution ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event); // Wait until cq 0 prog execution is done @@ -164,7 +164,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) { // deallocate the buffer) in a loop for (int loop = 0; loop < 100000; loop++) { { - auto input_buffer_dummy = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); device->synchronize(); } } diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index b101c6a6065..fc97f0aad13 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -155,7 +155,7 @@ TEST(GalaxyTests, TestAllGatherDeadlock) { log_info(LogTest, "Running iteration {}", i); } for (auto& dev : devs) { - 
auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. @@ -250,7 +250,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { log_info(LogTest, "Running iteration {}", i); } for (auto& dev : ring_devices) { - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, dev, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp index 75d6f1afd7b..a1ce8693233 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp @@ -59,7 +59,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); @@ -110,7 +110,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); @@ -161,7 +161,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) { for (int j = 0; j < buf_size_datums; j++) { host_data[j] = bfloat16(static_cast(i + dev_idx)); } - auto input_buffer = ttnn::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(buf_size_datums * datum_size_bytes, device, shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); diff --git a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp index a7ace491790..98fa377529c 100644 --- 
a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp @@ -52,7 +52,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) { t0_host_data[i] = bfloat16(static_cast(2 + j)); } // Allocate and write buffer - auto t0_input_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto t0_input_storage = tt::tt_metal::DeviceStorage{t0_input_buffer}; Tensor t0_input_tensor = Tensor(t0_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); ttnn::write_buffer(t0_io_cq, t0_input_tensor, {t0_host_data}); @@ -71,7 +71,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) { for (int i = 0; i < tensor_buf_size; i++) { t1_host_data[i] = bfloat16(static_cast(4 + j)); } - auto t1_input_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto t1_input_storage = tt::tt_metal::DeviceStorage{t1_input_buffer}; Tensor t1_input_tensor = Tensor(t1_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); @@ -118,7 +118,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiAppThreadSync) { ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024}; auto host_data = std::shared_ptr(new bfloat16[tensor_buf_size]); - auto allocated_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); + auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg); auto allocated_storage = tt::tt_metal::DeviceStorage{allocated_buffer}; auto allocated_tensor = Tensor(allocated_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE); auto readback_data = std::shared_ptr(new bfloat16[tensor_buf_size]); diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index d362bee71a2..d6168102a5e 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -276,7 +276,7 @@ inline namespace v0 { void SetLazyCommandQueueMode(bool lazy); - void AllocateBuffer(Buffer* buffer, bool bottom_up); + DeviceAddr AllocateBuffer(const Buffer* buffer, bool bottom_up); void DeallocateBuffer(Buffer *buffer); } // namespace detail diff --git a/tt_metal/detail/util.hpp b/tt_metal/detail/util.hpp index 9938f01c82a..5418e0562df 100644 --- a/tt_metal/detail/util.hpp +++ b/tt_metal/detail/util.hpp @@ -26,9 +26,9 @@ namespace tt::tt_metal::detail{ } inline DeviceAddr SizeBytesPerBank(DeviceAddr size_bytes, DeviceAddr page_size_bytes, uint32_t num_banks, uint32_t alignment_bytes) { - TT_ASSERT(page_size_bytes > 0 and size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes); - DeviceAddr num_pages = size_bytes / page_size_bytes; - DeviceAddr num_equally_distributed_pages = num_pages == 1 ? 1 : 1 + ((num_pages - 1) / num_banks); + TT_ASSERT(page_size_bytes == 0 ? 
size_bytes == 0 : size_bytes % page_size_bytes == 0, "Page size {} should be divisible by buffer size {}", page_size_bytes, size_bytes); + DeviceAddr num_pages = page_size_bytes == 0 ? 0 : size_bytes / page_size_bytes; + DeviceAddr num_equally_distributed_pages = num_pages == 0 ? 0 : 1 + ((num_pages - 1) / num_banks); return num_equally_distributed_pages * round_up(page_size_bytes, alignment_bytes); } diff --git a/tt_metal/graph/graph_tracking.cpp b/tt_metal/graph/graph_tracking.cpp index 04a9ae4240a..17a72ddd5ee 100644 --- a/tt_metal/graph/graph_tracking.cpp +++ b/tt_metal/graph/graph_tracking.cpp @@ -27,7 +27,7 @@ bool GraphTracker::add_hook(const std::shared_ptr& new_hook) { return true; } -void GraphTracker::track_allocate(Buffer* buffer, bool bottom_up) { +void GraphTracker::track_allocate(const Buffer* buffer, bool bottom_up) { if (processors.empty()) { return; } @@ -73,7 +73,7 @@ void GraphTracker::track_program(Program* program) { } } -bool GraphTracker::hook_allocate(Buffer* buffer, bool bottom_up) { +bool GraphTracker::hook_allocate(const Buffer* buffer, bool bottom_up) { if (hook == nullptr) return false; diff --git a/tt_metal/graph/graph_tracking.hpp b/tt_metal/graph/graph_tracking.hpp index a39cf3d4a56..54ee8eef41d 100644 --- a/tt_metal/graph/graph_tracking.hpp +++ b/tt_metal/graph/graph_tracking.hpp @@ -28,7 +28,7 @@ inline namespace v0 { IGraphProcessor() = default; - virtual void track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) {}; + virtual void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {}; virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {}; @@ -54,7 +54,7 @@ inline namespace v0 { class IGraphHooks { public: IGraphHooks() = default; - virtual bool hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) = 0; + virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) = 0; virtual bool hook_deallocate(tt::tt_metal::Buffer* buffer) = 0; @@ -77,7 +77,7 @@ inline namespace v0 { bool add_hook(const std::shared_ptr& hook); - void track_allocate(Buffer* buffer, bool bottom_up); + void track_allocate(const Buffer* buffer, bool bottom_up); void track_deallocate(Buffer* buffer); @@ -118,7 +118,7 @@ inline namespace v0 { } } - bool hook_allocate(Buffer* buffer, bool bottom_up); + bool hook_allocate(const Buffer* buffer, bool bottom_up); bool hook_deallocate(Buffer* buffer); diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 5d204dde8c1..0403a82af98 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -28,7 +28,10 @@ void validate_buffer_size_and_page_size( const BufferType &buffer_type, const TensorMemoryLayout &buffer_layout, const std::optional& shard_parameters) { - TT_FATAL(size != 0 and page_size != 0, "Buffer size and page size should be larger than 0 bytes!"); + if (size == 0) { + return; + } + bool valid_page_size = (size % page_size == 0); TT_FATAL( valid_page_size, @@ -55,7 +58,7 @@ inline std::tuple>, std::vector &page_shape, const std::array &shard_shape, const std::array &tensor2d_size) { - std::array shard_in_pages = {shard_shape[0] / page_shape[0], shard_shape[1] / page_shape[1]}; + std::array shard_in_pages = {page_shape[0] == 0 ? 0 : shard_shape[0] / page_shape[0], page_shape[1] == 0 ? 
0 : shard_shape[1] / page_shape[1]}; std::vector> ret_vec(num_shards); std::vector> ret_shard_shape(num_shards, shard_in_pages); @@ -80,7 +83,7 @@ inline std::tuple>, std::vector>, std::vector& shard_parameters, - const std::optional bottom_up, - bool allocate) : - device_(device), - size_(size), - page_size_(page_size), - buffer_type_(buffer_type), - buffer_layout_(buffer_layout), - shard_parameters_(shard_parameters), - bottom_up_(bottom_up), - buffer_page_mapping_(nullptr), - allocate_(allocate) { - TT_FATAL(this->device_ != nullptr and this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); - validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters); - if (allocate) { - this->allocate(); +BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { + BufferPageMapping buffer_page_mapping; + + if (buffer.size() == 0) { + return buffer_page_mapping; } -} + auto shard_spec = buffer.shard_spec(); -BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { - BufferPageMapping buffer_page_mapping; - bool row_major = buffer.shard_spec().orientation() == ShardOrientation::ROW_MAJOR; + bool row_major = shard_spec.orientation() == ShardOrientation::ROW_MAJOR; uint32_t num_cores = buffer.num_cores(); - buffer_page_mapping.all_cores_ = corerange_to_cores(buffer.shard_spec().grid(), num_cores, row_major); - TT_ASSERT(num_cores == buffer_page_mapping.all_cores_.size()); + buffer_page_mapping.all_cores_ = corerange_to_cores(shard_spec.grid(), num_cores, row_major); + TT_FATAL(num_cores == buffer_page_mapping.all_cores_.size(), "Buffer has {} cores, but page mapping expects {} cores", num_cores, buffer_page_mapping.all_cores_.size()); uint32_t core_id = 0; for (const auto &core : buffer_page_mapping.all_cores_) { buffer_page_mapping.core_to_core_id_.insert({core, core_id}); @@ -154,12 +138,12 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { uint32_t num_dev_pages = buffer.num_dev_pages(); auto [core_host_page_indices, shard_shape] = core_to_host_pages( num_dev_pages, - buffer.shard_spec().size(), + shard_spec.size(), num_cores, buffer.buffer_layout(), - buffer.shard_spec().page_shape, - buffer.shard_spec().shape(), - buffer.shard_spec().tensor2d_shape); + shard_spec.page_shape, + shard_spec.shape(), + shard_spec.tensor2d_shape); buffer_page_mapping.core_host_page_indices_ = std::vector>(num_cores); @@ -172,10 +156,10 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { buffer_page_mapping.core_shard_shape_ = std::move(shard_shape); uint32_t dev_page_index = 0; - auto shape_in_pages = buffer.shard_spec().shape_in_pages(); + auto shape_in_pages = shard_spec.shape_in_pages(); for (uint32_t core_index = 0; core_index < core_host_page_indices.size(); core_index++) { uint32_t valid_shard_page = 0; - buffer_page_mapping.core_host_page_indices_[core_index].reserve(buffer.shard_spec().size()); + buffer_page_mapping.core_host_page_indices_[core_index].reserve(shard_spec.size()); uint32_t shard_page_id = 0; for (uint32_t shard_page_x = 0; shard_page_x < shape_in_pages[0]; shard_page_x++) { for (uint32_t shard_page_y = 0; shard_page_y < shape_in_pages[1]; shard_page_y++) { @@ -198,89 +182,162 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) { return buffer_page_mapping; } -Buffer::Buffer(const Buffer &other) : - device_(other.device_), - size_(other.size_), - page_size_(other.page_size_), - buffer_type_(other.buffer_type_), - 
buffer_layout_(other.buffer_layout_), - shard_parameters_(other.shard_parameters_), - bottom_up_(other.bottom_up_), - buffer_page_mapping_(other.buffer_page_mapping_), - allocate_(other.allocate_) { - if (this->allocate_) { - this->allocate(); +Buffer::Buffer( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + const BufferType buffer_type, + const TensorMemoryLayout buffer_layout, + const std::optional& shard_parameters, + const std::optional bottom_up) : + device_(device), + size_(size), + page_size_(page_size), + buffer_type_(buffer_type), + buffer_layout_(buffer_layout), + shard_parameters_(shard_parameters), + bottom_up_(bottom_up), + buffer_page_mapping_(nullptr) { + TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); + + if (size != 0) { + validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters); } } -Buffer &Buffer::operator=(const Buffer &other) { - if (this != &other) { - this->device_ = other.device_; - this->size_ = other.size_; - this->page_size_ = other.page_size_; - this->buffer_type_ = other.buffer_type_; - this->buffer_layout_ = other.buffer_layout_; - this->shard_parameters_ = other.shard_parameters_; - this->bottom_up_ = other.bottom_up_; - this->buffer_page_mapping_ = other.buffer_page_mapping_; - this->allocate_ = other.allocate_; - if (this->allocate_) { - this->allocate(); - } +std::shared_ptr Buffer::create( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + const BufferType buffer_type, + const TensorMemoryLayout buffer_layout, + const std::optional& shard_parameters, + const std::optional bottom_up) { + auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up); + auto buffer = std::shared_ptr(bufferPtr, deleter); + buffer->weak_self = buffer; + + if (buffer->size_ == 0) { + buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); + return buffer; } - return *this; -} - -Buffer::Buffer(Buffer &&other) : - device_(other.device_), - size_(other.size_), - address_(other.address_), - page_size_(other.page_size_), - buffer_type_(other.buffer_type_), - buffer_layout_(other.buffer_layout_), - shard_parameters_(std::move(other.shard_parameters_)), - bottom_up_(other.bottom_up_), - buffer_page_mapping_(std::move(other.buffer_page_mapping_)), - allocate_(other.allocate_) { - // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is - // transferred to `this` - other.device_ = nullptr; -} - -Buffer &Buffer::operator=(Buffer &&other) { - if (this != &other) { - this->device_ = other.device_; - this->size_ = other.size_; - this->address_ = other.address_; - this->page_size_ = other.page_size_; - this->buffer_type_ = other.buffer_type_; - this->buffer_layout_ = other.buffer_layout_; - this->shard_parameters_ = std::move(other.shard_parameters_); - this->bottom_up_ = other.bottom_up_; - this->buffer_page_mapping_ = std::move(other.buffer_page_mapping_); - this->allocate_ = other.allocate_; - // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is - // transferred to `this` - other.device_ = nullptr; + + buffer->device_->push_work([buffer] { + bool bottom_up = buffer->bottom_up_.value_or(buffer->is_dram()); + buffer->address_ = detail::AllocateBuffer(buffer.get(), bottom_up); + detail::BUFFER_MAP.insert({buffer->device_->id(), buffer->address_}, buffer.get()); + + 
std::unique_lock lock(buffer->allocation_mutex_); + buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); + lock.unlock(); + buffer->allocation_cv_.notify_all(); + }); + + return buffer; +} + +void Buffer::deallocate() { + deallocation_requested_.store(true, std::memory_order::relaxed); + device_->push_work([self = weak_self.lock()] { + self->deallocate_impl(); + }); +} + +void Buffer::deleter(Buffer* buffer) { + buffer->device_->push_work([buffer] { + std::unique_ptr unique_buffer = std::unique_ptr(buffer); + buffer->deallocate_impl(); + }); +} + +void Buffer::deallocate_impl() { + if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) { + return; + } + + if (device_->initialized_ && size_ != 0) { + // address_ is only modified from this thread, no sync required + detail::BUFFER_MAP.erase({device_->id(), address_}); + detail::DeallocateBuffer(this); + } + + allocation_status_.store(AllocationStatus::DEALLOCATED, std::memory_order::relaxed); +} + +bool Buffer::is_allocated() const { + auto allocation_status = allocation_status_.load(std::memory_order::relaxed); + + if (device_->can_use_passthrough_scheduling()) { + return allocation_status == AllocationStatus::ALLOCATED; + } + + // For calls from different threads we consider buffer to be allocated even if it's just ALLOCATION_REQUESTED, + // because once the caller will try to access it, the buffer will already be fully allocated. For the same reason we need to check deallocation_requested_ too. + bool deallocation_requested = deallocation_requested_.load(std::memory_order::relaxed); + return (allocation_status == AllocationStatus::ALLOCATION_REQUESTED || allocation_status == AllocationStatus::ALLOCATED) && !deallocation_requested; +} + +uint32_t Buffer::address() const { + if (device_->can_use_passthrough_scheduling()) { + return address_; + } + + std::unique_lock lock(allocation_mutex_); + allocation_cv_.wait(lock, [this] { return this->allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATION_REQUESTED; }); + return address_; +} + +DeviceAddr Buffer::page_size() const { + return page_size_; +} + +void Buffer::set_page_size(DeviceAddr page_size) { + TT_FATAL(page_size == 0 ? size_ == 0 : size_ % page_size == 0, "buffer size must be divisible by new page size"); + page_size_ = page_size; + this->buffer_page_mapping_ = nullptr; +} + +uint32_t Buffer::num_pages() const { + return page_size() == 0 ? 0 : size() / page_size(); +} + +uint32_t Buffer::num_dev_pages() const { + if (!is_sharded(this->buffer_layout_)) { + return this->num_pages(); + } + + return this->shard_spec().size() * this->num_cores(); +} + +CoreType Buffer::core_type() const { + switch (this->buffer_type_) { + case BufferType::DRAM: + return CoreType::DRAM; + case BufferType::L1: + case BufferType::L1_SMALL: + return CoreType::WORKER; + default: + TT_THROW("Unknown CoreType {} for buffer", this->buffer_type_); } - return *this; } -void Buffer::allocate() { - TT_ASSERT(this->device_ != nullptr); - // L1 and Trace buffers (which live in DRAM) are allocated top down! 
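// A minimal sketch (not taken verbatim from this diff) of the buffer lifecycle introduced here:
// Buffer::create queues the allocation on the device worker thread and returns a shared_ptr
// immediately; address() called from another thread waits until the allocation has completed;
// deallocate() may be called more than once and the final cleanup runs through Buffer::deleter
// on the worker thread. The function name and Device* argument are illustrative assumptions.
inline void example_buffer_lifetime(Device* device) {
    auto buffer = Buffer::create(device, /*size=*/4096, /*page_size=*/1024, BufferType::L1);
    uint32_t addr = buffer->address();  // blocks until the worker thread has run AllocateBuffer
    (void)addr;
    buffer->deallocate();               // safe to repeat; deallocate_impl is pushed to the worker
}   // dropping the last shared_ptr also deallocates, via Buffer::deleter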
- bool bottom_up = this->bottom_up_.value_or(this->is_dram()); - detail::AllocateBuffer(this, bottom_up); - detail::BUFFER_MAP.insert({this->device_->id(), this->address_}, this); +bool Buffer::is_l1() const { + return buffer_type() == BufferType::L1 or buffer_type() == BufferType::L1_SMALL; +} +bool Buffer::is_dram() const { + return buffer_type() == BufferType::DRAM || buffer_type() == BufferType::TRACE; +} +bool Buffer::is_trace() const { + return buffer_type() == BufferType::TRACE; } uint32_t Buffer::dram_channel_from_bank_id(uint32_t bank_id) const { - TT_ASSERT(this->is_dram(), "Expected DRAM buffer!"); + TT_FATAL(this->is_dram(), "Expected DRAM buffer!"); return this->device_->dram_channel_from_bank_id(bank_id); } CoreCoord Buffer::logical_core_from_bank_id(uint32_t bank_id) const { - TT_ASSERT(this->is_l1(), "Expected L1 buffer!"); + TT_FATAL(this->is_l1(), "Expected L1 buffer!"); return this->device_->logical_core_from_bank_id(bank_id); } @@ -307,69 +364,88 @@ CoreCoord Buffer::noc_coordinates() const { return this->noc_coordinates(0); } DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { auto num_banks = this->device_->num_banks(this->buffer_type_); - TT_ASSERT(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); + TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); int pages_offset_within_bank = (int)page_index / num_banks; - auto offset = (round_up(this->page_size_, this->alignment()) * pages_offset_within_bank); + auto offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); return translate_page_address(offset, bank_id); } -uint32_t Buffer::alignment() const { return this->device_->get_allocator_alignment(); } +uint32_t Buffer::alignment() const { + return this->device_->get_allocator_alignment(); +} +DeviceAddr Buffer::aligned_page_size() const { + return align(page_size(), this->alignment()); +} +DeviceAddr Buffer::aligned_size() const { + return this->num_dev_pages() * this->aligned_page_size(); +} DeviceAddr Buffer::sharded_page_address(uint32_t bank_id, uint32_t page_index) const { - TT_ASSERT(is_sharded(this->buffer_layout())); - int pages_offset_within_bank = page_index % shard_spec().size(); - auto offset = (round_up(this->page_size_, this->alignment()) * pages_offset_within_bank); + TT_FATAL(is_sharded(this->buffer_layout()), "Buffer not sharded"); + auto shard_spec = this->shard_spec(); + uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + auto offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); return translate_page_address(offset, bank_id); } +ShardSpecBuffer Buffer::shard_spec() const { + TT_FATAL(is_sharded(this->buffer_layout_), "Buffer not sharded"); + TT_FATAL(shard_parameters_.has_value(), "Buffer is sharded, but no shard parameters specified"); + return this->shard_parameters_.value(); +} + +void Buffer::set_shard_spec(const ShardSpecBuffer& shard_spec) { + this->shard_parameters_ = shard_spec; + this->buffer_page_mapping_ = nullptr; +} + +uint32_t Buffer::num_cores() const { + if (!is_sharded(this->buffer_layout_)) + return 1; + + return this->shard_spec().tensor_shard_spec.grid.num_cores(); +} + DeviceAddr Buffer::translate_page_address(uint64_t offset, uint32_t bank_id) const { - DeviceAddr base_page_address = this->address_ + this->device_->bank_offset(this->buffer_type_, bank_id); + DeviceAddr base_page_address = this->address() + 
this->device_->bank_offset(this->buffer_type_, bank_id); return base_page_address + offset; } const std::shared_ptr& Buffer::get_buffer_page_mapping() { - TT_ASSERT(is_sharded(this->buffer_layout_), "Buffer not sharded"); + TT_FATAL(is_sharded(this->buffer_layout_), "Buffer not sharded"); if (!this->buffer_page_mapping_) { this->buffer_page_mapping_ = std::make_shared(generate_buffer_page_mapping(*this)); } return this->buffer_page_mapping_; } -void Buffer::deallocate() { +bool ShardSpec::operator==(const ShardSpec&) const = default; +bool ShardSpec::operator!=(const ShardSpec&) const = default; - if (this->device_ == nullptr or not this->device_->initialized_ or this->size_ == 0 or not this->allocate_) { - return; - } - // Mark as deallocated - this->size_ = 0; - TT_ASSERT(this->device_->allocator_ != nullptr, "Expected allocator to be initialized!"); - detail::BUFFER_MAP.erase({this->device_->id(), this->address_}); - detail::DeallocateBuffer(this); +std::array ShardSpecBuffer::shape_in_pages() const { + auto width_in_pages = page_shape[0] == 0 ? 0 : tensor_shard_spec.shape[0] / page_shape[0]; + auto height_in_pages = page_shape[1] == 0 ? 0 : tensor_shard_spec.shape[1] / page_shape[1]; + return {width_in_pages, height_in_pages}; } -Buffer::~Buffer() { this->deallocate(); } - -bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b) { - if (spec_a.grid != spec_b.grid) { - return false; - } - if (spec_a.shape != spec_b.shape) { - return false; - } - if (spec_a.orientation != spec_b.orientation) { - return false; - } - if (spec_a.halo != spec_b.halo) { - return false; - } - return true; +DeviceAddr ShardSpecBuffer::size() const { + auto shape_in_pages_ = this->shape_in_pages(); + return shape_in_pages_[0] * shape_in_pages_[1]; } -bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b) { return not(spec_a == spec_b); } - namespace detail { buffer_map_t BUFFER_MAP = {}; } } // namespace tt_metal } // namespace tt + +namespace tt::stl::json { +tt_metal::ShardSpec from_json_t::operator()(const nlohmann::json &json_object) const { + return tt_metal::ShardSpec{ + from_json(json_object.at("grid")), + from_json>(json_object.at("shape")), + from_json(json_object.at("orientation")), + from_json(json_object.at("halo"))}; +} +} diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index 91dd0fd846a..8c4332de0cb 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "common/bfloat16.hpp" @@ -46,21 +47,20 @@ struct ShardSpec { const ShardOrientation &shard_orientation_ = ShardOrientation::ROW_MAJOR, const bool &halo_ = false) : grid(core_sets_), shape(shard_shape_), orientation(shard_orientation_), halo(halo_) { - ; } const uint32_t num_cores() const { return this->grid.num_cores(); } const uint32_t numel() const { return this->shape[0] * this->shape[1]; } + bool operator==(const ShardSpec& other) const; + bool operator!=(const ShardSpec& other) const; + static constexpr auto attribute_names = std::forward_as_tuple("grid", "shape", "orientation", "halo"); constexpr auto attribute_values() const { return std::forward_as_tuple(this->grid, this->shape, this->orientation, this->halo); } }; -bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b); -bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b); - struct ShardSpecBuffer { ShardSpec tensor_shard_spec; std::array page_shape; @@ -91,15 +91,8 @@ struct ShardSpecBuffer { void set_shard_spec(const 
ShardSpec& shard_spec) { tensor_shard_spec = shard_spec; }; /* Shape in pages of the full tensor, not per core */ - std::array shape_in_pages() const { - auto width_in_pages = tensor_shard_spec.shape[0] / page_shape[0]; - auto height_in_pages = tensor_shard_spec.shape[1] / page_shape[1]; - return {width_in_pages, height_in_pages}; - } - DeviceAddr size() const { - auto shape_in_pages_ = this->shape_in_pages(); - return shape_in_pages_[0] * shape_in_pages_[1]; - } + std::array shape_in_pages() const; + DeviceAddr size() const; }; inline namespace v0 { @@ -110,7 +103,6 @@ struct BufferConfig { DeviceAddr page_size; // Size of unit being interleaved. For non-interleaved buffers: size == page_size BufferType buffer_type; TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED; - bool allocate = true; }; typedef BufferConfig InterleavedBufferConfig; @@ -124,7 +116,6 @@ struct ShardedBufferConfig { BufferType buffer_type = BufferType::L1; TensorMemoryLayout buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED; ShardSpecBuffer shard_parameters; - bool allocate = true; }; } // namespace v0 @@ -147,77 +138,41 @@ struct BufferPageMapping { inline namespace v0 { -class Buffer { +class Buffer final { public: - Buffer() : - device_(nullptr), - buffer_type_(BufferType::DRAM), - buffer_layout_(TensorMemoryLayout::INTERLEAVED), - shard_parameters_(std::nullopt), - bottom_up_(std::nullopt), - allocate_(true) {} - - Buffer( + static std::shared_ptr create( Device *device, DeviceAddr size, DeviceAddr page_size, - const BufferType buffer_type, - const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, + BufferType buffer_type, + TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, const std::optional& shard_parameter = std::nullopt, - const std::optional bottom_up = std::nullopt, - bool allocate = true); - - Buffer(const Buffer &other); - Buffer &operator=(const Buffer &other); + std::optional bottom_up = std::nullopt); - Buffer(Buffer &&other); - Buffer &operator=(Buffer &&other); + Buffer(const Buffer &other) = delete; + Buffer &operator=(const Buffer &other) = delete; + Buffer(Buffer &&other) = delete; + Buffer &operator=(Buffer &&other) = delete; - virtual ~Buffer(); Device *device() const { return device_; } + DeviceAddr size() const { return size_; } + bool is_allocated() const; - DeviceAddr size() const { return static_cast(size_); } - - void set_size(DeviceAddr size) { size_ = size; this->buffer_page_mapping_ = nullptr; } // Returns address of buffer in the first bank - uint32_t address() const { return static_cast(address_); } - - void set_address(uint64_t addr) { address_ = addr; } + uint32_t address() const; - DeviceAddr page_size() const { return page_size_; } + DeviceAddr page_size() const; + void set_page_size(DeviceAddr page_size); - uint32_t num_pages() const { return this->size() / this->page_size(); } - - void set_page_size(DeviceAddr page_size) { - TT_FATAL(size_ % page_size == 0, "buffer size must be divisible by new page size"); - page_size_ = page_size; - this->buffer_page_mapping_ = nullptr; - } - - uint32_t num_dev_pages() const { - if (!is_sharded(this->buffer_layout_)) { - return this->num_pages(); - } else { - return this->shard_spec().size() * this->num_cores(); - } - } + uint32_t num_pages() const; + uint32_t num_dev_pages() const; BufferType buffer_type() const { return buffer_type_; } - CoreType core_type() const { - switch (this->buffer_type_) { - case BufferType::DRAM: - return CoreType::DRAM; - case BufferType::L1: - case 
BufferType::L1_SMALL: - return CoreType::WORKER; - default: - TT_THROW("Unknown CoreType for buffer"); - } - } + CoreType core_type() const; - bool is_l1() const { return buffer_type() == BufferType::L1 or buffer_type() == BufferType::L1_SMALL; } - bool is_dram() const { return buffer_type() == BufferType::DRAM || buffer_type() == BufferType::TRACE; } - bool is_trace() const { return buffer_type() == BufferType::TRACE; } + bool is_l1() const; + bool is_dram() const; + bool is_trace() const; TensorMemoryLayout buffer_layout() const { return buffer_layout_; } @@ -233,56 +188,64 @@ class Buffer { DeviceAddr page_address(uint32_t bank_id, uint32_t page_index) const; uint32_t alignment() const; - - DeviceAddr aligned_page_size() const { return align(page_size_, this->alignment());} - - DeviceAddr aligned_size() const { return this->num_dev_pages() * this->aligned_page_size(); } + DeviceAddr aligned_page_size() const; + DeviceAddr aligned_size() const; // SHARDED API STARTS HERE // TODO: WILL SEPARATE INTO SHARDED BUFFER CLASS DeviceAddr sharded_page_address(uint32_t bank_id, uint32_t page_index) const; - ShardSpecBuffer shard_spec() const { - TT_ASSERT(is_sharded(this->buffer_layout_), "Buffer not sharded"); - TT_ASSERT(shard_parameters_.has_value()); - return this->shard_parameters_.value(); - } + ShardSpecBuffer shard_spec() const; + void set_shard_spec(const ShardSpecBuffer& shard_spec); - void set_shard_spec(const ShardSpecBuffer& shard_spec) { - this->shard_parameters_ = shard_spec; - this->buffer_page_mapping_ = nullptr; - } - - uint32_t num_cores() const { - if (!is_sharded(this->buffer_layout_)) - return 1; - else { - return this->shard_spec().tensor_shard_spec.grid.num_cores(); - } - } + uint32_t num_cores() const; const std::shared_ptr& get_buffer_page_mapping(); private: - virtual void allocate(); - - virtual void deallocate(); + Buffer( + Device *device, + DeviceAddr size, + DeviceAddr page_size, + BufferType buffer_type, + TensorMemoryLayout buffer_layout, + const std::optional& shard_parameter, + std::optional bottom_up); + + enum class AllocationStatus : uint8_t { + ALLOCATION_REQUESTED, + ALLOCATED, + DEALLOCATED, + }; + + // Deallocate is allowed to be called multiple times on the same buffer + void deallocate(); + static void deleter(Buffer* buffer); + void deallocate_impl(); friend void DeallocateBuffer(Buffer &buffer); DeviceAddr translate_page_address(uint64_t offset, uint32_t bank_id) const; - Device *device_; - DeviceAddr size_; // Size in bytes - DeviceAddr address_; // Address of buffer - DeviceAddr page_size_; // Size of unit being interleaved. For non-interleaved buffers: size == page_size - BufferType buffer_type_; - TensorMemoryLayout buffer_layout_; + Device * const device_; + const DeviceAddr size_; // Size in bytes + const BufferType buffer_type_; + const TensorMemoryLayout buffer_layout_; + const std::optional bottom_up_; + + std::atomic allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED; + DeviceAddr address_ = 0; + mutable std::mutex allocation_mutex_; + mutable std::condition_variable allocation_cv_; + // Used exclusively for is_allocated() method + std::atomic deallocation_requested_ = false; + + // These members must be only accessed on the device worker thread + DeviceAddr page_size_; // Size of unit being interleaved. 
For non-interleaved buffers: size == page_size std::optional shard_parameters_; std::shared_ptr buffer_page_mapping_; - bool allocate_ = true; - protected: - std::optional bottom_up_; + + std::weak_ptr weak_self; }; } // namespace v0 @@ -335,13 +298,7 @@ using HostDataType = std::variant< namespace tt::stl::json { template <> -struct from_json_t { - auto operator()(const nlohmann::json &json_object) const { - return tt::tt_metal::ShardSpec{ - from_json(json_object.at("grid")), - from_json>(json_object.at("shape")), - from_json(json_object.at("orientation")), - from_json(json_object.at("halo"))}; - } +struct from_json_t { + tt_metal::ShardSpec operator()(const nlohmann::json &json_object) const; }; } // namespace tt::stl::json diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 47ce59b1383..7ba127c7803 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -3266,12 +3266,8 @@ CommandQueue &Device::command_queue(size_t cq_id) { return *sw_command_queues_[cq_id]; } -void Device::push_work(std::function&& work, bool blocking) { - this->work_executor.push_work(work, blocking); -} - -void Device::push_work(std::shared_ptr> work, bool blocking) { - this->work_executor.push_work(work, blocking); +bool Device::can_use_passthrough_scheduling() const { + return this->work_executor.use_passthrough(); } void Device::synchronize() { diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 5ee266d37b7..7beb58f3ea8 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -257,8 +257,11 @@ class Device { friend bool CloseDevice(Device *device); // APIs to access this device's work executor - void push_work(std::function&& work, bool blocking = false); - void push_work(std::shared_ptr> work, bool blocking = false); + bool can_use_passthrough_scheduling() const; + template + void push_work(F&& work, bool blocking = false) { + this->work_executor.push_work(std::forward(work), blocking); + } void synchronize(); void set_worker_mode(const WorkExecutorMode& mode); void enable_async(bool enable); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 010ad9e4359..656c740ccee 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2789,62 +2789,6 @@ void EnqueueGetBufferAddr(CommandQueue& cq, uint32_t* dst_buf_addr, const Buffer .type = EnqueueCommandType::GET_BUF_ADDR, .blocking = blocking, .shadow_buffer = buffer, .dst = dst_buf_addr}); } -void EnqueueAllocateBufferImpl(AllocBufferMetadata alloc_md) { - Buffer* buffer = alloc_md.buffer; - uint32_t allocated_addr; - if (is_sharded(buffer->buffer_layout())) { - allocated_addr = allocator::allocate_buffer( - *(buffer->device()->allocator_), - buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(), - buffer->page_size(), - buffer->buffer_type(), - alloc_md.bottom_up, - buffer->num_cores()); - } else { - allocated_addr = allocator::allocate_buffer( - *(buffer->device()->allocator_), - buffer->size(), - buffer->page_size(), - buffer->buffer_type(), - alloc_md.bottom_up, - std::nullopt); - } - TT_ASSERT(allocated_addr <= std::numeric_limits::max()); - buffer->set_address(static_cast(allocated_addr)); -} - -void EnqueueAllocateBuffer(CommandQueue& cq, Buffer* buffer, bool bottom_up, bool blocking) { - auto alloc_md = AllocBufferMetadata{ - .buffer = buffer, - .allocator = *(buffer->device()->allocator_), - .bottom_up = bottom_up, - }; - 
cq.run_command(CommandInterface{ - .type = EnqueueCommandType::ALLOCATE_BUFFER, - .blocking = blocking, - .alloc_md = alloc_md, - }); -} - -void EnqueueDeallocateBufferImpl(AllocBufferMetadata alloc_md) { - allocator::deallocate_buffer(alloc_md.allocator, alloc_md.device_address, alloc_md.buffer_type); -} - -void EnqueueDeallocateBuffer( - CommandQueue& cq, Allocator& allocator, uint32_t device_address, BufferType buffer_type, bool blocking) { - // Need to explictly pass in relevant buffer attributes here, since the Buffer* ptr can be deallocated a this point - auto alloc_md = AllocBufferMetadata{ - .allocator = allocator, - .buffer_type = buffer_type, - .device_address = device_address, - }; - cq.run_command(CommandInterface{ - .type = EnqueueCommandType::DEALLOCATE_BUFFER, - .blocking = blocking, - .alloc_md = alloc_md, - }); -} - inline namespace v0 { void EnqueueReadBuffer( @@ -3173,13 +3117,13 @@ void CommandQueue::run_worker() { } } -void CommandQueue::run_command(const CommandInterface& command) { +void CommandQueue::run_command(CommandInterface&& command) { log_trace(LogDispatch, "{} received {} in {} mode", this->name(), command.type, this->mode); if (this->async_mode()) { if (std::hash{}(std::this_thread::get_id()) == parent_thread_id) { // In async mode when parent pushes cmd, feed worker through queue. - this->worker_queue.push(command); bool blocking = command.blocking.has_value() and *command.blocking; + this->worker_queue.push(std::move(command)); if (blocking) { TT_ASSERT(not this->trace_mode(), "Blocking commands cannot be traced!"); this->wait_until_empty(); @@ -3193,7 +3137,7 @@ void CommandQueue::run_command(const CommandInterface& command) { } } else if (this->trace_mode()) { // In trace mode push to the trace queue - this->worker_queue.push(command); + this->worker_queue.push(std::move(command)); } else if (this->passthrough_mode()) { this->run_command_impl(command); } else { @@ -3216,14 +3160,6 @@ void CommandQueue::run_command_impl(const CommandInterface& command) { TT_ASSERT(command.blocking.has_value(), "Must specify blocking value!"); EnqueueWriteBufferImpl(*this, command.buffer.value(), command.src.value(), command.blocking.value()); break; - case EnqueueCommandType::ALLOCATE_BUFFER: - TT_ASSERT(command.alloc_md.has_value(), "Must provide buffer allocation metdata!"); - EnqueueAllocateBufferImpl(command.alloc_md.value()); - break; - case EnqueueCommandType::DEALLOCATE_BUFFER: - TT_ASSERT(command.alloc_md.has_value(), "Must provide buffer allocation metdata!"); - EnqueueDeallocateBufferImpl(command.alloc_md.value()); - break; case EnqueueCommandType::GET_BUF_ADDR: TT_ASSERT(command.dst.has_value(), "Must provide a dst address!"); TT_ASSERT(command.shadow_buffer.has_value(), "Must provide a shadow buffer!"); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 821a806ff9c..64f6c5407b7 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -36,8 +36,6 @@ using RuntimeArgs = std::vector>; enum class EnqueueCommandType { ENQUEUE_READ_BUFFER, ENQUEUE_WRITE_BUFFER, - ALLOCATE_BUFFER, - DEALLOCATE_BUFFER, GET_BUF_ADDR, ADD_BUFFER_TO_PROGRAM, SET_RUNTIME_ARGS, @@ -577,8 +575,6 @@ class HWCommandQueue { std::variant, std::shared_ptr> buffer, HostDataType src, bool blocking); - friend void EnqueueAllocateBufferImpl(AllocBufferMetadata alloc_md); - friend void EnqueueDeallocateBufferImpl(AllocBufferMetadata alloc_md); friend void EnqueueGetBufferAddrImpl(void* dst_buf_addr, 
const Buffer* buffer); friend void EnqueueRecordEventImpl(CommandQueue& cq, const std::shared_ptr& event); friend void EnqueueWaitForEventImpl(CommandQueue& cq, const std::shared_ptr& event); @@ -637,7 +633,7 @@ class CommandQueue { // Schedule a command to be run on the device // Blocking if in passthrough mode. Non-blocking if in async mode - void run_command(const CommandInterface& command); + void run_command(CommandInterface&& command); // API for setting/getting the mode of the command queue void set_mode(const CommandQueueMode& mode); @@ -696,9 +692,6 @@ class CommandQueue { // Primitives used to place host only operations on the SW Command Queue. // These are used in functions exposed through tt_metal.hpp or host_api.hpp -void EnqueueAllocateBuffer(CommandQueue& cq, Buffer* buffer, bool bottom_up, bool blocking); -void EnqueueDeallocateBuffer( - CommandQueue& cq, Allocator& allocator, uint32_t device_address, BufferType buffer_type, bool blocking); void EnqueueGetBufferAddr(CommandQueue& cq, uint32_t* dst_buf_addr, const Buffer* buffer, bool blocking); void EnqueueSetRuntimeArgs( CommandQueue& cq, diff --git a/tt_metal/impl/dispatch/lock_free_queue.hpp b/tt_metal/impl/dispatch/lock_free_queue.hpp index 6c65b932606..1f99cafc427 100644 --- a/tt_metal/impl/dispatch/lock_free_queue.hpp +++ b/tt_metal/impl/dispatch/lock_free_queue.hpp @@ -64,7 +64,7 @@ class LockFreeQueue { parent_thread_id.store(other.parent_thread_id.load()); } - inline void push(const T& value) { + inline void push(T&& value) { // Legacy Push API allowing copy by value // for object T. @@ -74,7 +74,7 @@ class LockFreeQueue { // has progressed (data has been read). lock_func(); while(tail.load()->next == head.load()) {}; - tail.load()->data = std::make_shared(value); + tail.load()->data = std::make_shared(std::move(value)); tail.store(tail.load()->next); unlock_func(); } @@ -89,7 +89,7 @@ class LockFreeQueue { // has progressed (data has been read). lock_func(); while(tail.load()->next == head.load()) {}; - tail.load()->data = value; + tail.load()->data = std::move(value); tail.store(tail.load()->next); unlock_func(); } diff --git a/tt_metal/impl/dispatch/work_executor.hpp b/tt_metal/impl/dispatch/work_executor.hpp index d6dd8d3e4a1..c9389ac210f 100644 --- a/tt_metal/impl/dispatch/work_executor.hpp +++ b/tt_metal/impl/dispatch/work_executor.hpp @@ -138,37 +138,21 @@ class WorkExecutor { } } - inline void push_work(const std::function& work_executor, bool blocking = false) { - ZoneScopedN("PushWork"); - if (std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() or - not(this->worker_state == WorkerState::RUNNING)) { - // Worker is pushing to itself (nested work) or worker thread is not running. Execute work in current - // thread. - work_executor(); - } else { - // Push to worker queue. - this->worker_queue.push(work_executor); - { - std::lock_guard lock(this->cv_mutex); - cv.notify_one(); - } - if (blocking) { - this->synchronize(); - } - } + inline bool use_passthrough() const { + return std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() || + this->worker_state != WorkerState::RUNNING; } - inline void push_work(std::shared_ptr> work_executor, bool blocking = false) { - // Latest push API, passing ptrs around for work container. Usually faster, since no data-copies. 
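// A small usage sketch (illustrative, not from this diff) for the templated push_work path added
// below: any callable is forwarded into the worker queue, and it runs inline instead when
// use_passthrough()/can_use_passthrough_scheduling() is true (the caller is the worker thread
// itself, or the worker is not running). The helper name is hypothetical.
inline int example_push_work(Device* device) {
    int value = 0;
    device->push_work([&value] { value = 42; }, /*blocking=*/true);  // returns once the lambda ran
    return value;
}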
+ template + inline void push_work(F&& work_executor, bool blocking = false) { ZoneScopedN("PushWork"); - if (std::this_thread::get_id() == this->worker_queue.worker_thread_id.load() or - not(this->worker_state == WorkerState::RUNNING)) { + if (use_passthrough()) { // Worker is pushing to itself (nested work) or worker thread is not running. Execute work in current // thread. - (*work_executor)(); + work_executor(); } else { // Push to worker queue. - this->worker_queue.push(work_executor); + this->worker_queue.push(std::forward(work_executor)); { std::lock_guard lock(this->cv_mutex); cv.notify_one(); @@ -220,11 +204,11 @@ class WorkExecutor { this->worker_queue_mode = mode; } - WorkerQueueMode get_worker_queue_mode() { return worker_queue_mode; } + WorkerQueueMode get_worker_queue_mode() const { return worker_queue_mode; } - inline std::thread::id get_parent_thread_id() { return this->worker_queue.parent_thread_id; } + inline std::thread::id get_parent_thread_id() const { return this->worker_queue.parent_thread_id; } - inline std::thread::id get_worker_thread_id() { return this->worker_queue.worker_thread_id; } + inline std::thread::id get_worker_thread_id() const { return this->worker_queue.worker_thread_id; } private: std::thread worker_thread; diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index ffb8fb6a499..fba32d32c69 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -761,7 +761,7 @@ void Program::populate_dispatch_data(Device *device) { if (binaries_data.size() > 0) { // We allocate program binaries top down to minimize fragmentation with other buffers in DRAM, which are typically allocated bottom up - this->kernels_buffer = std::make_shared( + this->kernels_buffer = Buffer::create( device, binaries_data.size() * sizeof(uint32_t), HostMemDeviceCommand::PROGRAM_PAGE_SIZE, BufferType::DRAM, TensorMemoryLayout::INTERLEAVED, std::nullopt, false); this->program_transfer_info.binary_data = binaries_data; diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index 4a84a4aa484..aaeea3d05b7 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -85,7 +85,7 @@ void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr tra cq.device()->trace_buffers_size <= cq.device()->allocator_->config.trace_region_size, "Creating trace buffers of size {}B on device {}, but only {}B is allocated for trace region.", cq.device()->trace_buffers_size, cq.device()->id(), cq.device()->allocator_->config.trace_region_size); // Commit trace to device DRAM - trace_buffer->buffer = std::make_shared( + trace_buffer->buffer = Buffer::create( cq.device(), padded_size, page_size, BufferType::TRACE, TensorMemoryLayout::INTERLEAVED); EnqueueWriteBuffer(cq, trace_buffer->buffer, trace_data, kBlocking); log_trace( diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index fa7b1972d51..fca037ad2a1 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -378,7 +378,7 @@ void WriteToDeviceSharded(Buffer &buffer, const std::vector &host_buff buffer.size()); uint32_t page_size = buffer.page_size(); - TT_ASSERT(buffer.size() % page_size == 0); + TT_ASSERT(page_size == 0 ? 
buffer.size() == 0 : buffer.size() % page_size == 0); static constexpr uint32_t bytes_per_page_entry = sizeof(uint32_t); TT_ASSERT(page_size % bytes_per_page_entry == 0); @@ -412,12 +412,7 @@ void WriteToDeviceInterleavedContiguous(const Buffer &buffer, const std::vector< buffer.size()); uint32_t page_size = buffer.page_size(); - TT_FATAL( - buffer.size() % page_size == 0, - "Invalid buffer size: {}. Buffer size must be a multiple of page size {}.", - buffer.size(), - page_size); - uint32_t num_pages = buffer.size() / page_size; + uint32_t num_pages = buffer.num_pages(); static constexpr uint32_t bytes_per_page_entry = sizeof(uint32_t); TT_FATAL(page_size % bytes_per_page_entry == 0, @@ -481,12 +476,7 @@ void WriteToBuffer(Buffer &buffer, const std::vector &host_buffer) { void ReadFromDeviceInterleavedContiguous(const Buffer &buffer, std::vector &host_buffer) { host_buffer.clear(); // overwrite the data uint32_t page_size = buffer.page_size(); - TT_FATAL( - buffer.size() % page_size == 0, - "Invalid buffer size: {}. Buffer size must be a multiple of page size {}.", - buffer.size(), - page_size); - uint32_t num_pages = buffer.size() / page_size; + uint32_t num_pages = buffer.num_pages(); auto device = buffer.device(); auto num_banks = device->num_banks(buffer.buffer_type()); @@ -811,13 +801,35 @@ void CompileProgram(Device *device, Program &program, bool fd_bootloader_mode) { program.compile(device, fd_bootloader_mode); } -void AllocateBuffer(Buffer *buffer, bool bottom_up) { +DeviceAddr AllocateBuffer(const Buffer *buffer, bool bottom_up) { if(GraphTracker::instance().hook_allocate(buffer, bottom_up)) { GraphTracker::instance().track_allocate(buffer, bottom_up); - return; + return 0; } - EnqueueAllocateBuffer(buffer->device()->command_queue(), buffer, bottom_up, false); + + uint32_t allocated_addr; + if (is_sharded(buffer->buffer_layout())) { + allocated_addr = allocator::allocate_buffer( + *(buffer->device()->allocator_), + buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(), + buffer->page_size(), + buffer->buffer_type(), + bottom_up, + buffer->num_cores()); + } else { + allocated_addr = allocator::allocate_buffer( + *(buffer->device()->allocator_), + buffer->size(), + buffer->page_size(), + buffer->buffer_type(), + bottom_up, + std::nullopt); + } + TT_ASSERT(allocated_addr <= std::numeric_limits::max()); + GraphTracker::instance().track_allocate(buffer, bottom_up); + + return allocated_addr; } void DeallocateBuffer(Buffer *buffer) { @@ -825,12 +837,8 @@ void DeallocateBuffer(Buffer *buffer) { if(GraphTracker::instance().hook_deallocate(buffer)) { return; } - EnqueueDeallocateBuffer( - buffer->device()->command_queue(), - *(buffer->device()->allocator_), - buffer->address(), - buffer->buffer_type(), - false); + + allocator::deallocate_buffer(*buffer->device()->allocator_, buffer->address(), buffer->buffer_type()); } } // namespace detail @@ -1045,20 +1053,19 @@ uint32_t CreateSemaphore( } std::shared_ptr CreateBuffer(const InterleavedBufferConfig &config) { - return std::make_shared( - config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt, config.allocate); + return Buffer::create( + config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt); } std::shared_ptr CreateBuffer(const ShardedBufferConfig &config) { - return std::make_shared( + return Buffer::create( config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, 
config.shard_parameters, - std::nullopt, - config.allocate); + std::nullopt); } void DeallocateBuffer(Buffer &buffer) { buffer.deallocate(); } diff --git a/ttnn/cpp/ttnn/async_runtime.cpp b/ttnn/cpp/ttnn/async_runtime.cpp index 29845f66316..814db64804b 100644 --- a/ttnn/cpp/ttnn/async_runtime.cpp +++ b/ttnn/cpp/ttnn/async_runtime.cpp @@ -8,67 +8,6 @@ #include "ttnn/tensor/tensor_impl_wrapper.hpp" namespace ttnn { -using DeviceBuffer = std::shared_ptr; -using queue_id = uint8_t; - -DeviceBuffer allocate_interleaved_buffer_on_device( - size_t buffer_size_bytes, - Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const MemoryConfig& memory_config, - const std::optional& tile) { - uint32_t page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); -} - -DeviceBuffer allocate_contiguous_buffer_on_device( - size_t buffer_size_bytes, Device* device, const MemoryConfig& memory_config) { - return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); -} - -DeviceBuffer allocate_sharded_buffer_on_device( - size_t buffer_size_bytes, - Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const ShardSpecBuffer& shard_params, - const MemoryConfig& memory_config, - const std::optional& tile) { - tt::tt_metal::tensor_impl::validate_sharded_buffer_allocation( - shape, layout, data_type, shard_params, memory_config, tile); - const auto& page_shape = shard_params.page_shape; - uint32_t size_of_element = tt::tt_metal::tensor_impl::element_size_bytes(data_type); - uint32_t page_size = page_shape[0] * page_shape[1] * size_of_element; - if (layout == Layout::TILE) { - page_size = tt::tt_metal::tensor_impl::get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - } - - return std::make_shared( - device, buffer_size_bytes, page_size, memory_config.buffer_type, memory_config.memory_layout, shard_params); -} - -DeviceBuffer allocate_buffer_on_device( - size_t buffer_size_bytes, - types::Device* device, - const ttnn::SimpleShape& shape, - DataType data_type, - Layout layout, - const MemoryConfig& memory_config, - const std::optional& shard_spec, - const std::optional& tile) { - if (memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { - return allocate_interleaved_buffer_on_device( - buffer_size_bytes, device, shape, data_type, layout, memory_config, tile); - } else if (memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::SINGLE_BANK) { - return allocate_contiguous_buffer_on_device(buffer_size_bytes, device, memory_config); - } else { - return allocate_sharded_buffer_on_device( - buffer_size_bytes, device, shape, data_type, layout, shard_spec.value(), memory_config, tile); - } -} void write_buffer( queue_id cq_id, diff --git a/ttnn/cpp/ttnn/async_runtime.hpp b/ttnn/cpp/ttnn/async_runtime.hpp index 2be1296bc56..5700f688a66 100644 --- a/ttnn/cpp/ttnn/async_runtime.hpp +++ b/ttnn/cpp/ttnn/async_runtime.hpp @@ -9,11 +9,8 @@ #include "types.hpp" namespace ttnn { - using DeviceBuffer = std::shared_ptr; using queue_id = uint8_t; - DeviceBuffer allocate_buffer_on_device(size_t buffer_size_bytes, types::Device* device, const ttnn::SimpleShape& shape, DataType data_type, Layout layout, const MemoryConfig& memory_config, const std::optional& shard_spec = std::nullopt, const std::optional& tile = std::nullopt); - void 
write_buffer(queue_id cq_id, Tensor& dst, std::vector> src, const std::optional transfer_size = std::nullopt); void read_buffer(queue_id cq_id, Tensor& src, std::vector> dst, const std::optional transfer_size = std::nullopt, size_t src_offset = 0, bool blocking = true); diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp index 76a2668a9dc..bebeadebd9d 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.cpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp @@ -90,7 +90,7 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) { end_function_any_map[typeid(std::reference_wrapper)] = [ptr = this] (const std::any& val) mutable {ptr->end_function_process_tensor(val);}; } -void GraphProcessor::track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) { +void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { const std::lock_guard lock(mutex); auto buf_id = add_buffer(buffer); @@ -299,7 +299,7 @@ int GraphProcessor::add_tensor(const Tensor& t) { return tensor_counter; } -int GraphProcessor::add_buffer(tt::tt_metal::Buffer* buffer) { +int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) { auto buffer_alloc_id = reinterpret_cast(buffer); auto counter = id_to_counter.count(buffer_alloc_id) > 0 ? id_to_counter[buffer_alloc_id] : graph.size(); if (id_to_counter.count(buffer_alloc_id) == 0) { @@ -478,7 +478,7 @@ nlohmann::json GraphProcessor::end_graph_capture() { return res; } -bool ProcessorHooks::hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) { +bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { return do_block; } diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp index fa614e5d63a..4f7d6f1b6e7 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.hpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp @@ -22,7 +22,7 @@ namespace ttnn::graph { public: ProcessorHooks() = default; - bool hook_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) override; + bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; bool hook_deallocate(tt::tt_metal::Buffer* buffer) override; @@ -40,7 +40,7 @@ namespace ttnn::graph { GraphProcessor(tt::tt_metal::IGraphProcessor::RunMode mode); ~GraphProcessor() override; - void track_allocate(tt::tt_metal::Buffer* buffer, bool bottom_up) override; + void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; void track_deallocate(tt::tt_metal::Buffer* buffer) override; @@ -80,7 +80,7 @@ namespace ttnn::graph { std::unordered_map end_function_any_map; int add_tensor(const Tensor& t); - int add_buffer(tt::tt_metal::Buffer* buffer); + int add_buffer(const tt::tt_metal::Buffer* buffer); void begin_function_process_ref_vector(const std::any& any_val); void begin_function_process_ref_vector_optional(const std::any& any_val); diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp index 89cca953ca9..6330db7c537 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp @@ -48,7 +48,7 @@ uint32_t EriscDatamoverConfig::compute_buffer_size(std::size_t num_edm_channels, log_trace(tt::LogOp, "Buffer size: {}", buffer_size); - TT_ASSERT(buffer_size > 0 && buffer_size % page_size == 0); + TT_ASSERT(page_size == 0 ? 
buffer_size == 0 : buffer_size % page_size == 0); return buffer_size; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index 6d04e833e2d..b8d2756443d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -182,7 +182,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); Device *device = a.device(); - auto dst_buffer_l1 = Buffer(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); + auto dst_buffer_l1 = Buffer::create(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); // construct const buffer with the pad_value uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 @@ -243,7 +243,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); tt::log_debug("packed_pad_value: {}", packed_pad_value); - tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1.address()); + tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1->address()); } #endif @@ -263,7 +263,7 @@ operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, pad_value_const_tensor_addr, pad_value_const_buffer_nbytes, packed_pad_value, - dst_buffer_l1.address()}; + dst_buffer_l1->address()}; tt::tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, diff --git a/ttnn/cpp/ttnn/run_operation_inl.hpp b/ttnn/cpp/ttnn/run_operation_inl.hpp index df2025b17c5..db57a4e7d2f 100644 --- a/ttnn/cpp/ttnn/run_operation_inl.hpp +++ b/ttnn/cpp/ttnn/run_operation_inl.hpp @@ -247,8 +247,8 @@ void launch_op( }); for (auto target_device : workers) { - target_device->push_work(std::make_shared>( - [target_device, work_lambda]() mutable { (*work_lambda)(target_device); })); + target_device->push_work( + [target_device, work_lambda]() mutable { (*work_lambda)(target_device); }); } } diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index 2e57a99e1b6..d689237d5e9 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -261,7 +261,7 @@ void Tensor::deallocate(bool force) { : this->tensor_attributes->main_thread_ref_count; if ((force or ref_count_to_use == 1) and not this->tensor_attributes->deallocated) { this->tensor_attributes->deallocated = true; - this->workers.at(0)->push_work(std::make_shared>( + this->workers.at(0)->push_work( [force, attr = this->tensor_attributes]() mutable { // Cross worker synchronization: If the tensor being deallocated is shared across // workers (ex: all_gather op), wait until all workers are done with this tensor @@ -297,7 +297,7 @@ void Tensor::deallocate(bool force) { } }, attr->storage); - })); + }); } } else { TT_FATAL( @@ -339,8 +339,8 @@ void Tensor::deallocate(bool force) { }); for (auto worker : this->workers) { - worker->push_work(std::make_shared>( - [worker, dealloc_lambda]() mutable { (*dealloc_lambda)(worker); })); + worker->push_work( + [worker, dealloc_lambda]() mutable { (*dealloc_lambda)(worker); }); } } } else { diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index b3a013e1b38..bea72879c2d 100644 --- 
a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -45,7 +45,6 @@ uint32_t element_size_bytes(DataType dtype) { } uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, const ttnn::SimpleShape& shape, const std::optional& tile) { - uint32_t W = shape[-1]; uint32_t page_size = 0; const auto tile_HW = tile.has_value() ? tile->get_tile_hw() : constants::TILE_HW; const auto bfloat8b_tile_HW = tile.has_value() ? tile_HW + 64 : constants::BFLOAT8_B_TILE_HW; @@ -53,6 +52,7 @@ uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, switch (layout) { case Layout::ROW_MAJOR: { uint32_t size_of_element = element_size_bytes(dtype); + uint32_t W = shape.rank() == 0 ? 1 : shape[-1]; page_size = W * size_of_element; } break; case Layout::TILE: { @@ -82,11 +82,10 @@ uint32_t get_page_size(DataType dtype, Layout layout, uint32_t total_size_bytes, } break; default: TT_ASSERT(false && "Unsupported data type!"); } - TT_ASSERT(total_size_bytes % page_size == 0); + TT_ASSERT(page_size == 0 ? total_size_bytes == 0 : total_size_bytes % page_size == 0); } break; default: TT_ASSERT(false && "Unsupported layout to write to device"); } - TT_ASSERT(page_size != 0); return page_size; } @@ -191,12 +190,12 @@ DeviceBuffer allocate_interleaved_buffer_on_device( const MemoryConfig& memory_config, const std::optional& tile) { uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, shape, tile); - return std::make_shared(device, buffer_size_bytes, page_size, memory_config.buffer_type); + return Buffer::create(device, buffer_size_bytes, page_size, memory_config.buffer_type); } DeviceBuffer allocate_contiguous_buffer_on_device( size_t buffer_size_bytes, Device* device, const MemoryConfig& memory_config) { - return std::make_shared(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); + return Buffer::create(device, buffer_size_bytes, buffer_size_bytes, memory_config.buffer_type); } DeviceBuffer allocate_sharded_buffer_on_device( @@ -212,7 +211,7 @@ DeviceBuffer allocate_sharded_buffer_on_device( const auto& page_shape = ttnn::SimpleShape(shard_params.page_shape); uint32_t page_size = get_page_size(data_type, layout, buffer_size_bytes, page_shape, tile); - return std::make_shared( + return Buffer::create( device, buffer_size_bytes, page_size, memory_config.buffer_type, memory_config.memory_layout, shard_params); } diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index c8c9e0bd782..e63a37fb791 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -26,9 +26,8 @@ namespace{ inline void SynchronizeWorkerThreads(const std::vector& workers) { // Push empty work to threads and ensure its been picked up - static auto empty_work = std::make_shared>([](){}); for (auto target_device : workers) { - target_device->work_executor.push_work(empty_work); + target_device->work_executor.push_work([](){}); } // Block until work has been picked up, to flush the queue for (auto target_device : workers) { diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index 77b01638c11..4364414bc76 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -400,7 +400,7 @@ struct DeviceStorage { static constexpr auto attribute_names = std::forward_as_tuple("memory_config"); const auto attribute_values() const { return std::make_tuple(this->memory_config()); } - inline bool is_allocated() const { return buffer && 
buffer->size() > 0; } + inline bool is_allocated() const { return buffer && buffer->is_allocated(); } }; using BorrowedBuffer = std::variant< @@ -844,7 +844,7 @@ struct MultiDeviceStorage { return std::all_of( ordered_device_ids.begin(), ordered_device_ids.end(), [&buffers = this->buffers](auto &&device_id) { const auto &buffer = buffers.at(device_id); - return buffer && buffer->size() > 0; + return buffer && buffer->is_allocated(); }); } }; diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp index 615af9aa442..72e212d1c21 100644 --- a/ttnn/cpp/ttnn/types.hpp +++ b/ttnn/cpp/ttnn/types.hpp @@ -60,77 +60,7 @@ struct CoreGrid { } }; -// Keep track of live buffers and the device addresses they were assigned. -// When a buffer is created, it is provided a buffer_id using get_buf_id(). -// The address for this buffer is assigned to buffer_id when the buffer is asynchronously allocated. -// When the buffer destructor is called, or the buffer is asynchronously deallocated, the worker thread -// will look up the address for buffer_id to free memory on device. -class buffer_address_map { - public: - void insert(uint32_t buf_id, uint32_t buf_addr) { - std::scoped_lock lock(this->map_mutex); - this->buf_id_to_address_map.insert({buf_id, buf_addr}); - } - void erase(uint32_t buf_id) { - std::scoped_lock lock(this->map_mutex); - this->buf_id_to_address_map.erase(buf_id); - } - uint32_t buffer_address(uint32_t buf_id) { - std::scoped_lock lock(this->map_mutex); - return this->buf_id_to_address_map.at(buf_id); - } - uint32_t get_buf_id() { - return buf_id++; - } - - private: - std::atomic buf_id = 0; - std::mutex map_mutex; - std::unordered_map buf_id_to_address_map = {}; -}; - -inline buffer_address_map GLOBAL_BUFFER_ADDRESS_MAP; - -// This buffer class is compatible with multithreaded runtime (which lives in tt_eager) -// It is derived from the tt_metal::Buffer class, but defines its own asynchronous allocation functions -class Buffer : public tt::tt_metal::Buffer { - public: - Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type, - const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED, - std::optional shard_parameters = std::nullopt, std::optional bottom_up = std::nullopt - ) : tt::tt_metal::Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, false) { - this->buffer_id = GLOBAL_BUFFER_ADDRESS_MAP.get_buf_id(); // Each buffer has a unique ID - this->allocate(); - } - ~Buffer() { - this->deallocate(); - } - private: - uint32_t buffer_id = 0; - void allocate() { - TT_ASSERT(this->device()); - this->device()->push_work([this] () mutable { - bool bottom_up = this->bottom_up_.value_or(this->is_dram()); - tt::tt_metal::detail::AllocateBuffer(this, bottom_up); - // The address inserted here, will be used during asynchronous deallocate - GLOBAL_BUFFER_ADDRESS_MAP.insert(this->buffer_id, this->address()); - - }); - } - void deallocate() { - if (this->device() == nullptr or not this->device()->initialized_ or this->size() == 0) { - return; - } - this->set_size(0); - TT_ASSERT(this->device()->allocator_ != nullptr, "Expected allocator to be initialized!"); - // Extract the required buffer attributes from main thread (these are guaranteed to be correctly populated) and send to worker - this->device()->push_work([dev = this->device(), id = this->buffer_id, type = this->buffer_type()] () mutable { - // At this point, the address for this buffer has made it to GLOBAL_BUFFER_ADDRESS_MAP, since the worker has allocated 
the buffer.
- tt::tt_metal::allocator::deallocate_buffer(*(dev->allocator_), GLOBAL_BUFFER_ADDRESS_MAP.buffer_address(id), type);
- GLOBAL_BUFFER_ADDRESS_MAP.erase(id);
- });
- }
-};
+using Buffer = tt::tt_metal::Buffer;
 static std::ostream &operator<<(std::ostream &os, const CoreGrid &core_grid) {
 os << "ttnn.CoreGrid(x=" <
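A minimal usage sketch of the post-patch Buffer interface follows. Only Buffer::create, address(), deallocate(), BufferType, TensorMemoryLayout, and the plain-lambda overload of push_work are taken from the hunks above; the header paths and the CreateDevice/CloseDevice bring-up calls are assumptions added for illustration, not part of this patch.

#include <memory>

#include "tt_metal/host_api.hpp"              // assumed location of CreateDevice/CloseDevice
#include "tt_metal/impl/buffers/buffer.hpp"   // Buffer::create, per the hunks above

using namespace tt::tt_metal;

int main() {
    Device* device = CreateDevice(0);  // assumed bring-up call

    // Buffers are created through the factory and held via shared_ptr; the
    // separate ttnn::Buffer subclass and GLOBAL_BUFFER_ADDRESS_MAP are gone.
    std::shared_ptr<Buffer> buffer = Buffer::create(
        device,
        /*size=*/4096,
        /*page_size=*/1024,
        BufferType::DRAM,
        TensorMemoryLayout::INTERLEAVED);

    // Work is pushed to the device worker as a plain callable; no
    // std::shared_ptr<std::function<void()>> wrapper is needed anymore.
    device->push_work([buffer]() {
        // e.g. read the allocated address from the worker thread
        (void)buffer->address();
    });

    buffer->deallocate();   // releases the device allocation
    CloseDevice(device);    // assumed teardown call
    return 0;
}

Note also that the relaxed asserts in WriteToDeviceSharded, get_page_size, and EriscDatamoverConfig::compute_buffer_size (page_size == 0 ? size == 0 : size % page_size == 0) are what allow these paths to accept the new zero-volume buffers without a special case.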