Commit af83135
#0: Unify tt_metal::Buffer and ttnn::Buffer, add support for 0-volume buffers (#14023)

* #0: Buffer refactoring wip

* #0: Buffer threading change

* #0: Buffer fixup

* #0: Buffer fixup

* #0: Add experimental asserts

* #0: Refactor

* #0: Test fixup

* #0: Review fixes

* #0: Review fixes

* #0: Review fixes

* #0: Handle 0 page size

* #0: Add a test for creating/reading/writing 0-sized tensor

* #0: Add support for more 0-sized shapes

* #0: Review fixes

* #0: Refactor allocation/deallocation threading to allow getting buffer address from another thread

* #0: Treat empty shape as scalar

* #0: Fix trailing comma

* #0: Add missing include

* #0: Allow 0 page size

* #0: Rewrite threading, no mutexes

* #0: Assert fix

* #0: Fixup

* #0: Remove unneeded include

* #0: Cleanup, fix typo

* #0: Cleanup

* #0: Trying to improve perf

* #0: Trying to improve perf

* #0: Build fix

* #0: Simpler threading, get rid of canceling allocation feature

* #0: Fixup

* #0: Avoid using std::atomic::wait, it seems to have perf implications

---------

Co-authored-by: Artem Yerofieiev <[email protected]>
sminakov-tt and ayerofieiev-tt authored Oct 24, 2024
1 parent 9e8d1b5 commit af83135
Showing 50 changed files with 593 additions and 779 deletions.
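The diffs below migrate call sites from constructing tt_metal::Buffer directly (by value, or through std::make_unique/std::make_shared) to the new Buffer::create factory, which returns a std::shared_ptr<Buffer>; callers then use -> for member access and dereference the pointer where an API still takes a Buffer&. A minimal before/after sketch, not taken verbatim from any one file: the sizes, the src_vec vector, and the device pointer are illustrative, and the tt_metal headers used by the tests below are assumed.

// Before this commit: Buffer is constructed by value and passed by reference.
tt_metal::Buffer l1_buffer(device, /*size=*/2048, /*page_size=*/2048, tt_metal::BufferType::L1);
uint32_t addr = l1_buffer.address();
tt_metal::detail::WriteToBuffer(l1_buffer, src_vec);

// After this commit: Buffer::create returns std::shared_ptr<Buffer>.
auto l1_buffer = tt_metal::Buffer::create(device, /*size=*/2048, /*page_size=*/2048, tt_metal::BufferType::L1);
uint32_t addr = l1_buffer->address();                   // -> instead of .
tt_metal::detail::WriteToBuffer(*l1_buffer, src_vec);   // dereference where a Buffer& is expected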
46 changes: 29 additions & 17 deletions tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -25,6 +25,18 @@ using namespace tt;
using namespace tt_metal;
using namespace constants;

namespace {
uint32_t get_device_buffer_address(const Tensor& tensor) {
TT_FATAL(std::holds_alternative<DeviceStorage>(tensor.get_storage()), "Tensor storage is not DeviceStorage");
auto buffer = std::get<DeviceStorage>(tensor.get_storage()).buffer;
uint32_t result = 0;
buffer->device()->push_work([&]() {
result = buffer->address();
}, true);
return result;
}
}

TEST_F(CommonFixture, TestTensorOwnershipSanity) {
// Sanity test tensor read, write and update paths with synchronous
// Ensure that tensor data is copied and owned as expected
@@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) {
Tensor output_tensor_host = output_tensor_device_2.cpu();
// Test tensor deallocation in async mode: deallocate tensors after using them
if (i == 0) {
input_a_addr = std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address();
input_b_addr = std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address();
input_c_addr = std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address();
output_1_addr = std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address();
output_2_addr = std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address();
input_a_addr = get_device_buffer_address(input_tensor_a);
input_b_addr = get_device_buffer_address(input_tensor_b);
input_c_addr = get_device_buffer_address(input_tensor_c);
output_1_addr = get_device_buffer_address(output_tensor_device);
output_2_addr = get_device_buffer_address(output_tensor_device_2);
} else {
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address(), input_a_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address(), input_b_addr);
EXPECT_EQ(std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address(), input_c_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address(), output_1_addr);
EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr);
EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr);
EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr);
}
input_tensor_a.deallocate();
input_tensor_b.deallocate();
@@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
Tensor tensor2 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address();
uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2);
// Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2
// deallocated
tensor2 = tensor1;
@@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
// prev addr for tensor2
Tensor tensor3 =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr);
EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address());
EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr);
EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2));
}
log_info(LogTest, "Testing Device tensor self-assignment through function");
for (int i = 0; i < 5; i++) {
Tensor device_tensor =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
uint32_t device_tensor_address = device_tensor.device_buffer()->address();
uint32_t device_tensor_address = get_device_buffer_address(device_tensor);
// This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor
// Ensure ref count and address remain unchanged
device_tensor = tensor_identity_copy_function(device_tensor);
EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1);
EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address);
EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address);
}

log_info(LogTest, "Testing Device tensor move assignment");
@@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
log_info(LogTest, "Testing Device tensor self-assignment");
Tensor tensor_to_self_assign =
ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(0), DataType::BFLOAT16).to(device);
uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address();
uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign);
tensor_to_self_assign = tensor_to_self_assign;
EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1);
tensor_to_self_assign = std::move(tensor_to_self_assign);
EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address);
EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address);
auto barrier_tensor = tensor_to_self_assign.cpu();
device->enable_async(false);
}
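The get_device_buffer_address helper added at the top of this test file reflects the threading refactor called out in the commit message ("allow getting buffer address from another thread"): instead of reading buffer->address() on the calling thread, the read is pushed to the buffer's device worker thread, and the second argument (true) blocks until it has run. A hedged sketch of the same pattern for a plain buffer handle, not part of this commit, assuming the push_work(callable, blocking) overload used above:

// Hypothetical generalization of the test helper above: read a buffer's
// address on its device's worker thread and wait for the result.
uint32_t get_buffer_address(const std::shared_ptr<tt_metal::Buffer>& buffer) {
    uint32_t address = 0;
    buffer->device()->push_work([&]() { address = buffer->address(); }, /*blocking=*/true);
    return address;
}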
@@ -126,7 +126,7 @@ int main(int argc, char** argv) {

// limit size of the L1 buffer to do not exceed global L1 size
uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 256 : num_tiles) * page_size;
auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);
auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);

////////////////////////////////////////////////////////////////////////////
// Application Setup
@@ -166,7 +166,7 @@ int main(int argc, char** argv) {
for (int j = 0; j < num_cores_c; j++) {
CoreCoord core = {(std::size_t)j, (std::size_t)i};
uint32_t core_index = i * num_cores_c + j;
uint32_t l1_buffer_addr = l1_buffer.address();
uint32_t l1_buffer_addr = l1_buffer->address();

const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c};
SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args);
@@ -77,14 +77,14 @@ int main(int argc, char** argv) {
log_error(tt::LogTest, "Command line arguments found exception", e.what());
}

TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);
TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);

// Device setup
int device_id = 0;
tt_metal::Device* device = tt_metal::CreateDevice(device_id);

// Application setup
auto buffer = tt_metal::Buffer(
auto buffer = tt_metal::Buffer::create(
device, transfer_size, page_size, buffer_type == 0 ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1);

std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
// Execute application
if (!skip_write) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false);
EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false);
Finish(device->command_queue());
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
@@ -119,7 +119,7 @@ int main(int argc, char** argv) {

if (!skip_read) {
auto t_begin = std::chrono::steady_clock::now();
EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true);
EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true);
auto t_end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));
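The relaxed TT_ASSERT earlier in this file encodes the zero-volume rule this commit introduces: a page size of 0 is only valid for a 0-byte transfer, while any non-zero page size must still evenly divide the transfer size. Restated as a standalone predicate for illustration only (the helper name is hypothetical, not part of the commit):

// Mirrors the assert condition above: zero-volume transfers require page_size == 0,
// otherwise the transfer must be a whole number of pages.
bool transfer_size_is_valid(uint32_t transfer_size, uint32_t page_size) {
    return page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0;
}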
@@ -192,7 +192,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
input_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
tt_metal::Buffer input_buffer(
auto input_buffer = Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
num_tiles_per_core_group_1,
num_tiles_per_core_group_2,
kernel,
input_buffer.address(),
input_buffer->address(),
num_reqs_at_a_time,
single_tile_size,
tile_format);
@@ -221,7 +221,7 @@ int main(int argc, char **argv) {
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
if (access_type == 0) {
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
} else {
for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) {
CoreCoord core = {i / num_cores_y, i % num_cores_y};
@@ -276,7 +276,7 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////
pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
num_cores_y,
@@ -673,18 +673,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address());
auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
@@ -713,7 +713,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_cores_list,
@@ -869,18 +869,18 @@ int main(int argc, char **argv) {
input_size, 100, 1234);
}

tt_metal::Buffer input_buffer(
auto input_buffer = tt_metal::Buffer::create(
device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);

////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address());
auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address());

////////////////////////////////////////////////////////////////////////////
// Copy Input To DRAM or L1
////////////////////////////////////////////////////////////////////////////
tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);

////////////////////////////////////////////////////////////////////////////
// Execution Application
@@ -909,7 +909,7 @@ int main(int argc, char **argv) {

pass = validation(
device,
input_buffer,
*input_buffer,
input_vec,
num_cores,
all_l1_writer_cores_ordered,
@@ -189,7 +189,7 @@ int main(int argc, char **argv) {
activations_addr,
activations_addr / 1024,
Nt);
std::vector<tt_metal::Buffer> l1_buffers;
std::vector<std::shared_ptr<tt_metal::Buffer>> l1_buffers;

int l1_buffers_size = 1;
if (!(single_read || one_buffer_share)) {
@@ -199,8 +199,8 @@ int main(int argc, char **argv) {
l1_buffers.reserve(l1_buffers_size);
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1);
tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);
l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1));
tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);

if (single_read || one_buffer_share)
break;
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
for (int r = 0; r < num_cores_r; ++r) {
for (int c = 0; c < num_cores_c; ++c) {
std::vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec);
tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec);
auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);

if (print_tensor) {
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
CoreCoord core = {(size_t)c, (size_t)r};

int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c);
auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address();
auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address();

uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0);

@@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {

uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device);

std::vector<std::unique_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<Buffer>> buffers;
int alloc_size_idx = 0;
uint32_t total_buffer_size = 0;
while (total_size_bytes < alloc_limit) {
@@ -44,7 +44,7 @@
if (total_buffer_size + buffer_size >= alloc_limit) {
break;
}
std::unique_ptr<tt::tt_metal::Buffer> buffer = std::make_unique<tt::tt_metal::Buffer>(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
buffers.emplace_back(std::move(buffer));
total_buffer_size += buffer_size;
EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size);
@@ -430,17 +430,17 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi
auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);

auto activation_buffer = std::make_shared<Buffer>(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
auto activation_buffer = Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
pass &= move_tiles_to_dram(device, activations, M, K, activation_buffer);
auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);

auto weight_buffer = std::make_shared<Buffer>(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
auto weight_buffer = Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM);
pass &= move_tiles_to_dram(device, weights, K, N, weight_buffer);
log_debug(LogTest, "Copying inputs to dram complete");

auto out_buffer = std::make_shared<Buffer>(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM);
auto out_buffer = Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM);
uint32_t out_dram_addr = out_buffer->address();

log_debug(LogTest, "Writing kernel runtime args to device");
(Remaining file diffs not loaded in this view.)