Skip to content

Commit

Permalink
#14366: Allow creating pre-allocated buffer with an address. Minor pe…
Browse files Browse the repository at this point in the history
…rf improvements (#14394)

* #14366: Allow creating pre-allocated buffer with an address. Minor performance improvements

* #0: Cleanup

* #0: Updated methods documentation

* #0: Handle allocation failure

* #0: Scheduling fix
  • Loading branch information
sminakov-tt authored Oct 29, 2024
1 parent 1186a84 commit d4b7fc2
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 9 deletions.
28 changes: 26 additions & 2 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,45 @@ uint32_t CreateSemaphore(
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | InterleavedBufferConfig | | Yes |
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config);

/**
* Creates a pre-allocated interleaved DRAM or L1 buffer on device
*
* The buffer is placed at the caller-supplied device address instead of being allocated.
* It does not own the underlying memory, so deallocating the returned buffer does not
* free that memory.
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address);

/**
* Allocates a sharded DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | ShardedBufferConfig | | Yes |
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config);

/**
* Creates a pre-allocated sharded DRAM or L1 buffer on device
*
* The buffer is placed at the caller-supplied device address instead of being allocated.
* It does not own the underlying memory, so deallocating the returned buffer does not
* free that memory.
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address);

/**
* Deallocates buffer from device by marking its memory as free.
*
Expand Down
49 changes: 44 additions & 5 deletions tt_metal/impl/buffers/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,17 @@ Buffer::Buffer(
const BufferType buffer_type,
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) :
const std::optional<bool> bottom_up,
const bool owns_data,
Private) :
device_(device),
size_(size),
page_size_(page_size),
buffer_type_(buffer_type),
buffer_layout_(buffer_layout),
shard_parameters_(shard_parameters),
bottom_up_(bottom_up.value_or(this->is_dram())),
owns_data_(owns_data),
buffer_page_mapping_(nullptr) {
TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");

Expand All @@ -227,7 +230,8 @@ std::shared_ptr<Buffer> Buffer::create(
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) {
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up);
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, true /* owns data */, Private());
// Using a custom deleter to properly clean up the owned data
auto buffer = std::shared_ptr<Buffer>(bufferPtr, deleter);
buffer->weak_self = buffer;

Expand All @@ -237,19 +241,50 @@ std::shared_ptr<Buffer> Buffer::create(
}

buffer->device_->push_work([buffer] {
buffer->address_ = detail::AllocateBuffer(buffer.get());
try {
buffer->address_ = detail::AllocateBuffer(buffer.get());
} catch(...) {
std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATION_FAILED, std::memory_order::relaxed);
lock.unlock();
buffer->allocation_cv_.notify_all();

throw;
}

std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::release);
lock.unlock();
buffer->allocation_cv_.notify_all();
});

return buffer;
}

std::shared_ptr<Buffer> Buffer::create(
    Device *device,
    DeviceAddr address,
    DeviceAddr size,
    DeviceAddr page_size,
    const BufferType buffer_type,
    const TensorMemoryLayout buffer_layout,
    const std::optional<ShardSpecBuffer>& shard_parameters,
    const std::optional<bool> bottom_up) {
    // This overload wraps memory at a caller-supplied address: the buffer is built with
    // owns_data = false and a plain make_shared, since there is no owned data that a
    // custom deleter would have to clean up.
    constexpr bool owns_data = false;
    auto preallocated = std::make_shared<Buffer>(
        device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, owns_data, Private());

    // No allocation work is pushed to the device here; the address is recorded and the
    // buffer is marked as allocated immediately.
    preallocated->address_ = address;
    preallocated->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);

    // Keep a weak self-reference, as the allocating create() overload does.
    preallocated->weak_self = preallocated;

    return preallocated;
}

void Buffer::deallocate() {
deallocation_requested_.store(true, std::memory_order::relaxed);
if (!owns_data_) {
return;
}
device_->push_work([self = weak_self.lock()] {
self->deallocate_impl();
});
Expand All @@ -263,7 +298,7 @@ void Buffer::deleter(Buffer* buffer) {
}

void Buffer::deallocate_impl() {
if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) {
if (allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATED) {
return;
}

Expand All @@ -289,6 +324,10 @@ bool Buffer::is_allocated() const {
}

uint32_t Buffer::address() const {
if (allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED) {
return address_;
}

if (device_->can_use_passthrough_scheduling()) {
return address_;
}
Expand Down
20 changes: 18 additions & 2 deletions tt_metal/impl/buffers/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ struct BufferPageMapping {
inline namespace v0 {

class Buffer final {
struct Private { explicit Private() = default; };

public:
static std::shared_ptr<Buffer> create(
Device *device,
Expand All @@ -153,6 +155,15 @@ class Buffer final {
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);
static std::shared_ptr<Buffer> create(
Device *device,
DeviceAddr address,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);

Buffer(const Buffer &other) = delete;
Buffer &operator=(const Buffer &other) = delete;
Expand Down Expand Up @@ -210,18 +221,22 @@ class Buffer final {

const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();

private:

Buffer(
Device *device,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameter,
std::optional<bool> bottom_up);
std::optional<bool> bottom_up,
bool owns_data,
Private);

private:
enum class AllocationStatus : uint8_t {
ALLOCATION_REQUESTED,
ALLOCATION_FAILED,
ALLOCATED,
DEALLOCATED,
};
Expand All @@ -239,6 +254,7 @@ class Buffer final {
const BufferType buffer_type_;
const TensorMemoryLayout buffer_layout_;
const bool bottom_up_;
const bool owns_data_;

std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
DeviceAddr address_ = 0;
Expand Down
17 changes: 17 additions & 0 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,11 @@ std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config) {
config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt);
}

std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address) {
    // Forwards to the address-taking Buffer::create overload. An interleaved config
    // supplies no shard parameters, and no explicit bottom_up value is passed.
    auto buffer = Buffer::create(
        config.device,
        address,
        config.size,
        config.page_size,
        config.buffer_type,
        config.buffer_layout,
        std::nullopt,   // shard parameters
        std::nullopt);  // bottom_up
    return buffer;
}

std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
return Buffer::create(
config.device,
Expand All @@ -1101,6 +1106,18 @@ std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
std::nullopt);
}

std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address) {
    // Forwards to the address-taking Buffer::create overload, passing the shard
    // parameters from the config; no explicit bottom_up value is passed.
    auto buffer = Buffer::create(
        config.device, address, config.size, config.page_size,
        config.buffer_type, config.buffer_layout,
        config.shard_parameters,
        std::nullopt);  // bottom_up
    return buffer;
}

// Requests deallocation of the given buffer by delegating to Buffer::deallocate().
void DeallocateBuffer(Buffer &buffer) {
    buffer.deallocate();
}

void AssignGlobalBufferToProgram(
Expand Down

0 comments on commit d4b7fc2

Please sign in to comment.