[ET-VK] Using a single GPU buffer for all tensor uniforms. #7015

Open · wants to merge 6 commits into base: gh/trivedivivek/14/base
backends/vulkan/runtime/api/containers/ParamsBuffer.h (9 additions, 5 deletions)
@@ -33,6 +33,13 @@ class ParamsBuffer final {
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
 
+  ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
+      : context_p_(context_p),
+        nbytes_(nbytes),
+        vulkan_buffer_(
+            context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}
+
   ParamsBuffer(const ParamsBuffer&);
   ParamsBuffer& operator=(const ParamsBuffer&);

@@ -50,14 +57,11 @@
   }
 
   template <typename Block>
-  void update(const Block& block) {
-    if (sizeof(block) != nbytes_) {
-      VK_THROW("Attempted to update ParamsBuffer with data of different size");
-    }
+  void update(const Block& block, const uint32_t offset = 0) {
     // Fill the uniform buffer with data in block
     {
       vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite);
-      Block* data_ptr = mapping.template data<Block>();
+      Block* data_ptr = mapping.template data<Block>(offset);
 
       *data_ptr = block;
     }
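Note that the sizeof check is gone from update(): with several blocks suballocated from one buffer, the caller is now responsible for writing inside the region it reserved. A minimal usage sketch of the new size-based constructor and offset parameter (the Context* value, buffer size, and data below are illustrative, not taken from this PR):

    // Illustrative only: pack two unrelated uniform blocks into one buffer.
    ParamsBuffer params(context, VkDeviceSize(64)); // 64-byte uniform buffer
    const int32_t numel = 256;
    const float scale = 0.5f;
    params.update(numel, /*offset=*/0);             // occupies bytes [0, 4)
    params.update(scale, /*offset=*/sizeof(numel)); // occupies bytes [4, 8)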
backends/vulkan/runtime/api/containers/Tensor.cpp (84 additions, 38 deletions)
@@ -451,11 +451,13 @@ vTensor::vTensor(
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_{{0, 0, 0}},
+      uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Construct Tensor storage
       storage_(
           context,
@@ -497,11 +499,13 @@ vTensor::vTensor(
       unsqueezed_strides_(),
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(),
+      uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Construct Tensor storage
       storage_(context, image) {
   set_logical_limits(storage_.image_extents_);
@@ -522,11 +526,13 @@ vTensor::vTensor(vTensor& other)
           other.unsqueezed_strides_.end()},
       padded_numel_(other.padded_numel_),
       logical_limits_{other.logical_limits_},
+      uniforms_(),
       // Empty initialize Utility Uniform Buffers
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
       storage_(other.storage_) {}

@@ -547,11 +553,13 @@ vTensor::vTensor(
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(other.logical_limits_),
+      uniforms_(),
       // Empty initialize Utility Uniform Buffers
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
   VK_CHECK_COND(
@@ -612,33 +620,69 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!sizes_uniform_.buffer()) {
-    sizes_uniform_ =
-        ParamsBuffer(storage_.context_, utils::make_whcn_ivec4(sizes_));
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
-  return vkapi::BufferBindInfo(sizes_uniform_.buffer());
+  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
+    const auto allocation_size = sizes_.size() * sizeof(sizes_[0]);
+    VK_CHECK_COND(
+        (uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    sizes_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += allocation_size;
+    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
+  }
+  return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!strides_uniform_.buffer()) {
-    strides_uniform_ = ParamsBuffer(
-        storage_.context_, utils::make_whcn_ivec4(unsqueezed_strides_));
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
+  if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
+    const auto allocation_size =
+        unsqueezed_strides_.size() * sizeof(unsqueezed_strides_[0]);
+    VK_CHECK_COND(
+        (uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    unsqueezed_strides_offset_ = uniforms_size_;
+    uniforms_size_ += allocation_size;
+    uniforms_.update(
+        utils::make_whcn_ivec4(unsqueezed_strides_),
+        unsqueezed_strides_offset_);
+  }
-  return vkapi::BufferBindInfo(strides_uniform_.buffer());
+  return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!logical_limits_uniform_.buffer()) {
-    logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
-  return vkapi::BufferBindInfo(logical_limits_uniform_.buffer());
+  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
+    VK_CHECK_COND(
+        (uniforms_size_ + sizeof(logical_limits_)) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    logical_limits_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += sizeof(logical_limits_);
+    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
+  }
+  return vkapi::BufferBindInfo(
+      uniforms_.buffer(), logical_limits_uniform_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!numel_uniform_.buffer()) {
-    numel_uniform_ = ParamsBuffer(storage_.context_, numel_);
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
+  if (numel_uniform_offset_ == kUniformOffsetUnset) {
+    VK_CHECK_COND(
+        (uniforms_size_ + sizeof(numel_)) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    numel_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += sizeof(numel_);
+    uniforms_.update(numel_, numel_uniform_offset_);
+  }
-  return vkapi::BufferBindInfo(numel_uniform_.buffer());
+  return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
 }
 
 size_t vTensor::staging_buffer_numel() const {
@@ -690,17 +734,19 @@ void vTensor::update_metadata() {
   set_logical_limits(
       calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
 
-  if (sizes_uniform_.buffer()) {
-    sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
+  if (sizes_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
   }
-  if (strides_uniform_.buffer()) {
-    strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_));
+  if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(
+        utils::make_whcn_ivec4(unsqueezed_strides_),
+        unsqueezed_strides_offset_);
   }
-  if (numel_uniform_.buffer()) {
-    numel_uniform_.update(numel_);
+  if (numel_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(numel_, numel_uniform_offset_);
   }
-  if (logical_limits_uniform_.buffer()) {
-    logical_limits_uniform_.update(logical_limits_);
+  if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
   }
 }

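The four accessors above repeat one lazy bump-allocation pattern: create the shared buffer on the first UBO request, claim the next free byte range once per metadata field, write the initial value, and return a BufferBindInfo at that offset. (One wrinkle: sizes_ubo() and strides_ubo() reserve sizes_.size() * sizeof(sizes_[0]) bytes, the footprint of the source vector, rather than the 16-byte ivec4 they actually write.) Condensed into a single hypothetical helper, not part of this diff, the pattern reads:

    // Hypothetical helper (not in this PR): the shared pattern behind
    // sizes_ubo(), strides_ubo(), logical_limits_ubo(), and numel_ubo().
    template <typename BlockT>
    const vkapi::BufferBindInfo vTensor::metadata_ubo(
        uint32_t& offset_member, // one of the *_offset_ fields
        const BlockT& value) {
      if (!uniforms_.buffer()) {
        // Create the shared uniform buffer lazily, on the first UBO request.
        uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
      }
      if (offset_member == kUniformOffsetUnset) {
        VK_CHECK_COND(
            (uniforms_size_ + sizeof(value)) <= kMaxUniformBufferSize,
            "Uniform data allocation has exceeded Tensor uniform buffer size");
        offset_member = uniforms_size_;   // claim the next free byte range
        uniforms_size_ += sizeof(value);  // bump the high-water mark
        uniforms_.update(value, offset_member); // write the initial contents
      }
      return vkapi::BufferBindInfo(uniforms_.buffer(), offset_member);
    }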
backends/vulkan/runtime/api/containers/Tensor.h (16 additions, 8 deletions)
@@ -297,18 +297,26 @@ class vTensor final {
   TextureLimits logical_limits_;
 
   /*
-   * Utility GPU buffers that can be passed to shaders in order to convey tensor
-   * metadata. These buffers will be initialized the first time they are
-   * accessed via the corresponding *_ubo() function, and their contents will be
-   * updated whenever virtual_resize() is called.
+   * Utility GPU buffer that can be passed to shaders in order to convey tensor
+   * metadata. The uniform buffer is initialized only the first time a UBO is
+   * requested. Each field's offset is initialized the first time it is
+   * accessed via the corresponding *_ubo() function. The buffer's contents
+   * are updated whenever virtual_resize() is called.
    *
    * Refer to the comments for the corresponding *_ubo() functions for more
    * context about the data contained in each buffer.
    */
-  ParamsBuffer sizes_uniform_;
-  ParamsBuffer strides_uniform_;
-  ParamsBuffer numel_uniform_;
-  ParamsBuffer logical_limits_uniform_;
+  ParamsBuffer uniforms_;
+  uint32_t uniforms_size_;
+  uint32_t sizes_uniform_offset_;
+  uint32_t unsqueezed_strides_offset_;
+  uint32_t numel_uniform_offset_;
+  uint32_t logical_limits_uniform_offset_;
+
+  // Total size of the tensor's uniform buffer, in bytes
+  constexpr static uint32_t kMaxUniformBufferSize = 4 * 20;
+  // Sentinel marking an offset that has not been allocated yet
+  constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;
 
   vTensorStorage storage_;

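Because offsets are assigned in first-request order, the layout inside uniforms_ is not fixed; it depends on which *_ubo() accessor is called first. A sketch (vkapi names as in this PR; the tensor t is assumed):

    // Offsets are handed out in first-call order.
    vkapi::BufferBindInfo numel_info = t.numel_ubo(); // offset 0
    vkapi::BufferBindInfo sizes_info = t.sizes_ubo(); // offset sizeof(numel_)

Two tensors that request their UBOs in different orders therefore end up with different internal layouts, which is harmless because every consumer goes through the recorded *_offset_ members. The sentinel kUniformOffsetUnset can be the buffer size itself because no valid suballocation can start there.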
backends/vulkan/runtime/vk_api/Descriptor.cpp (4 additions, 2 deletions)
@@ -23,9 +23,11 @@ namespace vkapi {
 BufferBindInfo::BufferBindInfo()
     : handle(VK_NULL_HANDLE), offset(0u), range(0u) {}
 
-BufferBindInfo::BufferBindInfo(const VulkanBuffer& buffer_p)
+BufferBindInfo::BufferBindInfo(
+    const VulkanBuffer& buffer_p,
+    const uint32_t offset_p)
     : handle(buffer_p.handle()),
-      offset(buffer_p.mem_offset()),
+      offset(buffer_p.mem_offset() + offset_p),
       range(buffer_p.mem_range()) {}
 
 //
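With the added parameter, several bind infos can point into disjoint regions of the same VkBuffer; only the starting offset shifts, while range is still taken from buffer_p.mem_range() in both cases. A sketch (variable names and offsets are illustrative):

    // Two descriptors sharing one buffer, 16 bytes apart.
    vkapi::BufferBindInfo sizes_bind(uniforms.buffer(), 0u);
    vkapi::BufferBindInfo limits_bind(uniforms.buffer(), 16u);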
backends/vulkan/runtime/vk_api/Descriptor.h (1 addition, 1 deletion)
@@ -33,7 +33,7 @@ struct BufferBindInfo final {
   VkDeviceSize range;
 
   BufferBindInfo();
-  BufferBindInfo(const VulkanBuffer& buffer_p);
+  BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u);
 };
 
 struct ParamsBindList final {
backends/vulkan/runtime/vk_api/memory/Buffer.h (2 additions, 2 deletions)
@@ -195,8 +195,8 @@ class MemoryMap final {
 
  public:
   template <typename T>
-  T* data() {
-    return reinterpret_cast<T*>(data_);
+  T* data(const uint32_t offset = 0) {
+    return reinterpret_cast<T*>(static_cast<uint8_t*>(data_) + offset);
   }
 
   inline size_t nbytes() {
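This is the mapping-level change the new ParamsBuffer::update() relies on: data<T>() now takes a byte offset into the mapped region, so a write can target a suballocation directly. A usage sketch (the buffer object is assumed; the kWrite flag is used as elsewhere in this PR):

    // Sketch: write an int32_t sixteen bytes into a mapped buffer.
    vkapi::MemoryMap mapping(vulkan_buffer, vkapi::kWrite);
    int32_t* numel_ptr = mapping.data<int32_t>(16);
    *numel_ptr = 1024;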