From 700a4733ec97c97127b4e1e5817a7715cb31ab2a Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Thu, 21 Nov 2024 11:28:31 -0800
Subject: [PATCH] [ET-VK] Using a single GPU buffer for all tensor uniforms.

This diff changes the Tensor class to store all uniforms in a single uniform
buffer. The entities stored in uniforms, i.e. sizes, strides, numel, and
logical limits, now live in one buffer, and their offsets are stored as
unsigned ints in the Tensor class.

Other changes include:

Adding a new ctor to the ParamsBuffer class to allow allocation with a size
but without a data ptr.

Adding an offset parameter to the MemoryMap::data function.

Adding an offset parameter to the BufferBindInfo ctor, so an additional
offset can be supplied when binding a buffer.

Differential Revision: [D65841750](https://our.internmc.facebook.com/intern/diff/D65841750/)

[ghstack-poisoned]
---
 .../runtime/api/containers/ParamsBuffer.h     |  14 +-
 .../vulkan/runtime/api/containers/Tensor.cpp  | 122 ++++++++++++------
 .../vulkan/runtime/api/containers/Tensor.h    |  24 ++--
 backends/vulkan/runtime/vk_api/Descriptor.cpp |   6 +-
 backends/vulkan/runtime/vk_api/Descriptor.h   |   2 +-
 .../vulkan/runtime/vk_api/memory/Buffer.h     |   4 +-
 6 files changed, 116 insertions(+), 56 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
index fed7c8fa72..c3f19028ed 100644
--- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h
+++ b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
@@ -33,6 +33,13 @@ class ParamsBuffer final {
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
 
+  template <typename Block>
+  ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
+      : context_p_(context_p),
+        nbytes_(nbytes),
+        vulkan_buffer_(
+            context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}
+
   ParamsBuffer(const ParamsBuffer&);
   ParamsBuffer& operator=(const ParamsBuffer&);
 
@@ -50,14 +57,11 @@ class ParamsBuffer final {
   }
 
   template <typename Block>
-  void update(const Block& block) {
-    if (sizeof(block) != nbytes_) {
-      VK_THROW("Attempted to update ParamsBuffer with data of different size");
-    }
+  void update(const Block& block, const uint32_t offset = 0) {
     // Fill the uniform buffer with data in block
    {
       vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite);
-      Block* data_ptr = mapping.template data<Block>();
+      Block* data_ptr = mapping.template data<Block>(offset);
       *data_ptr = block;
     }
 
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index e14de97d05..56d26ecbff 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -451,11 +451,13 @@ vTensor::vTensor(
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_{{0, 0, 0}},
+      uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Construct Tensor storage
       storage_(
           context,
@@ -497,11 +499,13 @@ vTensor::vTensor(
       unsqueezed_strides_(),
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(),
+      uniforms_(),
       // Utility Uniform Buffers that can be passed to shaders as arguments
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Construct Tensor storage
       storage_(context, image) {
   set_logical_limits(storage_.image_extents_);
@@ -522,11 +526,13 @@ vTensor::vTensor(vTensor& other)
           other.unsqueezed_strides_.end()},
       padded_numel_(other.padded_numel_),
       logical_limits_{other.logical_limits_},
+      uniforms_(),
       // Empty initialize Utility Uniform Buffers
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
       storage_(other.storage_) {}
 
@@ -547,11 +553,13 @@ vTensor::vTensor(
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(other.logical_limits_),
+      uniforms_(),
       // Empty initialize Utility Uniform Buffers
-      sizes_uniform_(),
-      strides_uniform_(),
-      numel_uniform_(),
-      logical_limits_uniform_(),
+      uniforms_size_(0),
+      sizes_uniform_offset_(kUniformOffsetUnset),
+      unsqueezed_strides_offset_(kUniformOffsetUnset),
+      numel_uniform_offset_(kUniformOffsetUnset),
+      logical_limits_uniform_offset_(kUniformOffsetUnset),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
   VK_CHECK_COND(
@@ -612,33 +620,69 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!sizes_uniform_.buffer()) {
-    sizes_uniform_ =
-        ParamsBuffer(storage_.context_, utils::make_whcn_ivec4(sizes_));
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
-  return vkapi::BufferBindInfo(sizes_uniform_.buffer());
+  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
+    const auto allocation_size = sizes_.size() * sizeof(sizes_[0]);
+    VK_CHECK_COND(
+        (uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    sizes_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += allocation_size;
+    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
+  }
+  return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!strides_uniform_.buffer()) {
-    strides_uniform_ = ParamsBuffer(
-        storage_.context_, utils::make_whcn_ivec4(unsqueezed_strides_));
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+  }
+  if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
+    const auto allocation_size =
+        unsqueezed_strides_.size() * sizeof(unsqueezed_strides_[0]);
+    VK_CHECK_COND(
+        (uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    unsqueezed_strides_offset_ = uniforms_size_;
+    uniforms_size_ += allocation_size;
+    uniforms_.update(
+        utils::make_whcn_ivec4(unsqueezed_strides_),
+        unsqueezed_strides_offset_);
   }
-  return vkapi::BufferBindInfo(strides_uniform_.buffer());
+  return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!logical_limits_uniform_.buffer()) {
-    logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
   }
-  return vkapi::BufferBindInfo(logical_limits_uniform_.buffer());
+  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
+    VK_CHECK_COND(
+        (uniforms_size_ + sizeof(logical_limits_)) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    logical_limits_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += sizeof(logical_limits_);
+    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
+  }
+  return vkapi::BufferBindInfo(
+      uniforms_.buffer(), logical_limits_uniform_offset_);
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!numel_uniform_.buffer()) {
-    numel_uniform_ = ParamsBuffer(storage_.context_, numel_);
+  if (!uniforms_.buffer()) {
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+  }
+  if (numel_uniform_offset_ == kUniformOffsetUnset) {
+    VK_CHECK_COND(
+        (uniforms_size_ + sizeof(numel_)) <= kMaxUniformBufferSize,
+        "Uniform data allocation has exceeded Tensor uniform buffer size");
+    numel_uniform_offset_ = uniforms_size_;
+    uniforms_size_ += sizeof(numel_);
+    uniforms_.update(numel_, numel_uniform_offset_);
   }
-  return vkapi::BufferBindInfo(numel_uniform_.buffer());
+  return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
 }
 
 size_t vTensor::staging_buffer_numel() const {
@@ -690,17 +734,19 @@ void vTensor::update_metadata() {
   set_logical_limits(
       calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
 
-  if (sizes_uniform_.buffer()) {
-    sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
+  if (sizes_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
   }
-  if (strides_uniform_.buffer()) {
-    strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_));
+  if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(
+        utils::make_whcn_ivec4(unsqueezed_strides_),
+        unsqueezed_strides_offset_);
   }
-  if (numel_uniform_.buffer()) {
-    numel_uniform_.update(numel_);
+  if (numel_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(numel_, numel_uniform_offset_);
   }
-  if (logical_limits_uniform_.buffer()) {
-    logical_limits_uniform_.update(logical_limits_);
+  if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
+    uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
   }
 }
 
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index 35b74915d2..116dada88d 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -297,18 +297,26 @@ class vTensor final {
   TextureLimits logical_limits_;
 
   /*
-   * Utility GPU buffers that can be passed to shaders in order to convey tensor
-   * metadata. These buffers will be initialized the first time they are
-   * accessed via the corresponding *_ubo() function, and their contents will be
-   * updated whenever virtual_resize() is called.
+   * Utility GPU buffer that can be passed to shaders in order to convey tensor
+   * metadata. The uniform buffer will be initialized only the first time a ubo
+   * is requested. Buffer offsets will be initialized the first time they are
+   * accessed via the corresponding *_ubo() function. The uniform buffer's
+   * contents will be updated whenever virtual_resize() is called.
    *
    * Refer to the comments for the corresponding *_ubo() functions for more
    * context about the data contained in each buffer.
    */
-  ParamsBuffer sizes_uniform_;
-  ParamsBuffer strides_uniform_;
-  ParamsBuffer numel_uniform_;
-  ParamsBuffer logical_limits_uniform_;
+  ParamsBuffer uniforms_;
+  uint32_t uniforms_size_;
+  uint32_t sizes_uniform_offset_;
+  uint32_t unsqueezed_strides_offset_;
+  uint32_t numel_uniform_offset_;
+  uint32_t logical_limits_uniform_offset_;
+
+  // Total size of the tensor's uniform buffer
+  constexpr static uint32_t kMaxUniformBufferSize = 4 * 20;
+  // Initial value of uniform buffer offsets
+  constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;
 
   vTensorStorage storage_;
 
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp
index 8fea28a798..956711bccc 100644
--- a/backends/vulkan/runtime/vk_api/Descriptor.cpp
+++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp
@@ -23,9 +23,11 @@ namespace vkapi {
 BufferBindInfo::BufferBindInfo()
     : handle(VK_NULL_HANDLE), offset(0u), range(0u) {}
 
-BufferBindInfo::BufferBindInfo(const VulkanBuffer& buffer_p)
+BufferBindInfo::BufferBindInfo(
+    const VulkanBuffer& buffer_p,
+    const uint32_t offset_p)
     : handle(buffer_p.handle()),
-      offset(buffer_p.mem_offset()),
+      offset(buffer_p.mem_offset() + offset_p),
       range(buffer_p.mem_range()) {}
 
 //
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h
index 418d79a6b3..38401f2243 100644
--- a/backends/vulkan/runtime/vk_api/Descriptor.h
+++ b/backends/vulkan/runtime/vk_api/Descriptor.h
@@ -33,7 +33,7 @@ struct BufferBindInfo final {
   VkDeviceSize range;
 
   BufferBindInfo();
-  BufferBindInfo(const VulkanBuffer& buffer_p);
+  BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u);
 };
 
 struct ParamsBindList final {
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
index 14722511f4..347c5dd917 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.h
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -195,8 +195,8 @@ class MemoryMap final {
 
  public:
   template <typename T>
-  T* data() {
-    return reinterpret_cast<T*>(data_);
+  T* data(const uint32_t offset = 0) {
+    return reinterpret_cast<T*>(static_cast<uint8_t*>(data_) + offset);
   }
 
   inline size_t nbytes() {
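
For context, here is a standalone sketch of the sub-allocation scheme the *_ubo() functions above implement. It is illustrative only: FakeTensorUniforms, its ubo() helper, and the plain byte array standing in for the GPU-side ParamsBuffer are hypothetical stand-ins rather than the ET-VK API; only the two constants mirror the diff (kMaxUniformBufferSize = 4 * 20 bytes, with kUniformOffsetUnset doubling as the "unset" sentinel).

// Standalone sketch of the lazy sub-allocation pattern. A byte array stands
// in for the GPU uniform buffer; all names here are illustrative.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>

constexpr uint32_t kMaxUniformBufferSize = 4 * 20;  // mirrors the diff
constexpr uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;

struct FakeTensorUniforms {
  std::array<uint8_t, kMaxUniformBufferSize> buffer{};  // stand-in for ParamsBuffer
  uint32_t size_used = 0;                               // mirrors uniforms_size_
  uint32_t sizes_offset = kUniformOffsetUnset;          // mirrors sizes_uniform_offset_
  uint32_t numel_offset = kUniformOffsetUnset;          // mirrors numel_uniform_offset_

  // First call sub-allocates a region and writes the payload; later calls
  // just return the previously assigned offset, like the *_ubo() functions.
  template <typename T>
  uint32_t ubo(uint32_t& offset_slot, const T& payload) {
    if (offset_slot == kUniformOffsetUnset) {
      assert(size_used + sizeof(T) <= kMaxUniformBufferSize &&
             "Uniform data allocation has exceeded the uniform buffer size");
      offset_slot = size_used;
      size_used += sizeof(T);
      std::memcpy(buffer.data() + offset_slot, &payload, sizeof(T));
    }
    return offset_slot;  // bind the shared buffer at this offset
  }
};

int main() {
  FakeTensorUniforms u;
  int32_t sizes_whcn[4] = {8, 4, 2, 1};
  int32_t numel = 64;

  uint32_t sizes_off = u.ubo(u.sizes_offset, sizes_whcn);  // first request: offset 0
  uint32_t numel_off = u.ubo(u.numel_offset, numel);       // packed next: offset 16
  std::cout << "sizes at " << sizes_off << ", numel at " << numel_off
            << ", used " << u.size_used << "/" << kMaxUniformBufferSize << "\n";
  // Repeated requests reuse the existing sub-allocation.
  assert(u.ubo(u.sizes_offset, sizes_whcn) == sizes_off);
  return 0;
}

The design trades one VkBuffer and one descriptor-info per metadata field for a single buffer bound at different offsets, which is why BufferBindInfo now adds offset_p on top of buffer_p.mem_offset() and update_metadata() writes each field back through its recorded offset.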