Skip to content

Commit

Permalink
[ET-VK] Using a single GPU buffer for all tensor uniforms.
Browse files Browse the repository at this point in the history
Pull Request resolved: #7015

This diff changes the Tensor class to store all of its uniforms in a single uniform buffer.

Entities stored in uniforms — i.e. sizes, strides, numel, and logical limits — are now stored in a single buffer, and their offsets are tracked as unsigned ints in the Tensor class.

Other changes include:
Adding a new ctor to the ParamsBuffer class that allocates a buffer by size, without requiring a data ptr.

Adding an offset input to Buffer::data function.

Adding an offset parameter to BufferBindInfo ctor, so additional offset can be supplied when binding a buffer.
ghstack-source-id: 255516791
@exported-using-ghexport

Differential Revision: [D65841750](https://our.internmc.facebook.com/intern/diff/D65841750/)
  • Loading branch information
trivedivivek committed Nov 26, 2024
1 parent 1743889 commit 1e976f5
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 56 deletions.
14 changes: 9 additions & 5 deletions backends/vulkan/runtime/api/containers/ParamsBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ class ParamsBuffer final {
vulkan_buffer_(
context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}

// Constructs a ParamsBuffer of `nbytes` uninitialized bytes; contents are
// filled in later via update() at a caller-chosen byte offset.
// NOTE(review): the previous `template <typename Block>` header made this
// ctor effectively uncallable — `Block` appeared in no parameter, so template
// argument deduction always failed and the overload was dropped from every
// candidate set. A call such as ParamsBuffer(ctx, some_uint32_size) would
// then bind the templated (Context*, const Block&) ctor instead, creating a
// small params buffer *containing the size value* rather than a buffer *of
// that size*. Removing the bogus template header lets this overload
// participate in overload resolution normally.
// NOTE(review): callers passing a 32-bit size should cast it to VkDeviceSize
// so this ctor is an exact match and is preferred over the templated
// (Context*, const Block&) ctor — verify call sites (e.g. the
// ParamsBuffer(storage_.context_, kMaxUniformBufferSize) calls in Tensor.cpp).
ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
    : context_p_(context_p),
      nbytes_(nbytes),
      vulkan_buffer_(
          context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}

ParamsBuffer(const ParamsBuffer&);
ParamsBuffer& operator=(const ParamsBuffer&);

Expand All @@ -50,14 +57,11 @@ class ParamsBuffer final {
}

template <typename Block>
void update(const Block& block) {
if (sizeof(block) != nbytes_) {
VK_THROW("Attempted to update ParamsBuffer with data of different size");
}
void update(const Block& block, const uint32_t offset = 0) {
// Fill the uniform buffer with data in block
{
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite);
Block* data_ptr = mapping.template data<Block>();
Block* data_ptr = mapping.template data<Block>(offset);

*data_ptr = block;
}
Expand Down
122 changes: 84 additions & 38 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,11 +451,13 @@ vTensor::vTensor(
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
logical_limits_{{0, 0, 0}},
uniforms_(),
// Utility Uniform Buffers that can be passed to shaders as arguments
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
logical_limits_uniform_(),
uniforms_size_(0),
sizes_uniform_offset_(kUniformOffsetUnset),
unsqueezed_strides_offset_(kUniformOffsetUnset),
numel_uniform_offset_(kUniformOffsetUnset),
logical_limits_uniform_offset_(kUniformOffsetUnset),
// Construct Tensor storage
storage_(
context,
Expand Down Expand Up @@ -497,11 +499,13 @@ vTensor::vTensor(
unsqueezed_strides_(),
padded_numel_(utils::multiply_integers(padded_sizes_)),
logical_limits_(),
uniforms_(),
// Utility Uniform Buffers that can be passed to shaders as arguments
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
logical_limits_uniform_(),
uniforms_size_(0),
sizes_uniform_offset_(kUniformOffsetUnset),
unsqueezed_strides_offset_(kUniformOffsetUnset),
numel_uniform_offset_(kUniformOffsetUnset),
logical_limits_uniform_offset_(kUniformOffsetUnset),
// Construct Tensor storage
storage_(context, image) {
set_logical_limits(storage_.image_extents_);
Expand All @@ -522,11 +526,13 @@ vTensor::vTensor(vTensor& other)
other.unsqueezed_strides_.end()},
padded_numel_(other.padded_numel_),
logical_limits_{other.logical_limits_},
uniforms_(),
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
logical_limits_uniform_(),
uniforms_size_(0),
sizes_uniform_offset_(kUniformOffsetUnset),
unsqueezed_strides_offset_(kUniformOffsetUnset),
numel_uniform_offset_(kUniformOffsetUnset),
logical_limits_uniform_offset_(kUniformOffsetUnset),
// Copy Tensor storage
storage_(other.storage_) {}

Expand All @@ -547,11 +553,13 @@ vTensor::vTensor(
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
logical_limits_(other.logical_limits_),
uniforms_(),
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
logical_limits_uniform_(),
uniforms_size_(0),
sizes_uniform_offset_(kUniformOffsetUnset),
unsqueezed_strides_offset_(kUniformOffsetUnset),
numel_uniform_offset_(kUniformOffsetUnset),
logical_limits_uniform_offset_(kUniformOffsetUnset),
// Copy Tensor storage
storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
VK_CHECK_COND(
Expand Down Expand Up @@ -612,33 +620,69 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
}

const vkapi::BufferBindInfo vTensor::sizes_ubo() {
if (!sizes_uniform_.buffer()) {
sizes_uniform_ =
ParamsBuffer(storage_.context_, utils::make_whcn_ivec4(sizes_));
if (!uniforms_.buffer()) {
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
}
return vkapi::BufferBindInfo(sizes_uniform_.buffer());
if (sizes_uniform_offset_ == kUniformOffsetUnset) {
const auto allocation_size = sizes_.size() * sizeof(sizes_[0]);
VK_CHECK_COND(
(uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
"Uniform data allocation has exceeded Tensor uniform buffer size");
sizes_uniform_offset_ = uniforms_size_;
uniforms_size_ += allocation_size;
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
}
return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_);
}

const vkapi::BufferBindInfo vTensor::strides_ubo() {
if (!strides_uniform_.buffer()) {
strides_uniform_ = ParamsBuffer(
storage_.context_, utils::make_whcn_ivec4(unsqueezed_strides_));
if (!uniforms_.buffer()) {
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
}
if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
const auto allocation_size =
unsqueezed_strides_.size() * sizeof(unsqueezed_strides_[0]);
VK_CHECK_COND(
(uniforms_size_ + allocation_size) <= kMaxUniformBufferSize,
"Uniform data allocation has exceeded Tensor uniform buffer size");
unsqueezed_strides_offset_ = uniforms_size_;
uniforms_size_ += allocation_size;
uniforms_.update(
utils::make_whcn_ivec4(unsqueezed_strides_),
unsqueezed_strides_offset_);
}
return vkapi::BufferBindInfo(strides_uniform_.buffer());
return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_);
}

const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
if (!logical_limits_uniform_.buffer()) {
logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
if (!uniforms_.buffer()) {
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
}
return vkapi::BufferBindInfo(logical_limits_uniform_.buffer());
if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
VK_CHECK_COND(
(uniforms_size_ + sizeof(logical_limits_)) <= kMaxUniformBufferSize,
"Uniform data allocation has exceeded Tensor uniform buffer size");
logical_limits_uniform_offset_ = uniforms_size_;
uniforms_size_ += sizeof(logical_limits_);
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
}
return vkapi::BufferBindInfo(
uniforms_.buffer(), logical_limits_uniform_offset_);
}

const vkapi::BufferBindInfo vTensor::numel_ubo() {
if (!numel_uniform_.buffer()) {
numel_uniform_ = ParamsBuffer(storage_.context_, numel_);
if (!uniforms_.buffer()) {
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
}
if (numel_uniform_offset_ == kUniformOffsetUnset) {
VK_CHECK_COND(
(uniforms_size_ + sizeof(numel_)) <= kMaxUniformBufferSize,
"Uniform data allocation has exceeded Tensor uniform buffer size");
numel_uniform_offset_ = uniforms_size_;
uniforms_size_ += sizeof(numel_);
uniforms_.update(numel_, numel_uniform_offset_);
}
return vkapi::BufferBindInfo(numel_uniform_.buffer());
return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
}

size_t vTensor::staging_buffer_numel() const {
Expand Down Expand Up @@ -690,17 +734,19 @@ void vTensor::update_metadata() {
set_logical_limits(
calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));

if (sizes_uniform_.buffer()) {
sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
if (sizes_uniform_offset_ != kUniformOffsetUnset) {
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
}
if (strides_uniform_.buffer()) {
strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_));
if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
uniforms_.update(
utils::make_whcn_ivec4(unsqueezed_strides_),
unsqueezed_strides_offset_);
}
if (numel_uniform_.buffer()) {
numel_uniform_.update(numel_);
if (numel_uniform_offset_ != kUniformOffsetUnset) {
uniforms_.update(numel_, numel_uniform_offset_);
}
if (logical_limits_uniform_.buffer()) {
logical_limits_uniform_.update(logical_limits_);
if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
}
}

Expand Down
24 changes: 16 additions & 8 deletions backends/vulkan/runtime/api/containers/Tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,18 +297,26 @@ class vTensor final {
TextureLimits logical_limits_;

/*
* Utility GPU buffers that can be passed to shaders in order to convey tensor
* metadata. These buffers will be initialized the first time they are
* accessed via the corresponding *_ubo() function, and their contents will be
* updated whenever virtual_resize() is called.
* Utility GPU buffer that can be passed to shaders in order to convey tensor
* metadata. Uniform buffer will be initialized only the first time a ubo is
* requested. Buffer offsets will be initialized the first time they are
* accessed via the corresponding *_ubo() function. Uniform buffer's contents
* will be updated whenever virtual_resize() is called.
*
* Refer to the comments for the corresponding *_ubo() functions for more
* context about the data contained in each buffer.
*/
ParamsBuffer sizes_uniform_;
ParamsBuffer strides_uniform_;
ParamsBuffer numel_uniform_;
ParamsBuffer logical_limits_uniform_;
ParamsBuffer uniforms_;
uint32_t uniforms_size_;
uint32_t sizes_uniform_offset_;
uint32_t unsqueezed_strides_offset_;
uint32_t numel_uniform_offset_;
uint32_t logical_limits_uniform_offset_;

// Total size of tensor's uniform buffer
constexpr static uint32_t kMaxUniformBufferSize = 4 * 20;
// Initial value of uniform buffer offsets
constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;

vTensorStorage storage_;

Expand Down
6 changes: 4 additions & 2 deletions backends/vulkan/runtime/vk_api/Descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ namespace vkapi {
BufferBindInfo::BufferBindInfo()
: handle(VK_NULL_HANDLE), offset(0u), range(0u) {}

BufferBindInfo::BufferBindInfo(const VulkanBuffer& buffer_p)
BufferBindInfo::BufferBindInfo(
const VulkanBuffer& buffer_p,
const uint32_t offset_p)
: handle(buffer_p.handle()),
offset(buffer_p.mem_offset()),
offset(buffer_p.mem_offset() + offset_p),
range(buffer_p.mem_range()) {}

//
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/vk_api/Descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ struct BufferBindInfo final {
VkDeviceSize range;

BufferBindInfo();
BufferBindInfo(const VulkanBuffer& buffer_p);
BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u);
};

struct ParamsBindList final {
Expand Down
4 changes: 2 additions & 2 deletions backends/vulkan/runtime/vk_api/memory/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ class MemoryMap final {

public:
template <typename T>
T* data() {
return reinterpret_cast<T*>(data_);
T* data(const uint32_t offset = 0) {
return reinterpret_cast<T*>(static_cast<uint8_t*>(data_) + offset);
}

inline size_t nbytes() {
Expand Down

0 comments on commit 1e976f5

Please sign in to comment.