From 28969713c3642b62b63fe3b2417530357743c392 Mon Sep 17 00:00:00 2001
From: Akhmed Rakhmati
Date: Wed, 22 May 2024 20:23:21 +0000
Subject: [PATCH] #8579: change TTNN_TENSOR_PRINT_PROFILE from inline to extern

---
 tt_eager/tensor/tensor_impl.cpp             |   2 +
 tt_eager/tensor/tensor_impl.hpp             |   2 +-
 .../tt_dnn/op_library/operation_history.cpp |  15 +-
 .../tt_dnn/op_library/operation_history.hpp |  16 +-
 tt_metal/impl/buffers/buffer.cpp            |  98 +++++----
 tt_metal/impl/buffers/buffer.hpp            | 193 ++++++++----------
 6 files changed, 167 insertions(+), 159 deletions(-)

diff --git a/tt_eager/tensor/tensor_impl.cpp b/tt_eager/tensor/tensor_impl.cpp
index f4f87e03385..f2f31beaca7 100644
--- a/tt_eager/tensor/tensor_impl.cpp
+++ b/tt_eager/tensor/tensor_impl.cpp
@@ -11,6 +11,8 @@ namespace tt_metal {
 
 namespace tensor_impl {
 
+TensorPrintProfile TTNN_TENSOR_PRINT_PROFILE = TensorPrintProfile::Short;
+
 std::ostream& operator<<(std::ostream& os, const DataType& dtype) {
     switch (dtype) {
         case DataType::BFLOAT8_B: os << "bfloat8_b"; break;
diff --git a/tt_eager/tensor/tensor_impl.hpp b/tt_eager/tensor/tensor_impl.hpp
index 025ad977b3f..7c8f71b25f1 100644
--- a/tt_eager/tensor/tensor_impl.hpp
+++ b/tt_eager/tensor/tensor_impl.hpp
@@ -707,7 +707,7 @@ enum class TensorPrintProfile {
     Full,
 };
 
-inline TensorPrintProfile TTNN_TENSOR_PRINT_PROFILE = TensorPrintProfile::Short;
+extern TensorPrintProfile TTNN_TENSOR_PRINT_PROFILE;
 
 namespace detail {
 
diff --git a/tt_eager/tt_dnn/op_library/operation_history.cpp b/tt_eager/tt_dnn/op_library/operation_history.cpp
index f5b5086fa91..688425b36c7 100644
--- a/tt_eager/tt_dnn/op_library/operation_history.cpp
+++ b/tt_eager/tt_dnn/op_library/operation_history.cpp
@@ -8,16 +8,13 @@ namespace tt {
 
 namespace tt_metal {
 
-
 #ifdef DEBUG
 
 namespace operation_history {
 
 namespace detail {
 
-OperationHistory::~OperationHistory() {
-    this->dump_to_csv();
-}
+OperationHistory::~OperationHistory() { this->dump_to_csv(); }
 
 void OperationHistory::append(OperationRecord&& record) {
     std::scoped_lock lock(op_history_mutex);
@@ -132,15 +129,13 @@ void OperationHistory::clear() {
     this->records.clear();
 }
 
+OperationHistory OPERATION_HISTORY{};
+
 }  // namespace detail
 
-const char* csv_file_name() {
-    return std::getenv("OPERATION_HISTORY_CSV");
-}
+const char* csv_file_name() { return std::getenv("OPERATION_HISTORY_CSV"); }
 
-bool enabled() {
-    return csv_file_name() != nullptr;
-}
+bool enabled() { return csv_file_name() != nullptr; }
 
 void dump_to_csv() { detail::OPERATION_HISTORY.dump_to_csv(); }
 void clear() { detail::OPERATION_HISTORY.clear(); }
diff --git a/tt_eager/tt_dnn/op_library/operation_history.hpp b/tt_eager/tt_dnn/op_library/operation_history.hpp
index be338f507bf..fe97edab4a9 100644
--- a/tt_eager/tt_dnn/op_library/operation_history.hpp
+++ b/tt_eager/tt_dnn/op_library/operation_history.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include
+
 #include "tt_dnn/op_library/operation.hpp"
 
 namespace tt {
@@ -22,10 +23,15 @@ struct TensorRecord {
     const Layout layout;
     const std::optional<MemoryConfig> memory_config;
 
-    static constexpr auto attribute_names = std::make_tuple("storage_type", "shape", "data_type", "layout", "memory_config");
+    static constexpr auto attribute_names =
+        std::make_tuple("storage_type", "shape", "data_type", "layout", "memory_config");
     const auto attribute_values() const {
         return std::make_tuple(
-            std::cref(this->storage_type), std::cref(this->shape), std::cref(this->data_type), std::cref(this->layout), std::cref(this->memory_config));
+            std::cref(this->storage_type),
+            std::cref(this->shape),
+            std::cref(this->data_type),
+            std::cref(this->layout),
+            std::cref(this->memory_config));
     }
 };
 
@@ -54,12 +60,12 @@ struct OperationHistory {
     std::vector<OperationRecord> records;
 };
 
-inline OperationHistory OPERATION_HISTORY{};
+extern OperationHistory OPERATION_HISTORY;
 
 }  // namespace detail
 
-template<typename ... Args>
-inline void append(Args&& ... args) {
+template <typename... Args>
+inline void append(Args&&... args) {
     detail::OPERATION_HISTORY.append(std::forward<Args>(args)...);
 }
 
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 45c2a6312ad..87761774c7c 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -4,8 +4,8 @@
 
 #include "tt_metal/impl/buffers/buffer.hpp"
 
-#include "tt_metal/common/assert.hpp"
 #include "llrt/llrt.hpp"
+#include "tt_metal/common/assert.hpp"
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
 #include "tt_metal/hostdevcommon/common_values.hpp"
@@ -17,28 +17,36 @@ namespace tt {
 
 namespace tt_metal {
 
-bool is_sharded(const TensorMemoryLayout & layout){
+bool is_sharded(const TensorMemoryLayout &layout) {
     return (
-        layout == TensorMemoryLayout::HEIGHT_SHARDED ||
-        layout == TensorMemoryLayout::WIDTH_SHARDED ||
-        layout == TensorMemoryLayout::BLOCK_SHARDED );
+        layout == TensorMemoryLayout::HEIGHT_SHARDED || layout == TensorMemoryLayout::WIDTH_SHARDED ||
+        layout == TensorMemoryLayout::BLOCK_SHARDED);
 }
 
-
-void validate_buffer_size_and_page_size(uint64_t size, uint64_t page_size, const BufferType &buffer_type, const TensorMemoryLayout &buffer_layout, std::optional<ShardSpecBuffer> shard_parameters) {
+void validate_buffer_size_and_page_size(
+    uint64_t size,
+    uint64_t page_size,
+    const BufferType &buffer_type,
+    const TensorMemoryLayout &buffer_layout,
+    std::optional<ShardSpecBuffer> shard_parameters) {
     TT_FATAL(size != 0 and page_size != 0, "Buffer size and page size should be larger than 0 bytes!");
     bool valid_page_size = (size % page_size == 0);
-    TT_FATAL(valid_page_size, "For valid non-interleaved buffers page size {} must equal buffer size {}. For interleaved-buffers page size should be divisible by buffer size", page_size, size);
-    TT_FATAL(page_size % sizeof(uint32_t) == 0, "Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");
-    if(buffer_layout == TensorMemoryLayout::SINGLE_BANK){
-        TT_ASSERT(page_size == size , "Continguous buffer must be one contiguous page");
-    }
-    else if(is_sharded(buffer_layout)){
-        TT_ASSERT(shard_parameters != std::nullopt , "Sharded buffers must have a core grid assigned");
+    TT_FATAL(
+        valid_page_size,
+        "For valid non-interleaved buffers page size {} must equal buffer size {}. For interleaved buffers, buffer size "
+        "should be divisible by page size",
+        page_size,
+        size);
+    TT_FATAL(
+        page_size % sizeof(uint32_t) == 0,
+        "Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");
+    if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) {
+        TT_ASSERT(page_size == size, "Contiguous buffer must be one contiguous page");
+    } else if (is_sharded(buffer_layout)) {
+        TT_ASSERT(shard_parameters != std::nullopt, "Sharded buffers must have a core grid assigned");
     }
 }
 
-
 inline std::tuple<std::vector<std::vector<uint32_t>>, std::vector<std::vector<uint32_t>>> core_to_host_pages(
     const uint32_t &total_pages,
     const uint32_t &pages_per_shard,
@@ -105,12 +113,20 @@ inline std::tuple>, std::vector
 
-Buffer::Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type,
-    const TensorMemoryLayout buffer_layout, std::optional<ShardSpecBuffer> shard_parameters,
-    bool allocate)
-    : device_(device), size_(size), page_size_(page_size), buffer_type_(buffer_type), buffer_layout_(buffer_layout), shard_parameters_(shard_parameters) {
+Buffer::Buffer(
+    Device *device,
+    uint64_t size,
+    uint64_t page_size,
+    const BufferType buffer_type,
+    const TensorMemoryLayout buffer_layout,
+    std::optional<ShardSpecBuffer> shard_parameters,
+    bool allocate) :
+    device_(device),
+    size_(size),
+    page_size_(page_size),
+    buffer_type_(buffer_type),
+    buffer_layout_(buffer_layout),
+    shard_parameters_(shard_parameters) {
     TT_FATAL(this->device_ != nullptr and this->device_->allocator_ != nullptr);
     validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters);
     if (allocate) {
@@ -118,8 +134,6 @@ Buffer::Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferTy
     }
 }
 
-
-
 BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) {
     BufferPageMapping buffer_page_mapping;
     bool row_major = buffer.shard_spec().orientation() == ShardOrientation::ROW_MAJOR;
@@ -128,7 +142,7 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) {
     buffer_page_mapping.all_cores_ = corerange_to_cores(buffer.shard_spec().grid(), num_cores, row_major);
     TT_ASSERT(num_cores == buffer_page_mapping.all_cores_.size());
     uint32_t core_id = 0;
-    for (const auto& core : buffer_page_mapping.all_cores_) {
+    for (const auto &core : buffer_page_mapping.all_cores_) {
         buffer_page_mapping.core_to_core_id_.insert({core, core_id});
         core_id++;
     }
@@ -180,7 +194,6 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer) {
     return buffer_page_mapping;
 }
 
-
 Buffer::Buffer(const Buffer &other) :
     device_(other.device_),
     size_(other.size_),
@@ -204,9 +217,16 @@ Buffer &Buffer::operator=(const Buffer &other) {
     return *this;
 }
 
-Buffer::Buffer(Buffer &&other) : device_(other.device_), size_(other.size_), address_(other.address_), page_size_(other.page_size_), buffer_type_(other.buffer_type_) ,
-    buffer_layout_(other.buffer_layout_), shard_parameters_(other.shard_parameters_) {
-    // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is transferred to `this`
+Buffer::Buffer(Buffer &&other) :
+    device_(other.device_),
+    size_(other.size_),
+    address_(other.address_),
+    page_size_(other.page_size_),
+    buffer_type_(other.buffer_type_),
+    buffer_layout_(other.buffer_layout_),
+    shard_parameters_(other.shard_parameters_) {
+    // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is
+    // transferred to `this`
     other.device_ = nullptr;
 }
 
@@ -219,7 +239,8 @@ Buffer &Buffer::operator=(Buffer &&other) {
         this->buffer_type_ = other.buffer_type_;
         this->buffer_layout_ = other.buffer_layout_;
         this->shard_parameters_ = other.shard_parameters_;
-        // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is transferred to `this`
+        // Set `other.device_` to be nullptr so destroying other does not deallocate reserved address space that is
+        // transferred to `this`
         other.device_ = nullptr;
     }
     return *this;
 }
@@ -257,15 +278,12 @@ CoreCoord Buffer::noc_coordinates(uint32_t bank_id) const {
         case BufferType::SYSTEM_MEMORY: {
             TT_THROW("Host buffer is located in system memory! Cannot retrieve NoC coordinates for it");
         } break;
-        default:
-            TT_ASSERT(false && "Unsupported buffer type!");
+        default: TT_ASSERT(false && "Unsupported buffer type!");
     }
     return CoreCoord{0, 0};
 }
 
-CoreCoord Buffer::noc_coordinates() const {
-    return this->noc_coordinates(0);
-}
+CoreCoord Buffer::noc_coordinates() const { return this->noc_coordinates(0); }
 
 uint64_t Buffer::page_address(uint32_t bank_id, uint32_t page_index) const {
     auto num_banks = this->device_->num_banks(this->buffer_type_);
@@ -301,9 +319,7 @@ void Buffer::deallocate() {
     detail::DeallocateBuffer(this);
 }
 
-Buffer::~Buffer() {
-    this->deallocate();
-}
+Buffer::~Buffer() { this->deallocate(); }
 
 tt::stl::reflection::Attributes ShardSpec::attributes() const {
     return {
@@ -314,7 +330,7 @@ tt::stl::reflection::Attributes ShardSpec::attributes() const {
     };
 }
 
-bool operator==(const ShardSpec& spec_a, const ShardSpec& spec_b) {
+bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b) {
     if (spec_a.grid != spec_b.grid) {
         return false;
     }
@@ -330,8 +346,10 @@ bool operator==(const ShardSpec& spec_a, const ShardSpec& spec_b) {
     return true;
 }
 
-bool operator!=(const ShardSpec& spec_a, const ShardSpec& spec_b) {
-    return not (spec_a == spec_b);
+bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b) { return not(spec_a == spec_b); }
+
+namespace detail {
+buffer_map_t BUFFER_MAP = {};
+}
 
 }  // namespace tt_metal
 
diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp
index 3ed7829c2cc..48feb992ff4 100644
--- a/tt_metal/impl/buffers/buffer.hpp
+++ b/tt_metal/impl/buffers/buffer.hpp
@@ -4,18 +4,18 @@
 
 #pragma once
 
-#include "common/tt_backend_api_types.hpp"
-#include "common/core_coord.h"
+#include
+#include
+#include
+
 #include "common/bfloat16.hpp"
+#include "common/core_coord.h"
+#include "common/tt_backend_api_types.hpp"
 #include "hostdevcommon/common_values.hpp"
 #include "tt_metal/common/constants.hpp"
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/tt_stl/concepts.hpp"
 #include "tt_metal/tt_stl/reflection.hpp"
-#include "tt_metal/common/math.hpp"
-#include
-#include
-#include
 
 namespace tt {
 
@@ -43,82 +43,71 @@ enum class ShardOrientation {
     COL_MAJOR,
 };
 
-
 struct ShardSpec {
     CoreRangeSet grid;
     std::array<uint32_t, 2> shape;
     ShardOrientation orientation = ShardOrientation::ROW_MAJOR;
     bool halo = false;
 
-    ShardSpec(const CoreRangeSet & core_sets_,
-        const std::array<uint32_t, 2> & shard_shape_,
-        const ShardOrientation & shard_orientation_ = ShardOrientation::ROW_MAJOR,
-        const bool & halo_ = false):
-        grid(core_sets_), shape(shard_shape_),
-        orientation(shard_orientation_), halo(halo_)
-        {;}
+    ShardSpec(
+        const CoreRangeSet &core_sets_,
+        const std::array<uint32_t, 2> &shard_shape_,
+        const ShardOrientation &shard_orientation_ = ShardOrientation::ROW_MAJOR,
+        const bool &halo_ = false) :
+        grid(core_sets_), shape(shard_shape_), orientation(shard_orientation_), halo(halo_) {
+        ;
+    }
 
-    const uint32_t num_cores() const {return this->grid.num_cores();}
+    const uint32_t num_cores() const { return this->grid.num_cores(); }
     const uint32_t numel() const { return this->shape[0] * this->shape[1]; }
 
     tt::stl::reflection::Attributes attributes() const;
-
 };
 
-bool operator==(const ShardSpec& spec_a, const ShardSpec& spec_b);
-bool operator!=(const ShardSpec& spec_a, const ShardSpec& spec_b);
+bool operator==(const ShardSpec &spec_a, const ShardSpec &spec_b);
+bool operator!=(const ShardSpec &spec_a, const ShardSpec &spec_b);
 
 struct ShardSpecBuffer {
     ShardSpec tensor_shard_spec;
     std::array<uint32_t, 2> page_shape;
-    std::array<uint32_t, 2> tensor2d_shape;
-    ShardSpecBuffer(const CoreRangeSet & core_sets_,
-        const std::array<uint32_t, 2> & shard_shape_,
-        const ShardOrientation & shard_orientation_,
-        const bool & halo_,
-        const std::array<uint32_t, 2> & page_shape,
-        const std::array<uint32_t, 2> & tensor2d_shape
-    ): tensor_shard_spec(core_sets_, shard_shape_, shard_orientation_, halo_)
-    {
-        this->page_shape = page_shape;
-        this->tensor2d_shape = tensor2d_shape;
-    }
+    std::array<uint32_t, 2> tensor2d_shape;
     ShardSpecBuffer(
-        const ShardSpec & shard_spec,
-        const std::array<uint32_t, 2> & page_shape,
-        const std::array<uint32_t, 2> & tensor2d_shape
-    ): tensor_shard_spec(shard_spec)
-    {
-        this->page_shape = page_shape;
-        this-> tensor2d_shape = tensor2d_shape;
-    }
-    CoreRangeSet grid() const {
-        return tensor_shard_spec.grid;
+        const CoreRangeSet &core_sets_,
+        const std::array<uint32_t, 2> &shard_shape_,
+        const ShardOrientation &shard_orientation_,
+        const bool &halo_,
+        const std::array<uint32_t, 2> &page_shape,
+        const std::array<uint32_t, 2> &tensor2d_shape) :
+        tensor_shard_spec(core_sets_, shard_shape_, shard_orientation_, halo_) {
+        this->page_shape = page_shape;
+        this->tensor2d_shape = tensor2d_shape;
     }
-    std::array<uint32_t, 2> shape() const {
-        return tensor_shard_spec.shape;
-    }
-    ShardOrientation orientation() const {
-        return tensor_shard_spec.orientation;
-    }
-    bool halo() const {
-        return tensor_shard_spec.halo;
+    ShardSpecBuffer(
+        const ShardSpec &shard_spec,
+        const std::array<uint32_t, 2> &page_shape,
+        const std::array<uint32_t, 2> &tensor2d_shape) :
+        tensor_shard_spec(shard_spec) {
+        this->page_shape = page_shape;
+        this->tensor2d_shape = tensor2d_shape;
     }
+    CoreRangeSet grid() const { return tensor_shard_spec.grid; }
+    std::array<uint32_t, 2> shape() const { return tensor_shard_spec.shape; }
+    ShardOrientation orientation() const { return tensor_shard_spec.orientation; }
+    bool halo() const { return tensor_shard_spec.halo; }
     std::array<uint32_t, 2> shape_in_pages() const {
         auto width_in_pages = tensor_shard_spec.shape[0] / page_shape[0];
         auto height_in_pages = tensor_shard_spec.shape[1] / page_shape[1];
         return {width_in_pages, height_in_pages};
     }
-    uint32_t size() const{
+    uint32_t size() const {
         auto shape_in_pages_ = this->shape_in_pages();
         return shape_in_pages_[0] * shape_in_pages_[1];
     }
 };
 
-
 struct BufferConfig {
     Device *device;
-    uint64_t size; // Size in bytes
-    uint64_t page_size; // Size of unit being interleaved. For non-interleaved buffers: size == page_size
+    uint64_t size;       // Size in bytes
+    uint64_t page_size;  // Size of unit being interleaved. For non-interleaved buffers: size == page_size
     BufferType buffer_type;
     TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED;
 };
@@ -129,32 +118,29 @@ typedef BufferConfig InterleavedBufferConfig;
 
 // designator constructor
 struct ShardedBufferConfig {
     Device *device;
-    uint64_t size; // Size in bytes
-    uint64_t page_size; // Size of unit being interleaved. For non-interleaved buffers: size == page_size
+    uint64_t size;       // Size in bytes
+    uint64_t page_size;  // Size of unit being interleaved. For non-interleaved buffers: size == page_size
     BufferType buffer_type = BufferType::L1;
     TensorMemoryLayout buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED;
     ShardSpecBuffer shard_parameters;
 };
 
-bool is_sharded(const TensorMemoryLayout & layout);
+bool is_sharded(const TensorMemoryLayout &layout);
 
 struct BufferPageMapping {
-    std::vector< CoreCoord> all_cores_;
-    std::vector< uint32_t> core_bank_indices_;
-    std::vector< std::vector<uint32_t> > core_host_page_indices_;
+    std::vector<CoreCoord> all_cores_;
+    std::vector<uint32_t> core_bank_indices_;
+    std::vector<std::vector<uint32_t>> core_host_page_indices_;
     std::vector<uint32_t> dev_page_to_core_mapping_;
-    //some dev pages don't have mapping to host (in case of padding)
-    std::vector< std::optional<uint32_t> > dev_page_to_host_page_mapping_;
+    // some dev pages don't have mapping to host (in case of padding)
+    std::vector<std::optional<uint32_t>> dev_page_to_host_page_mapping_;
     std::vector<uint32_t> host_page_to_dev_page_mapping_;
     std::unordered_map<CoreCoord, uint32_t> core_to_core_id_;
-    std::vector< uint32_t> host_page_to_local_shard_page_mapping_;
-    std::vector < std::array<uint32_t, 2> > core_shard_shape_;
-
+    std::vector<uint32_t> host_page_to_local_shard_page_mapping_;
+    std::vector<std::array<uint32_t, 2>> core_shard_shape_;
 };
 
-
-
 class Buffer {
    public:
     Buffer() :
@@ -163,16 +149,20 @@ class Buffer {
         buffer_layout_(TensorMemoryLayout::INTERLEAVED),
         shard_parameters_(std::nullopt) {}
 
-    Buffer(Device *device, uint64_t size, uint64_t page_size, const BufferType buffer_type,
-        const TensorMemoryLayout buffer_layout=TensorMemoryLayout::INTERLEAVED,
+    Buffer(
+        Device *device,
+        uint64_t size,
+        uint64_t page_size,
+        const BufferType buffer_type,
+        const TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
         std::optional<ShardSpecBuffer> shard_parameter = std::nullopt,
         bool allocate = true);
 
     Buffer(const Buffer &other);
-    Buffer& operator=(const Buffer &other);
+    Buffer &operator=(const Buffer &other);
 
     Buffer(Buffer &&other);
-    Buffer& operator=(Buffer &&other);
+    Buffer &operator=(Buffer &&other);
 
     ~Buffer();
 
     Device *device() const { return device_; }
@@ -192,8 +182,7 @@ class Buffer {
     uint32_t num_dev_pages() const {
         if (!is_sharded(this->buffer_layout_)) {
             return this->num_pages();
-        }
-        else {
+        } else {
             return this->shard_spec().size() * this->num_cores();
         }
     }
@@ -216,22 +205,21 @@ class Buffer {
 
     uint64_t page_address(uint32_t bank_id, uint32_t page_index) const;
 
-
     // SHARDED API STARTS HERE
     // TODO: WILL SEPARATE INTO SHARDED BUFFER CLASS
     uint64_t sharded_page_address(uint32_t bank_id, uint32_t page_index) const;
 
     ShardSpecBuffer shard_spec() const {
-        TT_ASSERT(is_sharded(this->buffer_layout_) , "Buffer not sharded");
+        TT_ASSERT(is_sharded(this->buffer_layout_), "Buffer not sharded");
         TT_ASSERT(shard_parameters_.has_value());
         return this->shard_parameters_.value();
     }
 
-    uint32_t num_cores() const{
-        if(!is_sharded(this->buffer_layout_))
+    uint32_t num_cores() const {
+        if (!is_sharded(this->buffer_layout_))
             return 1;
-        else{
+        else {
             return this->shard_spec().tensor_shard_spec.grid.num_cores();
         }
     }
@@ -245,49 +233,48 @@ class Buffer {
     uint64_t translate_page_address(uint64_t offset, uint32_t bank_id) const;
 
     Device *device_;
-    uint64_t size_; // Size in bytes
-    uint64_t address_; // Address of buffer
-    uint64_t page_size_; // Size of unit being interleaved. For non-interleaved buffers: size == page_size
+    uint64_t size_;       // Size in bytes
+    uint64_t address_;    // Address of buffer
+    uint64_t page_size_;  // Size of unit being interleaved. For non-interleaved buffers: size == page_size
     BufferType buffer_type_;
     TensorMemoryLayout buffer_layout_;
     std::optional<ShardSpecBuffer> shard_parameters_;
 };
 
-
 BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer);
 
 namespace detail {
 using PageAddress = uint32_t;
 using Deviceid = uint32_t;
 
-class buffer_map {
-  public:
-   void insert(std::tuple<Deviceid, PageAddress> buf_attr, Buffer * buffer) {
-       std::scoped_lock lock(this->map_mutex);
-       this->map.insert({buf_attr, buffer});
-   }
+class buffer_map_t {
+   public:
+    void insert(std::tuple<Deviceid, PageAddress> buf_attr, Buffer *buffer) {
+        std::scoped_lock lock(this->map_mutex);
+        this->map.insert({buf_attr, buffer});
+    }
 
-   void erase(std::tuple<Deviceid, PageAddress> buf_attr) {
-       std::scoped_lock lock(this->map_mutex);
-       this->map.erase(buf_attr);
-   }
+    void erase(std::tuple<Deviceid, PageAddress> buf_attr) {
+        std::scoped_lock lock(this->map_mutex);
+        this->map.erase(buf_attr);
+    }
 
-   void clear() {
-       std::scoped_lock lock(this->map_mutex);
-       this->map.clear();
-   }
+    void clear() {
+        std::scoped_lock lock(this->map_mutex);
+        this->map.clear();
+    }
 
-   std::map<std::tuple<Deviceid, PageAddress>, Buffer *> value() {
-       std::scoped_lock lock(this->map_mutex);
-       return this->map;
-   }
+    std::map<std::tuple<Deviceid, PageAddress>, Buffer *> value() {
+        std::scoped_lock lock(this->map_mutex);
+        return this->map;
+    }
 
-  private:
-   std::mutex map_mutex;
-   std::map<std::tuple<Deviceid, PageAddress>, Buffer *> map = {};
+   private:
+    std::mutex map_mutex;
+    std::map<std::tuple<Deviceid, PageAddress>, Buffer *> map = {};
 };
 
-inline buffer_map BUFFER_MAP;
+extern buffer_map_t BUFFER_MAP;
 
 }  // namespace detail
 
 using HostDataType = std::variant<
     const std::shared_ptr>,
    const std::shared_ptr>,
     const std::shared_ptr>,
     const std::shared_ptr>,
-    const void*>;
+    const void *>;
 
 }  // namespace tt_metal
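
--
Note on the pattern applied above (not part of the diff): a C++17 `inline`
variable at namespace scope, such as the old `inline TensorPrintProfile
TTNN_TENSOR_PRINT_PROFILE`, is defined in every translation unit that includes
the header, and the linker is left to merge the copies. The patch switches each
such global (TTNN_TENSOR_PRINT_PROFILE, OPERATION_HISTORY, BUFFER_MAP) to an
`extern` declaration in the header plus exactly one definition in a .cpp file.
A minimal sketch of the same declaration/definition split, using hypothetical
names (`LogLevel`, `LOG_LEVEL`, `config.hpp`, `config.cpp`) rather than code
from this repository:

    // config.hpp
    #pragma once

    enum class LogLevel { Short, Full };

    // Before: `inline LogLevel LOG_LEVEL = LogLevel::Short;` here would
    // define the variable in every translation unit that includes this
    // header. After: the header only declares it.
    extern LogLevel LOG_LEVEL;

    // config.cpp -- the single definition lives in one translation unit.
    #include "config.hpp"
    LogLevel LOG_LEVEL = LogLevel::Short;

    // any_user.cpp -- every user now refers to the same object.
    #include "config.hpp"
    bool print_everything() { return LOG_LEVEL == LogLevel::Full; }

One plausible motivation (an assumption; the commit message does not say): when
the same header is compiled into several shared libraries with hidden symbol
visibility, each library can end up with its own copy of an `inline` variable,
so a value set through one library is not observed through another. A single
`extern` definition guarantees one object with one address program-wide.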