diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 20dd7fb9403..4e970d4ffb5 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -3,95 +3,83 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt_metal/impl/program/program.hpp" -#include "tt_metal/llrt/llrt.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/impl/buffers/semaphore.hpp" -#include "tt_metal/impl/allocator/allocator.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/reports/compilation_reporter.hpp" -#include "tt_metal/detail/reports/memory_reporter.hpp" + #include "common/executor.hpp" -#include "tt_metal/detail/persistent_kernel_cache.hpp" +#include "tools/profiler/profiler.hpp" #include "tt_metal/detail/kernel_cache.hpp" +#include "tt_metal/detail/persistent_kernel_cache.hpp" +#include "tt_metal/detail/reports/compilation_reporter.hpp" +#include "tt_metal/detail/reports/memory_reporter.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/impl/allocator/allocator.hpp" +#include "tt_metal/impl/buffers/semaphore.hpp" +#include "tt_metal/impl/debug/dprint_server.hpp" #include "tt_metal/jit_build/genfiles.hpp" - +#include "tt_metal/llrt/llrt.hpp" #include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp" -#include "tools/profiler/profiler.hpp" -#include "tt_metal/impl/debug/dprint_server.hpp" namespace tt::tt_metal { +namespace { +std::atomic enable_persistent_kernel_cache = false; -namespace{ - std::atomic enable_persistent_kernel_cache = false; - - void GenerateBinaries(Device *device, JitBuildOptions& build_options, std::shared_ptr kernel) { - ZoneScoped; - const std::string tracyPrefix = "GenerateBinaries_"; - ZoneName( (tracyPrefix + build_options.name).c_str(), build_options.name.length() + tracyPrefix.length()); - try { - jit_build_genfiles_descriptors(device->build_env(), build_options); - kernel->generate_binaries(device, build_options); - } catch (std::runtime_error &ex) { - TT_THROW("Failed to generate binaries for {} {}", kernel->name(), ex.what()); - } - } - - - #ifdef GENERATE_HASH_LOG - #include - #endif - - size_t KernelCompileHash( - const std::shared_ptr kernel, JitBuildOptions &build_options, uint32_t build_key) { - // Account for device id in hash because generated headers are dependent on harvesting config, which can differ per device - // This can be removed with https://github.com/tenstorrent/tt-metal/issues/3381 - - // Also account for watcher/dprint enabled in hash because they enable additional code to - // be compiled into the kernel. - string compile_hash_str = fmt::format( - "{}_{}_{}_{}_{}", - build_key, - std::to_string(std::hash{}(build_options.hlk_desc)), - kernel->compute_hash(), - tt::llrt::OptionsG.get_watcher_enabled(), - tt::llrt::OptionsG.get_dprint_enabled()); - size_t compile_hash = std::hash{}(compile_hash_str); - - #ifdef GENERATE_HASH_LOG - static std::ofstream f("/tmp/hashlog.txt"); - static std::mutex mutex_; - { - unique_lock lock; - f << kernel->name() << " :: " - << build_key << "::" - << std::hash{}(build_options.hlk_desc) << " :: " - << kernel->compute_hash() << " :: " - << compile_hash_str << " " - << compile_hash << std::endl << std::flush; - } - #endif - return compile_hash; - } -} -namespace detail{ - void EnablePersistentKernelCache() - { - enable_persistent_kernel_cache = true; - } - - void DisablePersistentKernelCache() +void GenerateBinaries(Device *device, JitBuildOptions &build_options, std::shared_ptr kernel) { + ZoneScoped; + const std::string tracyPrefix = "GenerateBinaries_"; + ZoneName((tracyPrefix + build_options.name).c_str(), build_options.name.length() + tracyPrefix.length()); + try { + jit_build_genfiles_descriptors(device->build_env(), build_options); + kernel->generate_binaries(device, build_options); + } catch (std::runtime_error &ex) { + TT_THROW("Failed to generate binaries for {} {}", kernel->name(), ex.what()); + } +} + +#ifdef GENERATE_HASH_LOG +#include +#endif + +size_t KernelCompileHash(const std::shared_ptr kernel, JitBuildOptions &build_options, uint32_t build_key) { + // Account for device id in hash because generated headers are dependent on harvesting config, which can differ per + // device This can be removed with https://github.com/tenstorrent/tt-metal/issues/3381 + + // Also account for watcher/dprint enabled in hash because they enable additional code to + // be compiled into the kernel. + string compile_hash_str = fmt::format( + "{}_{}_{}_{}_{}", + build_key, + std::to_string(std::hash{}(build_options.hlk_desc)), + kernel->compute_hash(), + tt::llrt::OptionsG.get_watcher_enabled(), + tt::llrt::OptionsG.get_dprint_enabled()); + size_t compile_hash = std::hash{}(compile_hash_str); + +#ifdef GENERATE_HASH_LOG + static std::ofstream f("/tmp/hashlog.txt"); + static std::mutex mutex_; { - enable_persistent_kernel_cache = false; + unique_lock lock; + f << kernel->name() << " :: " << build_key << "::" << std::hash{}(build_options.hlk_desc) + << " :: " << kernel->compute_hash() << " :: " << compile_hash_str << " " << compile_hash << std::endl + << std::flush; } +#endif + return compile_hash; } +} // namespace +namespace detail { +void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } +void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } +} // namespace detail std::atomic Program::program_counter = 0; -Program::Program(): id(program_counter++),worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { +Program::Program() : + id(program_counter++), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { std::set supported_core_types = {CoreType::WORKER, CoreType::ETH}; - for (const auto& core_type : supported_core_types) { + for (const auto &core_type : supported_core_types) { kernels_.insert({core_type, {}}); grid_extent_.insert({core_type, {}}); kernel_groups_.insert({core_type, {}}); @@ -110,8 +98,9 @@ KernelHandle Program::add_kernel(std::shared_ptr kernel, const CoreType } std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { - //TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id, this->id); - // find coretype based on kernel_id + // TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id, + // this->id); + // find coretype based on kernel_id for (const auto &[core_type, kernels] : this->kernels_) { if (kernels.find(kernel_id) != kernels.end()) { return kernels.at(kernel_id); @@ -122,8 +111,7 @@ std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { return nullptr; } -KernelGroup::KernelGroup() : core_ranges({}) { -} +KernelGroup::KernelGroup() : core_ranges({}) {} KernelGroup::KernelGroup( const Program &program, @@ -147,7 +135,8 @@ KernelGroup::KernelGroup( if (brisc_id) { // Use brisc's noc if brisc specifies a noc this->launch_msg.enable_brisc = true; - this->launch_msg.brisc_noc_id = std::get(program.get_kernel(brisc_id.value())->config()).noc; + this->launch_msg.brisc_noc_id = + std::get(program.get_kernel(brisc_id.value())->config()).noc; this->launch_msg.brisc_watcher_kernel_id = program.get_kernel(brisc_id.value())->get_watcher_kernel_id(); } else { this->launch_msg.brisc_watcher_kernel_id = 0; @@ -196,14 +185,15 @@ CoreType KernelGroup::get_core_type() const { } }; -std::vector& Program::get_kernel_groups(const CoreType &core_type) { +std::vector &Program::get_kernel_groups(const CoreType &core_type) { update_kernel_groups(core_type); return kernel_groups_[core_type]; } -KernelGroup * Program::kernels_on_core(const CoreCoord &core, const CoreType &core_type) { +KernelGroup *Program::kernels_on_core(const CoreCoord &core, const CoreType &core_type) { update_kernel_groups(core_type); - if (core.x >= grid_extent_[core_type].x || core.y >= grid_extent_[core_type].y) return nullptr; + if (core.x >= grid_extent_[core_type].x || core.y >= grid_extent_[core_type].y) + return nullptr; uint8_t index = core_to_kernel_group_index_table_[core_type].at(core.y * grid_extent_[core_type].x + core.x); return (index == core_to_kernel_group_invalid_index) ? nullptr : &kernel_groups_[core_type].at(index); } @@ -215,33 +205,24 @@ struct KernelGroupInt { std::optional ncrisc_id = std::nullopt; std::optional erisc_id = std::nullopt; - bool operator==(const KernelGroupInt& b) const; + bool operator==(const KernelGroupInt &b) const; void update(RISCV riscv_processor, size_t kernel_idx) { switch (riscv_processor) { - case RISCV::BRISC: - this->brisc_id = static_cast(kernel_idx); - break; - case RISCV::NCRISC: - this->ncrisc_id = static_cast(kernel_idx); - break; - case RISCV::COMPUTE: - this->trisc_id = static_cast(kernel_idx); - break; - case RISCV::ERISC: - this->erisc_id = static_cast(kernel_idx); - break; - default: - TT_ASSERT(false, "Unsupported kernel processor!"); + case RISCV::BRISC: this->brisc_id = static_cast(kernel_idx); break; + case RISCV::NCRISC: this->ncrisc_id = static_cast(kernel_idx); break; + case RISCV::COMPUTE: this->trisc_id = static_cast(kernel_idx); break; + case RISCV::ERISC: this->erisc_id = static_cast(kernel_idx); break; + default: TT_ASSERT(false, "Unsupported kernel processor!"); } } }; -bool KernelGroupInt::operator==(const KernelGroupInt& b) const { +bool KernelGroupInt::operator==(const KernelGroupInt &b) const { return trisc_id == b.trisc_id && brisc_id == b.brisc_id && ncrisc_id == b.ncrisc_id && erisc_id == b.erisc_id; } struct KernelGroupIntHasher { - std::size_t operator()(const KernelGroupInt& x) const { + std::size_t operator()(const KernelGroupInt &x) const { return static_cast(x.erisc_id.value_or(0)) | static_cast(x.trisc_id.value_or(0)) | static_cast(x.brisc_id.value_or(0)) << 16 | static_cast(x.ncrisc_id.value_or(0)) << 32; } @@ -250,15 +231,18 @@ struct KernelGroupIntHasher { void Program::update_kernel_groups(const CoreType &core_type) { if (core_to_kernel_group_index_table_[core_type].size() == 0) { // Get the extent of the kernels in x, y - CoreCoord base = {std::numeric_limits::max(), - std::numeric_limits::max()}; + CoreCoord base = {std::numeric_limits::max(), std::numeric_limits::max()}; grid_extent_[core_type] = {0, 0}; for (auto [id, kernel] : kernels_[core_type]) { for (auto core : kernel->logical_cores()) { - if (core.x > grid_extent_[core_type].x) grid_extent_[core_type].x = core.x; - if (core.y > grid_extent_[core_type].y) grid_extent_[core_type].y = core.y; - if (core.x < base.x) base.x = core.x; - if (core.y < base.y) base.y = core.y; + if (core.x > grid_extent_[core_type].x) + grid_extent_[core_type].x = core.x; + if (core.y > grid_extent_[core_type].y) + grid_extent_[core_type].y = core.y; + if (core.x < base.x) + base.x = core.x; + if (core.y < base.y) + base.y = core.y; } } grid_extent_[core_type].x++; @@ -267,7 +251,7 @@ void Program::update_kernel_groups(const CoreType &core_type) { // grid maps cores to sets-of-kernels running on that core std::vector grid; grid.resize(grid_extent_[core_type].x * grid_extent_[core_type].y); - for (auto [id, kernel]: kernels_[core_type]) { + for (auto [id, kernel] : kernels_[core_type]) { for (auto core : kernel->logical_cores()) { int core_index = core.y * grid_extent_[core_type].x + core.x; grid[core_index].valid = true; @@ -281,7 +265,7 @@ void Program::update_kernel_groups(const CoreType &core_type) { for (auto x = base.x; x < grid_extent_[core_type].x; x++) { int index = y * grid_extent_[core_type].x + x; if (grid[index].valid) { - std::set& set = map[grid[index]]; + std::set &set = map[grid[index]]; set.insert(CoreRange({x, y}, {x, y})); } } @@ -292,9 +276,9 @@ void Program::update_kernel_groups(const CoreType &core_type) { TT_ASSERT(map.size() < core_to_kernel_group_invalid_index); kernel_groups_.reserve(map.size()); int index = 0; - core_to_kernel_group_index_table_[core_type].resize(grid_extent_[core_type].x * grid_extent_[core_type].y, core_to_kernel_group_invalid_index); - for (auto& kg_to_cores : map) { - + core_to_kernel_group_index_table_[core_type].resize( + grid_extent_[core_type].x * grid_extent_[core_type].y, core_to_kernel_group_invalid_index); + for (auto &kg_to_cores : map) { int last_cb_index = -1; // Map from core X,Y back to the unique KernelGroup @@ -333,7 +317,11 @@ void Program::update_kernel_groups(const CoreType &core_type) { void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size) { auto &last_region = this->l1_regions.back(); if (address < last_region.second) { - TT_THROW("Local buffer address {} has to append to last L1 region [{}, {}) or be at a higher address", address, last_region.first, last_region.second); + TT_THROW( + "Local buffer address {} has to append to last L1 region [{}, {}) or be at a higher address", + address, + last_region.first, + last_region.second); } if (address == last_region.second) { last_region.second += size; @@ -345,11 +333,11 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { this->invalidate_compile(); std::shared_ptr circular_buffer = std::make_shared(core_range_set, config); - // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory allocator + // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory + // allocator if (not circular_buffer->globally_allocated()) { this->invalidate_circular_buffer_allocation(); - } - else { + } else { circular_buffer->assign_global_address(); } // Mark which buffer indices are being used on each core the circular buffer is used on @@ -361,20 +349,28 @@ CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const for (uint32_t buffer_index : circular_buffer->buffer_indices()) { if (buffer_index > NUM_CIRCULAR_BUFFERS) { - TT_THROW("Invalid circular buffer index: {} should be between 0 and {}", buffer_index, NUM_CIRCULAR_BUFFERS); + TT_THROW( + "Invalid circular buffer index: {} should be between 0 and {}", + buffer_index, + NUM_CIRCULAR_BUFFERS); } if (cb_indices.to_ulong() & (1 << buffer_index)) { - TT_THROW("Invalid circular buffer index: Cannot add circular buffer at index {}, another circular buffer already exists", buffer_index); + TT_THROW( + "Invalid circular buffer index: Cannot add circular buffer at index {}, another circular " + "buffer already exists", + buffer_index); } cb_indices[buffer_index] = 1; } } } - // There is one CircularBufferAllocator per unique core range, create one if it does not already exist for current core range - auto val = std::find_if(cb_allocators_.begin(), cb_allocators_.end(), [&core_range](const CircularBufferAllocator &cb_allocator) { - return cb_allocator.core_range == core_range; - }); + // There is one CircularBufferAllocator per unique core range, create one if it does not already exist for + // current core range + auto val = std::find_if( + cb_allocators_.begin(), cb_allocators_.end(), [&core_range](const CircularBufferAllocator &cb_allocator) { + return cb_allocator.core_range == core_range; + }); if (val == cb_allocators_.end()) { this->cb_allocators_.emplace_back(core_range); } @@ -402,7 +398,7 @@ const std::vector> Program::circular_buffers_on_ return cbs_on_core; } -const std::vector> Program::circular_buffers_on_corerange(const CoreRange & cr) const { +const std::vector> Program::circular_buffers_on_corerange(const CoreRange &cr) const { std::vector> cbs_on_core; for (auto circular_buffer : circular_buffers_) { if (circular_buffer->is_on_logical_corerange(cr)) { @@ -460,8 +456,9 @@ void Program::allocate_circular_buffers() { for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) { if (cb_allocator.core_range.intersects(core_range)) { if (cb_allocator.core_range != core_range and computed_addr < cb_allocator.get_cb_region_end()) { - // Intersecting core range has already been marked to have allocation at this address. This could have been marked by a circular buffer on a core range disjoint from - // current `core_range` but also intersecting `cb_allocator.core_range` + // Intersecting core range has already been marked to have allocation at this address. This + // could have been marked by a circular buffer on a core range disjoint from current + // `core_range` but also intersecting `cb_allocator.core_range` continue; } cb_allocator.mark_address(computed_addr, circular_buffer->size()); @@ -469,7 +466,6 @@ void Program::allocate_circular_buffers() { } } - circular_buffer->set_locally_allocated_address(computed_addr); } this->local_circular_buffer_allocation_needed_ = false; @@ -480,7 +476,7 @@ void Program::validate_circular_buffer_region(const Device *device) const { // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core // Only compute with storage cores can have CBs and all compute with storage cores will have the same bank offset - const std::vector& bank_ids = + const std::vector &bank_ids = device->bank_ids_from_logical_core(BufferType::L1, *device->compute_cores_.begin()); std::optional lowest_address = allocator::lowest_occupied_l1_address(*device->allocator_, bank_ids[0]); uint32_t max_l1_size = device->l1_size_per_core(); @@ -488,43 +484,51 @@ void Program::validate_circular_buffer_region(const Device *device) const { for (const CircularBufferAllocator &cb_allocator : this->cb_allocators_) { uint64_t cb_region_end = cb_allocator.get_cb_region_end(); if (cb_region_end > max_l1_size) { - TT_THROW("Statically allocated circular buffers on core range {} grow to {} B which is beyond max L1 size of {} B", cb_allocator.core_range.str(), cb_region_end, max_l1_size); + TT_THROW( + "Statically allocated circular buffers on core range {} grow to {} B which is beyond max L1 size of {} " + "B", + cb_allocator.core_range.str(), + cb_region_end, + max_l1_size); } if (lowest_address.has_value() and lowest_address.value() < cb_region_end) { - TT_THROW("Statically allocated circular buffers in program {} clash with L1 buffers on core range {}. L1 buffer allocated at {} and static circular buffer region ends at {}", this->id, cb_allocator.core_range.str(), lowest_address.value(), cb_region_end); + TT_THROW( + "Statically allocated circular buffers in program {} clash with L1 buffers on core range {}. L1 buffer " + "allocated at {} and static circular buffer region ends at {}", + this->id, + cb_allocator.core_range.str(), + lowest_address.value(), + cb_region_end); } } } -size_t Program::num_semaphores(const CoreCoord &core) const { - return semaphores_on_core(core).size(); -} +size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); } -size_t Program::num_semaphores() const { - return semaphores_.size(); -} +size_t Program::num_semaphores() const { return semaphores_.size(); } -void Program::init_semaphores( const Device & device, const CoreCoord &logical_core, const CoreType core_type) const{ +void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, const CoreType core_type) const { auto semaphores_on_core = this->semaphores_on_core(logical_core); for (auto semaphore : semaphores_on_core) { - llrt::write_hex_vec_to_core(device.id(), device.physical_core_from_logical_core(logical_core, core_type), {semaphore.get().initial_value()}, semaphore.get().address()); + llrt::write_hex_vec_to_core( + device.id(), + device.physical_core_from_logical_core(logical_core, core_type), + {semaphore.get().initial_value()}, + semaphore.get().address()); } } -void Program::add_semaphore(const CoreRangeSet & crs, uint32_t address, uint32_t init_value, CoreType core_type) { +void Program::add_semaphore(const CoreRangeSet &crs, uint32_t address, uint32_t init_value, CoreType core_type) { this->invalidate_compile(); - semaphores_.emplace_back(Semaphore( crs, address, init_value, core_type)); + semaphores_.emplace_back(Semaphore(crs, address, init_value, core_type)); } -void Program::add_config_buffer(std::shared_ptr config_buffer) { - this->invalidate_compile(); - config_buffers_.emplace_back(config_buffer); -} +void Program::add_config_buffer(std::shared_ptr config_buffer) { config_buffers_.emplace_back(config_buffer); } std::unordered_map> Program::logical_cores() const { std::unordered_map> cores_in_program; std::unordered_map> unique_cores; - for (auto [core_type, kernels] : kernels_){ + for (auto [core_type, kernels] : kernels_) { if (cores_in_program.find(core_type) == cores_in_program.end()) { cores_in_program.insert({core_type, {}}); } @@ -546,21 +550,21 @@ std::unordered_map> Program::logical_cores() co void Program::construct_core_range_set_for_worker_cores() { bool found_kernels = false; - for (auto [id, kernel] : kernels_[CoreType::WORKER]){ - this->worker_crs_ = this->worker_crs_.merge ( kernel->core_range_set() ); + for (auto [id, kernel] : kernels_[CoreType::WORKER]) { + this->worker_crs_ = this->worker_crs_.merge(kernel->core_range_set()); found_kernels = true; } TT_ASSERT(!found_kernels || this->worker_crs_.ranges().size() >= 1, "Invalid core range set"); } -void Program::set_cb_data_fmt( - Device *device, const std::vector & crs, JitBuildOptions &build_options) const { +void Program::set_cb_data_fmt(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { ZoneScoped; for (auto logical_cr : crs) { auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr); for (auto circular_buffer : cbs_on_core) { for (auto buffer_index : circular_buffer->buffer_indices()) { - build_options.set_cb_dataformat_all_cores(static_cast(buffer_index), circular_buffer->data_format(buffer_index)); + build_options.set_cb_dataformat_all_cores( + static_cast(buffer_index), circular_buffer->data_format(buffer_index)); } } } @@ -572,8 +576,7 @@ void Program::invalidate_compile() { } } - -void Program::populate_dispatch_data(Device* device) { +void Program::populate_dispatch_data(Device *device) { static const map processor_to_local_mem_addr = { {RISCV::BRISC, MEM_BRISC_INIT_LOCAL_L1_BASE}, {RISCV::NCRISC, MEM_NCRISC_INIT_LOCAL_L1_BASE}, @@ -583,10 +586,10 @@ void Program::populate_dispatch_data(Device* device) { {RISCV::ERISC, eth_l1_mem::address_map::FIRMWARE_BASE}}; auto extract_dst_noc_unicast_info = - [&device](const set& ranges, const CoreType core_type) -> vector> { + [&device](const set &ranges, const CoreType core_type) -> vector> { // This API extracts all the pairs of noc multicast encodings given a set of core ranges vector> dst_noc_unicast_info; - for (const CoreRange& core_range : ranges) { + for (const CoreRange &core_range : ranges) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { for (auto y = core_range.start.y; y <= core_range.end.y; y++) { CoreCoord physical_coord = device->physical_core_from_logical_core(CoreCoord({x, y}), core_type); @@ -600,13 +603,15 @@ void Program::populate_dispatch_data(Device* device) { }; // Unicast/Multicast Semaphores - for (const Semaphore& semaphore : this->semaphores()) { + for (const Semaphore &semaphore : this->semaphores()) { vector semaphore_data(1); semaphore_data[0] = semaphore.initial_value(); // TODO: use semaphore.core_type from main if (semaphore.core_type() == CoreType::WORKER) { - vector> dst_noc_multicast_info = extract_dst_noc_multicast_info>(device, semaphore.core_range_set().ranges(), semaphore.core_type()); + vector> dst_noc_multicast_info = + extract_dst_noc_multicast_info>( + device, semaphore.core_range_set().ranges(), semaphore.core_type()); transfer_info_2 transfer_info = { .dst_base_addr = semaphore.address(), .dst_noc_info = dst_noc_multicast_info, @@ -614,7 +619,8 @@ void Program::populate_dispatch_data(Device* device) { .data = semaphore_data}; this->program_transfer_info.multicast_semaphores[semaphore.address()].push_back(transfer_info); } else if (semaphore.core_type() == CoreType::ETH) { - vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(semaphore.core_range_set().ranges(), semaphore.core_type()); + vector> dst_noc_unicast_info = + extract_dst_noc_unicast_info(semaphore.core_range_set().ranges(), semaphore.core_type()); transfer_info_2 transfer_info = { .dst_base_addr = semaphore.address(), .dst_noc_info = dst_noc_unicast_info, @@ -629,9 +635,9 @@ void Program::populate_dispatch_data(Device* device) { // Assume here and in command queue that kg_buffers is populated with multicast buffers first then unicast buffers // Program Binaries and Go Signals // TODO: cleanup put the WORKERS and ETH logic together.. - for (KernelGroup& kernel_group : this->get_kernel_groups(CoreType::WORKER)) { - vector> dst_noc_multicast_info = - extract_dst_noc_multicast_info>(device, kernel_group.core_ranges.ranges(), kernel_group.get_core_type()); + for (KernelGroup &kernel_group : this->get_kernel_groups(CoreType::WORKER)) { + vector> dst_noc_multicast_info = extract_dst_noc_multicast_info>( + device, kernel_group.core_ranges.ranges(), kernel_group.get_core_type()); // So far, we don't support linking optimizations for kernel groups // which use multiple core ranges @@ -654,10 +660,10 @@ void Program::populate_dispatch_data(Device* device) { } uint32_t sub_kernel_index = 0; - const auto& binaries = kernel->binaries(device->build_key()); + const auto &binaries = kernel->binaries(device->build_key()); for (size_t j = 0; j < binaries.size(); j++) { - const ll_api::memory& kernel_bin = binaries[j]; + const ll_api::memory &kernel_bin = binaries[j]; uint32_t k = 0; uint32_t num_spans = kernel_bin.num_spans(); @@ -699,7 +705,7 @@ void Program::populate_dispatch_data(Device* device) { } } } - for (KernelGroup& kernel_group : this->get_kernel_groups(CoreType::ETH)) { + for (KernelGroup &kernel_group : this->get_kernel_groups(CoreType::ETH)) { vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), kernel_group.get_core_type()); @@ -717,9 +723,9 @@ void Program::populate_dispatch_data(Device* device) { } uint32_t sub_kernel_index = 0; - const auto& binaries = kernel->binaries(device->build_key()); + const auto &binaries = kernel->binaries(device->build_key()); for (size_t j = 0; j < binaries.size(); j++) { - const ll_api::memory& kernel_bin = binaries[j]; + const ll_api::memory &kernel_bin = binaries[j]; uint32_t k = 0; uint32_t num_spans = kernel_bin.num_spans(); @@ -768,8 +774,7 @@ void Program::populate_dispatch_data(Device* device) { return; } -void Program::compile( Device * device ) -{ +void Program::compile(Device *device) { ZoneScoped; bool first_compile_on_device = compile_needed_.find(device->id()) == compile_needed_.end(); if (not first_compile_on_device and (not compile_needed_.at(device->id()))) { @@ -787,43 +792,46 @@ void Program::compile( Device * device ) DprintServerSetProfilerState(profile_kernel); for (auto &[core_type, kernels] : kernels_) { - for (auto &[id, kernel]: kernels) { - launch_build_step([kernel, device, this] { - JitBuildOptions build_options(device->build_env()); - kernel->set_common_runtime_args_offset(); - kernel->set_build_options(build_options); - this->set_cb_data_fmt(device, kernel->logical_coreranges(), build_options); - - auto kernel_hash = KernelCompileHash(kernel, build_options, device->build_key()); - std::string kernel_path_suffix = kernel->name() + "/" + std::to_string(kernel_hash) + "/"; - kernel->set_full_name(kernel_path_suffix); - build_options.set_name(kernel_path_suffix); - bool cache_hit = true; - bool path_exists = std::filesystem::exists(build_options.path); - if (enable_persistent_kernel_cache && path_exists) { - if (not detail::HashLookup::inst().exists(kernel_hash)) { - detail::HashLookup::inst().add(kernel_hash); + for (auto &[id, kernel] : kernels) { + launch_build_step( + [kernel, device, this] { + JitBuildOptions build_options(device->build_env()); + kernel->set_common_runtime_args_offset(); + kernel->set_build_options(build_options); + this->set_cb_data_fmt(device, kernel->logical_coreranges(), build_options); + + auto kernel_hash = KernelCompileHash(kernel, build_options, device->build_key()); + std::string kernel_path_suffix = kernel->name() + "/" + std::to_string(kernel_hash) + "/"; + kernel->set_full_name(kernel_path_suffix); + build_options.set_name(kernel_path_suffix); + bool cache_hit = true; + bool path_exists = std::filesystem::exists(build_options.path); + if (enable_persistent_kernel_cache && path_exists) { + if (not detail::HashLookup::inst().exists(kernel_hash)) { + detail::HashLookup::inst().add(kernel_hash); + detail::HashLookup::inst().add_generated_bin(kernel_hash); + } + } else if (detail::HashLookup::inst().add(kernel_hash)) { + GenerateBinaries(device, build_options, kernel); + cache_hit = false; detail::HashLookup::inst().add_generated_bin(kernel_hash); } - } else if (detail::HashLookup::inst().add(kernel_hash)) { - GenerateBinaries(device, build_options, kernel); - cache_hit = false; - detail::HashLookup::inst().add_generated_bin(kernel_hash); - } - while (not detail::HashLookup::inst().is_bin_generated(kernel_hash)) { - } - if (detail::CompilationReporter::enabled()) { - detail::CompilationReporter::inst().add_kernel_compile_stats(*this, kernel, cache_hit, kernel_hash); - } - kernel->set_binary_path(build_options.path); - }, events); + while (not detail::HashLookup::inst().is_bin_generated(kernel_hash)) { + } + if (detail::CompilationReporter::enabled()) { + detail::CompilationReporter::inst().add_kernel_compile_stats( + *this, kernel, cache_hit, kernel_hash); + } + kernel->set_binary_path(build_options.path); + }, + events); } } sync_build_step(events); for (auto &[core_type, kernels] : kernels_) { for (auto &[id, kernel] : kernels) { - launch_build_step ([kernel, device] { kernel->read_binaries(device); }, events); + launch_build_step([kernel, device] { kernel->read_binaries(device); }, events); } } @@ -845,6 +853,5 @@ void Program::compile( Device * device ) this->loaded_onto_device = false; } -Program::~Program() { -} +Program::~Program() {} } // namespace tt::tt_metal