From dbd2bb6e8597efeeba4a9df0e3bc159a92486ff4 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Tue, 5 Nov 2024 11:07:34 +0000 Subject: [PATCH] clang for device --- device/.clang-format | 2 - device/architecture_implementation.cpp | 12 +- device/architecture_implementation.h | 7 +- .../blackhole/blackhole_coordinate_manager.h | 8 +- device/blackhole/blackhole_implementation.cpp | 14 +- device/blackhole/blackhole_implementation.h | 96 +- device/coordinate_manager.cpp | 26 +- device/coordinate_manager.h | 19 +- device/cpuset_lib.cpp | 489 ++-- device/cpuset_lib.hpp | 168 +- device/device_api_metal.h | 2 +- device/driver_atomics.h | 34 +- .../grayskull/grayskull_coordinate_manager.h | 6 +- device/grayskull/grayskull_implementation.cpp | 7 +- device/grayskull/grayskull_implementation.h | 43 +- device/ioctl.h | 142 +- device/mockup/tt_mockup_device.hpp | 37 +- device/pcie/pci_device.cpp | 284 +- device/pcie/pci_device.hpp | 82 +- .../deprecated/tt_emulation_device.cpp | 244 +- .../deprecated/tt_emulation_device.h | 123 +- .../deprecated/tt_emulation_stub.cpp | 105 +- .../deprecated/tt_versim_device.cpp | 435 +-- .../simulation/deprecated/tt_versim_device.h | 98 +- .../simulation/deprecated/tt_versim_stub.cpp | 119 +- device/simulation/tt_simulation_device.cpp | 125 +- device/simulation/tt_simulation_device.h | 33 +- device/simulation/tt_simulation_host.cpp | 19 +- device/simulation/tt_simulation_host.hpp | 3 +- device/tlb.h | 8 +- device/tt_arch_types.h | 2 +- device/tt_cluster_descriptor.cpp | 463 ++-- device/tt_cluster_descriptor.h | 173 +- device/tt_cluster_descriptor_types.h | 18 +- device/tt_device.cpp | 20 +- device/tt_device.h | 520 ++-- device/tt_io.hpp | 26 +- device/tt_silicon_driver.cpp | 2460 +++++++++++------ device/tt_silicon_driver_common.cpp | 28 +- device/tt_silicon_driver_common.hpp | 55 +- device/tt_soc_descriptor.cpp | 76 +- device/tt_soc_descriptor.h | 65 +- device/tt_xy_pair.h | 12 + .../wormhole/wormhole_coordinate_manager.cpp | 6 +- 
device/wormhole/wormhole_coordinate_manager.h | 8 +- device/wormhole/wormhole_implementation.cpp | 7 +- device/wormhole/wormhole_implementation.h | 40 +- device/xy_pair.h | 7 +- 48 files changed, 4205 insertions(+), 2571 deletions(-) delete mode 100644 device/.clang-format diff --git a/device/.clang-format b/device/.clang-format deleted file mode 100644 index 9d159247..00000000 --- a/device/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 7cd1dac8..186c6c14 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -12,10 +12,14 @@ namespace tt::umd { std::unique_ptr architecture_implementation::create(tt::ARCH architecture) { switch (architecture) { - case tt::ARCH::BLACKHOLE: return std::make_unique(); - case tt::ARCH::GRAYSKULL: return std::make_unique(); - case tt::ARCH::WORMHOLE_B0: return std::make_unique(); - default: return nullptr; + case tt::ARCH::BLACKHOLE: + return std::make_unique(); + case tt::ARCH::GRAYSKULL: + return std::make_unique(); + case tt::ARCH::WORMHOLE_B0: + return std::make_unique(); + default: + return nullptr; } } diff --git a/device/architecture_implementation.h b/device/architecture_implementation.h index 41767081..5f966255 100644 --- a/device/architecture_implementation.h +++ b/device/architecture_implementation.h @@ -12,15 +12,15 @@ #include #include "device/tlb.h" -#include "device/xy_pair.h" #include "device/tt_arch_types.h" +#include "device/xy_pair.h" struct tt_driver_host_address_params; namespace tt::umd { class architecture_implementation { - public: +public: virtual ~architecture_implementation() = default; virtual tt::ARCH get_architecture() const = 0; @@ -63,7 +63,8 @@ class architecture_implementation { virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) 
const = 0; virtual std::optional> describe_tlb(std::int32_t tlb_index) const = 0; - virtual std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0; + virtual std::pair get_tlb_data( + std::uint32_t tlb_index, const tlb_data& data) const = 0; virtual tt_driver_host_address_params get_host_address_params() const = 0; diff --git a/device/blackhole/blackhole_coordinate_manager.h b/device/blackhole/blackhole_coordinate_manager.h index 76f1ebc6..88d385ad 100644 --- a/device/blackhole/blackhole_coordinate_manager.h +++ b/device/blackhole/blackhole_coordinate_manager.h @@ -9,15 +9,15 @@ #include "device/coordinate_manager.h" class BlackholeCoordinateManager : public CoordinateManager { - public: - BlackholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + BlackholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask) override; }; \ No newline at end of file diff --git a/device/blackhole/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp index 91d80bc7..c3b39da8 100644 --- a/device/blackhole/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -4,9 +4,8 @@ #include "blackhole_implementation.h" -#include "src/firmware/riscv/blackhole/host_mem_address_map.h" - #include "device/tt_device.h" +#include "src/firmware/riscv/blackhole/host_mem_address_map.h" namespace tt::umd { @@ -22,10 +21,9 @@ std::tuple blackhole_implementation::multicast_workaround(xy_p } tlb_configuration 
blackhole_implementation::get_tlb_configuration(uint32_t tlb_index) const { - // If TLB index is in range for 4GB tlbs (8 TLBs after 202 TLBs for 2MB) if (tlb_index >= blackhole::TLB_COUNT_2M && tlb_index < blackhole::TLB_COUNT_2M + blackhole::TLB_COUNT_4G) { - return tlb_configuration { + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_4G_SIZE, .base = blackhole::DYNAMIC_TLB_4G_BASE, .cfg_addr = blackhole::DYNAMIC_TLB_4G_CFG_ADDR, @@ -33,7 +31,7 @@ tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_i .offset = blackhole::TLB_4G_OFFSET, }; } - + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_2M_SIZE, .base = blackhole::DYNAMIC_TLB_2M_BASE, @@ -69,17 +67,17 @@ std::optional> blackhole_implementation std::pair blackhole_implementation::get_tlb_data( std::uint32_t tlb_index, const tlb_data& data) const { - if (tlb_index < blackhole::TLB_COUNT_2M) { return data.apply_offset(blackhole::TLB_2M_OFFSET); } else { throw std::runtime_error("Invalid TLB index for Blackhole arch"); } - } tt_driver_host_address_params blackhole_implementation::get_host_address_params() const { - return {::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } } // namespace tt::umd diff --git a/device/blackhole/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h index 9a7799b7..2bb7c678 100644 --- a/device/blackhole/blackhole_implementation.h +++ b/device/blackhole/blackhole_implementation.h @@ -7,10 +7,10 @@ #pragma once #include +#include #include "device/architecture_implementation.h" #include "device/tlb.h" -#include namespace tt::umd { @@ -59,30 +59,8 @@ enum class arc_message_type { // DEVICE_DATA static constexpr std::array DRAM_LOCATIONS = { - {{0, 0}, - {0, 1}, - {0, 11}, - {0, 2}, - {0, 10}, - {0, 3}, - {0, 9}, - 
{0, 4}, - {0, 8}, - {0, 5}, - {0, 7}, - {0, 6}, - {9, 0}, - {9, 1}, - {9, 11}, - {9, 2}, - {9, 10}, - {9, 3}, - {9, 9}, - {9, 4}, - {9, 8}, - {9, 5}, - {9, 7}, - {9, 6}}}; + {{0, 0}, {0, 1}, {0, 11}, {0, 2}, {0, 10}, {0, 3}, {0, 9}, {0, 4}, {0, 8}, {0, 5}, {0, 7}, {0, 6}, + {9, 0}, {9, 1}, {9, 11}, {9, 2}, {9, 10}, {9, 3}, {9, 9}, {9, 4}, {9, 8}, {9, 5}, {9, 7}, {9, 6}}}; static constexpr std::array ARC_LOCATIONS = {{{8, 0}}}; static constexpr std::array PCI_LOCATIONS = {{{11, 0}}}; @@ -113,14 +91,14 @@ static constexpr uint32_t BROADCAST_TLB_INDEX = 0; // TODO: Copied from worm static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_COUNT_2M = 202; -static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 +static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 static constexpr uint32_t TLB_BASE_INDEX_2M = 0; static constexpr uint32_t TLB_2M_SIZE = 2 * 1024 * 1024; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 12; static constexpr uint32_t TLB_COUNT_4G = 8; -static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 +static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 static constexpr uint32_t TLB_BASE_INDEX_4G = TLB_COUNT_2M; static constexpr uint64_t TLB_4G_SIZE = 4ULL * 1024ULL * 1024ULL * 1024ULL; static constexpr uint64_t DYNAMIC_TLB_4G_SIZE = TLB_4G_SIZE; @@ -168,59 +146,108 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97; } // namespace blackhole class blackhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(blackhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(blackhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_LONG_IDLE); } + 
uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(blackhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(blackhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(blackhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(blackhole::arc_message_type::TEST); } - uint32_t get_arc_csm_mailbox_offset() const override { throw std::runtime_error("Not supported for Blackhole arch"); return 0; } + + uint32_t get_arc_csm_mailbox_offset() const override { + throw std::runtime_error("Not supported for Blackhole arch"); + return 0; + } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return blackhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return blackhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return blackhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return blackhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return blackhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return blackhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return blackhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return blackhole::DYNAMIC_TLB_2M_SIZE; } - uint32_t get_dynamic_tlb_16m_base() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_size() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - 
uint32_t get_dynamic_tlb_16m_cfg_addr() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_dynamic_tlb_16m_base() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_size() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_mem_large_read_tlb() const override { return blackhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return blackhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return blackhole::STATIC_TLB_CFG_ADDR; } - uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + + uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; } - uint32_t get_tlb_base_index_16m() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_tlb_base_index_16m() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_tensix_soft_reset_addr() const override { return blackhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return blackhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return blackhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return blackhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return blackhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return blackhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return 
blackhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -229,7 +256,6 @@ class blackhole_implementation : public architecture_implementation { std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; tt_driver_host_address_params get_host_address_params() const override; - }; } // namespace tt::umd diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp index 438e002f..de6a7649 100644 --- a/device/coordinate_manager.cpp +++ b/device/coordinate_manager.cpp @@ -1,5 +1,7 @@ #include "device/coordinate_manager.h" + #include + #include "coordinate_manager.h" #include "grayskull/grayskull_coordinate_manager.h" @@ -66,13 +68,9 @@ void CoordinateManager::clear_harvesting_structures() { virtual_y_to_logical_y.clear(); } -std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } -std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { clear_harvesting_structures(); @@ -99,14 +97,16 @@ void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { logical_x_to_virtual_x.resize(grid_size_x - num_harvested_x); logical_y_to_virtual_y.resize(grid_size_y - num_harvested_y); - fill_logical_to_physical_mapping(x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); + fill_logical_to_physical_mapping( + x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); fill_logical_to_virtual_mapping(physical_x_unharvested, 
physical_y_unharvested); } void CoordinateManager::fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, - const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { - + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); std::size_t logical_y = 0; for (size_t y = 0; y < worker_grid_size.y; y++) { @@ -125,7 +125,7 @@ void CoordinateManager::fill_logical_to_physical_mapping( auto physical_x_it = physical_x_unharvested.begin(); std::size_t logical_x = 0; - for(std::size_t x = 0; x < worker_grid_size.x; x++) { + for (std::size_t x = 0; x < worker_grid_size.x; x++) { if (x_to_harvest.find(x) == x_to_harvest.end()) { logical_x_to_physical_x[logical_x] = *physical_x_it; if (physical_x_to_logical_x.find(*physical_x_it) != physical_x_to_logical_x.end()) { @@ -140,7 +140,8 @@ void CoordinateManager::fill_logical_to_physical_mapping( } } -void CoordinateManager::fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { +void CoordinateManager::fill_logical_to_virtual_mapping( + const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); for (std::size_t y = 0; y < logical_y_to_virtual_y.size(); y++) { logical_y_to_virtual_y[y] = *physical_y_it; @@ -171,7 +172,6 @@ std::unique_ptr CoordinateManager::get_coordinate_manager( const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) { - switch (arch) { case tt::ARCH::GRAYSKULL: return std::make_unique(worker_grid_size, workers, harvesting_mask); diff --git a/device/coordinate_manager.h b/device/coordinate_manager.h index a71764df..967e5237 100644 --- a/device/coordinate_manager.h +++ b/device/coordinate_manager.h @@ -7,17 +7,17 @@ #pragma once #include 
-#include #include +#include -#include "device/tt_xy_pair.h" #include "device/tt_arch_types.h" +#include "device/tt_xy_pair.h" class CoordinateManager { - public: - CoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} + CoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} virtual void perform_harvesting(std::size_t harvesting_mask); @@ -49,14 +49,17 @@ class CoordinateManager { protected: virtual void clear_harvesting_structures(); - + virtual std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask); virtual std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask); virtual void fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested); + virtual void fill_logical_to_virtual_mapping( const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); - virtual void fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); std::map physical_y_to_logical_y; std::map physical_x_to_logical_x; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 00f82a46..1bbbb291 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -2,17 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "cpuset_lib.hpp" + #include +#include +#include -#include "cpuset_lib.hpp" #include "common/logger.hpp" -#include #include "device/tt_device.h" -#include #include "fmt/core.h" + namespace tt { namespace fs = std::filesystem; + namespace cpuset { 
///////////////////////////////////////////////////////////////////////// @@ -21,15 +24,18 @@ namespace cpuset { // Constructor for singleton class cpu id allocator tt_cpuset_allocator::tt_cpuset_allocator() { - - m_pid = getpid(); - m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; + m_pid = getpid(); + m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? true : false; auto system_tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", + m_pid, + system_tid); m_enable_cpuset_allocator = true; @@ -38,86 +44,102 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_enable_cpuset_allocator &= init_get_number_of_packages(); m_enable_cpuset_allocator &= init_find_tt_pci_devices_packages_numanodes(); - if (!cpuset_allocator_enable_env){ + if (!cpuset_allocator_enable_env) { m_enable_cpuset_allocator = false; - }else{ - - bool is_cpu_supported = init_is_cpu_model_supported(); + } else { + bool is_cpu_supported = init_is_cpu_model_supported(); - if (is_cpu_supported){ + if (is_cpu_supported) { m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - }else{ + } else { m_enable_cpuset_allocator = false; } - log_debug(LogSiliconDriver,"Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} thread_id: {} ", m_enable_cpuset_allocator, m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} " + "thread_id: {} ", + m_enable_cpuset_allocator, + m_pid, + system_tid); } } // Step 1 : Initialize and perform 
m_topology detection -bool tt_cpuset_allocator::init_topology_init_and_load(){ - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::topology_init_and_load()"); +bool tt_cpuset_allocator::init_topology_init_and_load() { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::topology_init_and_load()"); - if (!m_enable_cpuset_allocator){ + if (!m_enable_cpuset_allocator) { return false; } - if (hwloc_topology_init(&m_topology)){ + if (hwloc_topology_init(&m_topology)) { log_warning(LogSiliconDriver, "Problem initializing topology"); return false; } - hwloc_topology_set_type_filter(m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. + hwloc_topology_set_type_filter( + m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. - if (hwloc_topology_load(m_topology)){ + if (hwloc_topology_load(m_topology)) { log_warning(LogSiliconDriver, "Problem loading topology"); return false; } - return true; // Success + return true; // Success } -// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and numamode. -bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ - - if (!m_enable_cpuset_allocator){ +// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and +// numamode. 
+bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); + log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); m_num_tt_device_by_pci_device_id_map.clear(); hwloc_obj_t pci_device_obj = NULL; const std::regex tt_device_re("tenstorrent!([0-9]+)"); - while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))){ - - if (hwloc_obj_type_is_io(pci_device_obj->type) && (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { - - std::pair device_id_revision = std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); + while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))) { + if (hwloc_obj_type_is_io(pci_device_obj->type) && + (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { + std::pair device_id_revision = + std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); m_num_tt_device_by_pci_device_id_map[device_id_revision] += 1; - std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); + std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); std::string pci_device_dir = fmt::format("/sys/bus/pci/devices/{}/tenstorrent/", pci_bus_id_str); int physical_device_id = -1; - log_trace(LogSiliconDriver, "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", pci_bus_id_str, m_num_tt_device_by_pci_device_id_map[device_id_revision]); + log_trace( + LogSiliconDriver, + "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", + pci_bus_id_str, + m_num_tt_device_by_pci_device_id_map[device_id_revision]); // First, get the physical_device_id of the device. 
- if (fs::exists(pci_device_dir)){ - for (const auto &entry : fs::directory_iterator(pci_device_dir)){ + if (fs::exists(pci_device_dir)) { + for (const auto &entry : fs::directory_iterator(pci_device_dir)) { auto entry_str = entry.path().string(); - if (std::smatch device_match; std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)){ + if (std::smatch device_match; + std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)) { physical_device_id = stoi(device_match[1]); m_all_tt_devices.push_back(physical_device_id); - log_debug(LogSiliconDriver, "Found physical_device_id: {} from file: {}", physical_device_id, entry_str); + log_debug( + LogSiliconDriver, + "Found physical_device_id: {} from file: {}", + physical_device_id, + entry_str); break; } } - if (physical_device_id == -1){ - log_warning(LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); + if (physical_device_id == -1) { + log_warning( + LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); return false; } @@ -125,19 +147,23 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ // Next, get the PackageID of the device and update maps. auto package_id = get_package_id_from_device(pci_device_obj, physical_device_id); - - // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this + + // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this // package and structures storing the CPU characteristics per package. 
if (m_package_id_to_devices_map.find(package_id) == m_package_id_to_devices_map.end()) { m_package_id_to_devices_map.insert({package_id, {}}); m_package_id_to_num_l3_per_ccx_map.insert({package_id, 0}); m_package_id_to_num_ccx_per_ccd_map.insert({package_id, 0}); } - if (package_id != -1){ + if (package_id != -1) { m_package_id_to_devices_map.at(package_id).push_back(physical_device_id); m_physical_device_id_to_package_id_map.insert({physical_device_id, package_id}); } else { - log_warning(LogSiliconDriver, "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + log_warning( + LogSiliconDriver, + "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } @@ -145,378 +171,479 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ auto numa_nodeset = get_numa_nodeset_from_device(pci_device_obj, physical_device_id); m_physical_device_id_to_numa_nodeset_map.insert({physical_device_id, numa_nodeset}); - if (numa_nodeset == 0x0){ - log_warning(LogSiliconDriver, "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + if (numa_nodeset == 0x0) { + log_warning( + LogSiliconDriver, + "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } - m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. + m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. 
m_num_cpu_cores_allocated_per_tt_device.insert({physical_device_id, 0}); } } } - if (m_all_tt_devices.size() == 0){ - log_warning(LogSiliconDriver, "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", TENSTORRENT_VENDOR_ID); + if (m_all_tt_devices.size() == 0) { + log_warning( + LogSiliconDriver, + "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", + TENSTORRENT_VENDOR_ID); return false; } - log_debug(LogSiliconDriver,"Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", m_all_tt_devices.size()); - + log_debug( + LogSiliconDriver, + "Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", + m_all_tt_devices.size()); // Sort these 2 vectors of device_ids before we are done, since discovery can be in any order. - for (auto &p: m_package_id_to_devices_map){ + for (auto &p : m_package_id_to_devices_map) { std::sort(p.second.begin(), p.second.end()); } std::sort(m_all_tt_devices.begin(), m_all_tt_devices.end()); - return true; // Success + return true; // Success } - // Step 3 : Detect the number of packages. -bool tt_cpuset_allocator::init_get_number_of_packages(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_get_number_of_packages() { + if (!m_enable_cpuset_allocator) { return false; } m_num_packages = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_PACKAGE); - log_debug(LogSiliconDriver,"Found {} CPU packages", m_num_packages); - return m_num_packages > 0; // Success + log_debug(LogSiliconDriver, "Found {} CPU packages", m_num_packages); + return m_num_packages > 0; // Success } // Step 4 : Return true if all packages are models we want to support. Env-var can be used to ignore this check. 
-bool tt_cpuset_allocator::init_is_cpu_model_supported(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_is_cpu_model_supported() { + if (!m_enable_cpuset_allocator) { return false; } - if (m_num_packages == 0){ - log_debug(LogSiliconDriver,"init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); + if (m_num_packages == 0) { + log_debug(LogSiliconDriver, "init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); return false; } bool use_any_cpu = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SUPPORT_ANY_CPU") ? true : false; - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); // Supported CPU Models for enabling CPUSET Allocator. Keep the list small to production machines to start. - std::vector supported_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; + std::vector supported_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; // CPU Models that have L3 per CCX and 2 CCX per CCD - std::vector opt_2ccx_per_ccd_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; - for(const auto& package: m_package_id_to_devices_map) { + std::vector opt_2ccx_per_ccd_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } std::string pkg_cpu_model = hwloc_obj_get_info_by_name(package_obj, "CPUModel"); // First find out if this CPU is supported by CPUSET Allocator at all. bool has_supported_cpu = use_any_cpu ? 
true : false; - for (auto &supported_cpu_model : supported_cpu_models){ + for (auto &supported_cpu_model : supported_cpu_models) { has_supported_cpu |= (pkg_cpu_model.find(supported_cpu_model) != std::string::npos); } - log_debug(LogSiliconDriver,"Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", package_id, has_supported_cpu, pkg_cpu_model); + log_debug( + LogSiliconDriver, + "Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", + package_id, + has_supported_cpu, + pkg_cpu_model); - if (!has_supported_cpu){ + if (!has_supported_cpu) { return false; } // Then, determine if the 2CCX-PER-CCD optimization can be enabled for this CPU Model in the package. - for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models){ - if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos){ + for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models) { + if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos) { m_package_id_to_num_l3_per_ccx_map.at(package_id) = 1; m_package_id_to_num_ccx_per_ccd_map.at(package_id) = 2; } } } - return true; // Successhwloc + return true; // Successhwloc } - -// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given socket/package. -bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ - - if (!m_enable_cpuset_allocator){ +// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given +// socket/package. 
+bool tt_cpuset_allocator::init_determine_cpuset_allocations() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); - for (const auto& package : m_package_id_to_devices_map) { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto num_tt_devices_for_cpu_package = package.second.size(); - if (num_tt_devices_for_cpu_package == 0){ - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", package_id); + if (num_tt_devices_for_cpu_package == 0) { + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", + package_id); continue; } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
starting to detect allocation slots for package_id: {} ", + package_id); auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } - auto num_alloc_slots_in_package = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); - if (num_alloc_slots_in_package == 0){ - log_warning(LogSiliconDriver, "Could not find any of the alloc objects in package_id: {} for this cpu arc", package_id); + auto num_alloc_slots_in_package = + hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); + if (num_alloc_slots_in_package == 0) { + log_warning( + LogSiliconDriver, + "Could not find any of the alloc objects in package_id: {} for this cpu arc", + package_id); return false; } auto num_alloc_slots_per_tt_device = num_alloc_slots_in_package / num_tt_devices_for_cpu_package; // Above splits evenly by devices, leaves remainder unused in the example case of 3 devices but 8 slots. - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", - package_id, num_alloc_slots_in_package, num_tt_devices_for_cpu_package, num_alloc_slots_per_tt_device); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
package_id: {} num_alloc_slots_in_package: {} " + "num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", + package_id, + num_alloc_slots_in_package, + num_tt_devices_for_cpu_package, + num_alloc_slots_per_tt_device); int device_idx = 0; - for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++){ + for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++) { + auto obj = hwloc_get_obj_below_by_type( + m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - auto obj = hwloc_get_obj_below_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - - if (obj){ - if (m_debug) print_hwloc_object(obj, 1, true); + if (obj) { + if (m_debug) { + print_hwloc_object(obj, 1, true); + } auto physical_device_id = m_package_id_to_devices_map.at(package_id).at(device_idx); // Hack for maximum number of slots per device. // if (m_physical_device_id_to_cpusets_map.at(physical_device_id).size() < 2){ m_physical_device_id_to_cpusets_map.at(physical_device_id).push_back(obj->cpuset); - int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology,obj->cpuset,HWLOC_OBJ_CORE); + int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, obj->cpuset, HWLOC_OBJ_CORE); m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) += num_cpus; // } // We're distributing allocation objects per package across TT devices, so switch to next one. - if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0){ - device_idx = (device_idx + 1) % num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to first device for that package. + if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0) { + device_idx = (device_idx + 1) % + num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to + // first device for that package. } - }else{ - log_warning(LogSiliconDriver, "init_determine_cpuset_allocations(). 
Something went wrong looking for cpuset alloc object under package"); + } else { + log_warning( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under " + "package"); return false; } } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", + package_id); } - // Summary for Debug purposes. - for (auto &physical_device_id : m_all_tt_devices){ - for (size_t device_alloc_idx=0; device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); device_alloc_idx++){ + for (auto &physical_device_id : m_all_tt_devices) { + for (size_t device_alloc_idx = 0; + device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); + device_alloc_idx++) { auto cpuset = m_physical_device_id_to_cpusets_map.at(physical_device_id).at(device_alloc_idx); auto pu_ids_vector = get_hwloc_bitmap_vector(cpuset); auto num_pu_ids = pu_ids_vector.size(); auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} device_alloc_idx: {} picked {} PU's {}", physical_device_id, package_id, device_alloc_idx, num_pu_ids, pu_ids_vector); + log_debug( + LogSiliconDriver, + "Done init_determine_cpuset_allocations(). 
Summary => for mmio physical_device_id: {} package_id: {} " + "device_alloc_idx: {} picked {} PU's {}", + physical_device_id, + package_id, + device_alloc_idx, + num_pu_ids, + pu_ids_vector); } } - return true; // Success - + return true; // Success } ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. -bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - +// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously +// allocated memory region to it. +bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: {} (pid: {} tid: {})", physical_device_id, m_pid, tid); - - if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0){ - log_fatal("bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not expected.", physical_device_id); + log_debug( + LogSiliconDriver, + "bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: " + "{} (pid: {} tid: {})", + physical_device_id, + m_pid, + tid); + + if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0) { + log_fatal( + "bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not " + "expected.", + physical_device_id); return false; } auto target_nodeset = 
m_physical_device_id_to_numa_nodeset_map.at(physical_device_id); - if (target_nodeset != 0){ - if (hwloc_set_area_membind(m_topology, addr, len, target_nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE) ){ - log_warning(LogSiliconDriver,"hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} tid: {})", - physical_device_id, get_hwloc_bitmap_vector(target_nodeset), strerror(errno), m_pid, tid); + if (target_nodeset != 0) { + if (hwloc_set_area_membind( + m_topology, + addr, + len, + target_nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE)) { + log_warning( + LogSiliconDriver, + "hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} " + "tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + strerror(errno), + m_pid, + tid); return false; - }else{ - log_debug(LogSiliconDriver,"hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", physical_device_id, get_hwloc_bitmap_vector(target_nodeset), m_pid, tid); + } else { + log_debug( + LogSiliconDriver, + "hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + m_pid, + tid); } - }else{ - log_warning(LogSiliconDriver,"bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. Skipping membind.", physical_device_id); + } else { + log_warning( + LogSiliconDriver, + "bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. 
" + "Skipping membind.", + physical_device_id); return false; } - return true; // Success + return true; // Success } int tt_cpuset_allocator::_get_num_tt_pci_devices() { - for (auto &d : m_physical_device_id_to_package_id_map) { log_trace(LogSiliconDriver, "Found physical_device_id: {} ", d.first); } return m_physical_device_id_to_package_id_map.size(); } - - - ///////////////////////////////////////////////////////////////////////// -//Helper Functions ////////////////////////////////////////////////////// +// Helper Functions ////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - -std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj){ - +std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj) { std::string pci_bus_id_str = ""; - if (hwloc_obj_type_is_io(pci_device_obj->type)) { + if (hwloc_obj_type_is_io(pci_device_obj->type)) { auto attrs = pci_device_obj->attr->pcidev; pci_bus_id_str = fmt::format("{:04x}:{:02x}:{:02x}.{:01x}", attrs.domain, attrs.bus, attrs.dev, attrs.func); } return pci_bus_id_str; - } -int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = hwloc_get_non_io_ancestor_obj(m_topology, pci_device_obj); int package_id = -1; // Keep going up until package/machine hierarchy is found, in case we don't find it right away. 
- while (package_id == -1){ - - if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)){ - if (tmp_obj->os_index != (unsigned) -1){ + while (package_id == -1) { + if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || + (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)) { + if (tmp_obj->os_index != (unsigned)-1) { package_id = tmp_obj->os_index; - }else{ - log_warning(LogSiliconDriver, "Could not find os_index of package or machine object for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "Could not find os_index of package or machine object for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); break; } - }else{ - if (tmp_obj->parent){ + } else { + if (tmp_obj->parent) { tmp_obj = tmp_obj->parent; - }else{ + } else { break; } } } - if (m_debug) print_hwloc_object(pci_device_obj, 1, true, true); - if (m_debug) print_hwloc_object(tmp_obj, 1, true, true); + if (m_debug) { + print_hwloc_object(pci_device_obj, 1, true, true); + } + if (m_debug) { + print_hwloc_object(tmp_obj, 1, true, true); + } return package_id; } -hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device( + hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { hwloc_nodeset_t nodeset = 0x0; // Currently an issue in non-EPYC machines where PCI devices are directly under Machine, and not any NumaNodes. // As quick workaround, skip this if there is only single numanode since returning 1 seems fine. 
- if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1){ + if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1) { auto numanode = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, 0); return numanode->nodeset; } auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding NumaNode.", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's " + "corresponding NumaNode.", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = pci_device_obj->parent; - while (tmp_obj && !tmp_obj->memory_arity){ + while (tmp_obj && !tmp_obj->memory_arity) { tmp_obj = tmp_obj->parent; /* no memory child, walk up */ } - if (tmp_obj && tmp_obj->nodeset){ - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found NumaNodeSet: {}", physical_device_id, pci_bus_id_str, get_hwloc_bitmap_vector(tmp_obj->nodeset)); + if (tmp_obj && tmp_obj->nodeset) { + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found " + "NumaNodeSet: {}", + physical_device_id, + pci_bus_id_str, + get_hwloc_bitmap_vector(tmp_obj->nodeset)); nodeset = tmp_obj->nodeset; - }else{ - log_warning(LogSiliconDriver, "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); } return nodeset; - } int 
tt_cpuset_allocator::_get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision) { - std::pair device_id_revision = std::make_pair(device_id, revision); if (m_num_tt_device_by_pci_device_id_map.find(device_id_revision) != m_num_tt_device_by_pci_device_id_map.end()) { return m_num_tt_device_by_pci_device_id_map.at(device_id_revision); } else { - log_warning(LogSiliconDriver, "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", device_id, revision); + log_warning( + LogSiliconDriver, + "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", + device_id, + revision); return 0; } } ///////////////////////////////////////////////////////////////////////// -//Debug Functions /////////////////////////////////////////////////////// +// Debug Functions /////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// // Get all PU ids (or numa nodes) in a vector, for legacy/back-compat/debug purposes. -std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap){ - +std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap) { std::vector indices; int index; - if (bitmap){ - hwloc_bitmap_foreach_begin(index, bitmap) - indices.push_back(index); + if (bitmap) { + hwloc_bitmap_foreach_begin(index, bitmap) indices.push_back(index); hwloc_bitmap_foreach_end(); } return indices; } -std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->cpuset); } -std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->nodeset); } - // Nicer way to print pu ids as a vector on single line. 
-void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj){ +void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj) { std::cout << " Number: " << hwloc_bitmap_weight(obj->cpuset) << " cpuset_pu_ids: " << get_hwloc_cpuset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj){ - std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); +void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj) { + std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) + << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids){ - +void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids) { char type[32], attr[1024]; hwloc_obj_type_snprintf(type, sizeof(type), obj, verbose); - printf("%*s%s", 2*depth, "", type); - if (obj->os_index != (unsigned) -1) + printf("%*s%s", 2 * depth, "", type); + if (obj->os_index != (unsigned)-1) { printf("#%u", obj->os_index); + } hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", verbose); - if (*attr) + if (*attr) { printf("(%s)", attr); - if (show_cpuids && obj->cpuset) + } + if (show_cpuids && obj->cpuset) { print_hwloc_cpuset(obj); + } printf("\n"); } - } // namespace cpuset } // namespace tt - diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index a14a4f33..46994833 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -4,18 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once +#include + #include -#include -#include #include +#include #include -#include - -#include "device/tt_cluster_descriptor.h" // For chip_id_t +#include +#include "device/tt_cluster_descriptor.h" // For chip_id_t #include "hwloc.h" using tt_cluster_description = tt_ClusterDescriptor; @@ -27,90 +26,87 @@ namespace cpuset { // CPU ID allocator for pinning threads to cpu_ids // 
It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { - public: - - tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; - void operator=(tt_cpuset_allocator const&) = delete; - - // Bind an already allocated memory region to particular numa nodes - static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - auto& instance = tt_cpuset_allocator::get(); - return instance.bind_area_memory_nodeset(physical_device_id, addr, len); - } - - static int get_num_tt_pci_devices(){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices(); - } - - static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - } - - private: - - static tt_cpuset_allocator& get() { - static tt_cpuset_allocator instance; - return instance; - } - - tt_cpuset_allocator(); - - int TENSTORRENT_VENDOR_ID = 0x1e52; - - bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - int _get_num_tt_pci_devices(); - int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - - // Series of init functions, must be called in this order. Seperated out to support - // early exit in case of errors. 
- bool init_topology_init_and_load(); - bool init_find_tt_pci_devices_packages_numanodes(); - bool init_get_number_of_packages(); - bool init_is_cpu_model_supported(); - bool init_determine_cpuset_allocations(); - - // Helper Functions - std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); - int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - - // Debug Functions - void print_hwloc_cpuset(hwloc_obj_t &obj); - void print_hwloc_nodeset(hwloc_obj_t &obj); - void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); - std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); - std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); - std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); - hwloc_topology_t m_topology; - bool m_debug; - pid_t m_pid; - - // Items calculated by parsing system info, used by allocation algorithm: - std::map> m_package_id_to_devices_map; - std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info - std::map, int> m_num_tt_device_by_pci_device_id_map; - - std::map> m_physical_device_id_to_cpusets_map; - std::map m_physical_device_id_to_package_id_map; - - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. - int m_num_packages = 0; - std::vector m_all_tt_devices = {}; - - hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default +public: + tt_cpuset_allocator(tt_cpuset_allocator const &) = delete; + void operator=(tt_cpuset_allocator const &) = delete; + + // Bind an already allocated memory region to particular numa nodes + static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { + auto &instance = tt_cpuset_allocator::get(); + return instance.bind_area_memory_nodeset(physical_device_id, addr, len); + } - // For 2CCX-PER-CCD Optimization detection. 
- std::map m_package_id_to_num_l3_per_ccx_map; - std::map m_package_id_to_num_ccx_per_ccd_map; + static int get_num_tt_pci_devices() { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices(); + } - // Memory Binding - std::map m_physical_device_id_to_numa_nodeset_map; + static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id) { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + } - // Helper for some dynamic multi-threading. - std::map m_num_cpu_cores_allocated_per_tt_device; +private: + static tt_cpuset_allocator &get() { + static tt_cpuset_allocator instance; + return instance; + } + tt_cpuset_allocator(); + + int TENSTORRENT_VENDOR_ID = 0x1e52; + + bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len); + int _get_num_tt_pci_devices(); + int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); + + // Series of init functions, must be called in this order. Seperated out to support + // early exit in case of errors. 
+ bool init_topology_init_and_load(); + bool init_find_tt_pci_devices_packages_numanodes(); + bool init_get_number_of_packages(); + bool init_is_cpu_model_supported(); + bool init_determine_cpuset_allocations(); + + // Helper Functions + std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); + int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + + // Debug Functions + void print_hwloc_cpuset(hwloc_obj_t &obj); + void print_hwloc_nodeset(hwloc_obj_t &obj); + void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); + std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); + std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); + std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); + hwloc_topology_t m_topology; + bool m_debug; + pid_t m_pid; + + // Items calculated by parsing system info, used by allocation algorithm: + std::map> m_package_id_to_devices_map; + std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info + std::map, int> m_num_tt_device_by_pci_device_id_map; + + std::map> m_physical_device_id_to_cpusets_map; + std::map m_physical_device_id_to_package_id_map; + + bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. + int m_num_packages = 0; + std::vector m_all_tt_devices = {}; + + hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default + + // For 2CCX-PER-CCD Optimization detection. + std::map m_package_id_to_num_l3_per_ccx_map; + std::map m_package_id_to_num_ccx_per_ccd_map; + + // Memory Binding + std::map m_physical_device_id_to_numa_nodeset_map; + + // Helper for some dynamic multi-threading. 
+ std::map m_num_cpu_cores_allocated_per_tt_device; }; template diff --git a/device/device_api_metal.h b/device/device_api_metal.h index 0fc7820c..c148a71a 100644 --- a/device/device_api_metal.h +++ b/device/device_api_metal.h @@ -5,5 +5,5 @@ */ #pragma once -#include "device/tt_device.h" #include "device/driver_atomics.h" +#include "device/tt_device.h" diff --git a/device/driver_atomics.h b/device/driver_atomics.h index cbf4f6c7..6ed52416 100644 --- a/device/driver_atomics.h +++ b/device/driver_atomics.h @@ -12,54 +12,44 @@ namespace tt_driver_atomics { #if defined(__x86_64__) || defined(__i386__) // Store-Any barrier. -static inline __attribute__((always_inline)) void sfence() { - _mm_sfence(); -} +static inline __attribute__((always_inline)) void sfence() { _mm_sfence(); } + // Load-Any barrier. -static inline __attribute__((always_inline)) void lfence() { - _mm_lfence(); -} +static inline __attribute__((always_inline)) void lfence() { _mm_lfence(); } + // Any-Any barrier. -static inline __attribute__((always_inline)) void mfence() { - _mm_mfence(); -} +static inline __attribute__((always_inline)) void mfence() { _mm_mfence(); } #elif defined(__ARM_ARCH) static inline __attribute__((always_inline)) void sfence() { // Full memory barrier (full system). ARM does not have a Store-Any barrier. // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } static inline __attribute__((always_inline)) void lfence() { // Load-Any barrier (full system) // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB LD" : : : "memory"); + asm volatile("DMB LD" : : : "memory"); } static inline __attribute__((always_inline)) void mfence() { // Full memory barrier (full system). 
// https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } #elif defined(__riscv) -static inline __attribute__((always_inline)) void sfence() { - asm volatile ("fence ow, ow" : : : "memory"); -} +static inline __attribute__((always_inline)) void sfence() { asm volatile("fence ow, ow" : : : "memory"); } -static inline __attribute__((always_inline)) void lfence() { - asm volatile ("fence ir, ir" : : : "memory"); -} +static inline __attribute__((always_inline)) void lfence() { asm volatile("fence ir, ir" : : : "memory"); } -static inline __attribute__((always_inline)) void mfence() { - asm volatile ("fence iorw, iorw" : : : "memory"); -} +static inline __attribute__((always_inline)) void mfence() { asm volatile("fence iorw, iorw" : : : "memory"); } #else #error "Unsupported architecture" #endif -} // namespace tt_driver_atomics \ No newline at end of file +} // namespace tt_driver_atomics \ No newline at end of file diff --git a/device/grayskull/grayskull_coordinate_manager.h b/device/grayskull/grayskull_coordinate_manager.h index f7f6720c..5be371cd 100644 --- a/device/grayskull/grayskull_coordinate_manager.h +++ b/device/grayskull/grayskull_coordinate_manager.h @@ -9,8 +9,8 @@ #include "device/coordinate_manager.h" class GrayskullCoordinateManager : public CoordinateManager { - public: - GrayskullCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + GrayskullCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} }; \ No newline at end of file diff --git a/device/grayskull/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp index 2b94d187..b14029ca 100644 --- 
a/device/grayskull/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -4,9 +4,8 @@ #include "grayskull_implementation.h" -#include "src/firmware/riscv/grayskull/host_mem_address_map.h" - #include "device/tt_device.h" +#include "src/firmware/riscv/grayskull/host_mem_address_map.h" namespace tt::umd { @@ -86,7 +85,9 @@ std::pair grayskull_implementation::get_tlb_data( } tt_driver_host_address_params grayskull_implementation::get_host_address_params() const { - return {::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } } // namespace tt::umd diff --git a/device/grayskull/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h index 35b4c78b..610361f2 100644 --- a/device/grayskull/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -104,7 +104,8 @@ enum class arc_message_type { }; // DEVICE_DATA -static const std::array DRAM_LOCATIONS = {{{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; +static const std::array DRAM_LOCATIONS = { + {{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; static const std::array ARC_LOCATIONS = {{{0, 2}}}; static const std::array PCI_LOCATIONS = {{{0, 4}}}; static const std::array ETH_LOCATIONS = {}; @@ -134,7 +135,8 @@ static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 8; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr 
uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -171,59 +173,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace grayskull class grayskull_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(grayskull::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(grayskull::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(grayskull::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(grayskull::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(grayskull::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(grayskull::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return grayskull::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return grayskull::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return grayskull::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return grayskull::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return grayskull::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return grayskull::DRAM_CHANNEL_0_Y; } + 
uint32_t get_broadcast_tlb_index() const override { return grayskull::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return grayskull::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return grayskull::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return grayskull::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return grayskull::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return grayskull::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return grayskull::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return grayskull::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return grayskull::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return grayskull::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return grayskull::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return grayskull::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return grayskull::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return grayskull::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return grayskull::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return grayskull::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -232,7 +268,6 @@ class 
grayskull_implementation : public architecture_implementation { std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; tt_driver_host_address_params get_host_address_params() const override; - }; } // namespace tt::umd diff --git a/device/ioctl.h b/device/ioctl.h index 60ec7b2f..a2e04980 100644 --- a/device/ioctl.h +++ b/device/ioctl.h @@ -7,151 +7,149 @@ #ifndef TTDRIVER_IOCTL_H_INCLUDED #define TTDRIVER_IOCTL_H_INCLUDED -#include #include +#include #define TENSTORRENT_DRIVER_VERSION 1 #define TENSTORRENT_IOCTL_MAGIC 0xFA -#define TENSTORRENT_IOCTL_GET_DEVICE_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 0) -#define TENSTORRENT_IOCTL_GET_HARVESTING _IO(TENSTORRENT_IOCTL_MAGIC, 1) -#define TENSTORRENT_IOCTL_QUERY_MAPPINGS _IO(TENSTORRENT_IOCTL_MAGIC, 2) -#define TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 3) -#define TENSTORRENT_IOCTL_FREE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 4) -#define TENSTORRENT_IOCTL_GET_DRIVER_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 5) -#define TENSTORRENT_IOCTL_RESET_DEVICE _IO(TENSTORRENT_IOCTL_MAGIC, 6) -#define TENSTORRENT_IOCTL_PIN_PAGES _IO(TENSTORRENT_IOCTL_MAGIC, 7) +#define TENSTORRENT_IOCTL_GET_DEVICE_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 0) +#define TENSTORRENT_IOCTL_GET_HARVESTING _IO(TENSTORRENT_IOCTL_MAGIC, 1) +#define TENSTORRENT_IOCTL_QUERY_MAPPINGS _IO(TENSTORRENT_IOCTL_MAGIC, 2) +#define TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 3) +#define TENSTORRENT_IOCTL_FREE_DMA_BUF _IO(TENSTORRENT_IOCTL_MAGIC, 4) +#define TENSTORRENT_IOCTL_GET_DRIVER_INFO _IO(TENSTORRENT_IOCTL_MAGIC, 5) +#define TENSTORRENT_IOCTL_RESET_DEVICE _IO(TENSTORRENT_IOCTL_MAGIC, 6) +#define TENSTORRENT_IOCTL_PIN_PAGES _IO(TENSTORRENT_IOCTL_MAGIC, 7) // For tenstorrent_mapping.mapping_id. These are not array indices. 
-#define TENSTORRENT_MAPPING_UNUSED 0 -#define TENSTORRENT_MAPPING_RESOURCE0_UC 1 -#define TENSTORRENT_MAPPING_RESOURCE0_WC 2 -#define TENSTORRENT_MAPPING_RESOURCE1_UC 3 -#define TENSTORRENT_MAPPING_RESOURCE1_WC 4 -#define TENSTORRENT_MAPPING_RESOURCE2_UC 5 -#define TENSTORRENT_MAPPING_RESOURCE2_WC 6 +#define TENSTORRENT_MAPPING_UNUSED 0 +#define TENSTORRENT_MAPPING_RESOURCE0_UC 1 +#define TENSTORRENT_MAPPING_RESOURCE0_WC 2 +#define TENSTORRENT_MAPPING_RESOURCE1_UC 3 +#define TENSTORRENT_MAPPING_RESOURCE1_WC 4 +#define TENSTORRENT_MAPPING_RESOURCE2_UC 5 +#define TENSTORRENT_MAPPING_RESOURCE2_WC 6 -#define TENSTORRENT_MAX_DMA_BUFS 8 +#define TENSTORRENT_MAX_DMA_BUFS 8 struct tenstorrent_get_device_info_in { - __u32 output_size_bytes; + __u32 output_size_bytes; }; struct tenstorrent_get_device_info_out { - __u32 output_size_bytes; - __u16 vendor_id; - __u16 device_id; - __u16 subsystem_vendor_id; - __u16 subsystem_id; - __u16 bus_dev_fn; // [0:2] function, [3:7] device, [8:15] bus - __u16 max_dma_buf_size_log2; - __u16 pci_domain; + __u32 output_size_bytes; + __u16 vendor_id; + __u16 device_id; + __u16 subsystem_vendor_id; + __u16 subsystem_id; + __u16 bus_dev_fn; // [0:2] function, [3:7] device, [8:15] bus + __u16 max_dma_buf_size_log2; + __u16 pci_domain; }; struct tenstorrent_get_device_info { - struct tenstorrent_get_device_info_in in; - struct tenstorrent_get_device_info_out out; + struct tenstorrent_get_device_info_in in; + struct tenstorrent_get_device_info_out out; }; struct tenstorrent_query_mappings_in { - __u32 output_mapping_count; - __u32 reserved; + __u32 output_mapping_count; + __u32 reserved; }; struct tenstorrent_mapping { - __u32 mapping_id; - __u32 reserved; - __u64 mapping_base; - __u64 mapping_size; + __u32 mapping_id; + __u32 reserved; + __u64 mapping_base; + __u64 mapping_size; }; struct tenstorrent_query_mappings_out { - struct tenstorrent_mapping mappings[0]; + struct tenstorrent_mapping mappings[0]; }; struct tenstorrent_query_mappings { - 
struct tenstorrent_query_mappings_in in; - struct tenstorrent_query_mappings_out out; + struct tenstorrent_query_mappings_in in; + struct tenstorrent_query_mappings_out out; }; struct tenstorrent_allocate_dma_buf_in { - __u32 requested_size; - __u8 buf_index; // [0,TENSTORRENT_MAX_DMA_BUFS) - __u8 reserved0[3]; - __u64 reserved1[2]; + __u32 requested_size; + __u8 buf_index; // [0,TENSTORRENT_MAX_DMA_BUFS) + __u8 reserved0[3]; + __u64 reserved1[2]; }; struct tenstorrent_allocate_dma_buf_out { - __u64 physical_address; - __u64 mapping_offset; - __u32 size; - __u32 reserved0; - __u64 reserved1[2]; + __u64 physical_address; + __u64 mapping_offset; + __u32 size; + __u32 reserved0; + __u64 reserved1[2]; }; struct tenstorrent_allocate_dma_buf { - struct tenstorrent_allocate_dma_buf_in in; - struct tenstorrent_allocate_dma_buf_out out; + struct tenstorrent_allocate_dma_buf_in in; + struct tenstorrent_allocate_dma_buf_out out; }; -struct tenstorrent_free_dma_buf_in { -}; +struct tenstorrent_free_dma_buf_in {}; -struct tenstorrent_free_dma_buf_out { -}; +struct tenstorrent_free_dma_buf_out {}; struct tenstorrent_free_dma_buf { - struct tenstorrent_free_dma_buf_in in; - struct tenstorrent_free_dma_buf_out out; + struct tenstorrent_free_dma_buf_in in; + struct tenstorrent_free_dma_buf_out out; }; struct tenstorrent_get_driver_info_in { - __u32 output_size_bytes; + __u32 output_size_bytes; }; struct tenstorrent_get_driver_info_out { - __u32 output_size_bytes; - __u32 driver_version; + __u32 output_size_bytes; + __u32 driver_version; }; struct tenstorrent_get_driver_info { - struct tenstorrent_get_driver_info_in in; - struct tenstorrent_get_driver_info_out out; + struct tenstorrent_get_driver_info_in in; + struct tenstorrent_get_driver_info_out out; }; struct tenstorrent_reset_device_in { - __u32 output_size_bytes; - __u32 flags; + __u32 output_size_bytes; + __u32 flags; }; struct tenstorrent_reset_device_out { - __u32 output_size_bytes; - __u32 result; + __u32 
output_size_bytes; + __u32 result; }; struct tenstorrent_reset_device { - struct tenstorrent_reset_device_in in; - struct tenstorrent_reset_device_out out; + struct tenstorrent_reset_device_in in; + struct tenstorrent_reset_device_out out; }; // tenstorrent_pin_pages_in.flags #define TENSTORRENT_PIN_PAGES_CONTIGUOUS 1 struct tenstorrent_pin_pages_in { - __u32 output_size_bytes; - __u32 flags; - __u64 virtual_address; - __u64 size; + __u32 output_size_bytes; + __u32 flags; + __u64 virtual_address; + __u64 size; }; struct tenstorrent_pin_pages_out { - __u64 physical_address; + __u64 physical_address; }; struct tenstorrent_pin_pages { - struct tenstorrent_pin_pages_in in; - struct tenstorrent_pin_pages_out out; + struct tenstorrent_pin_pages_in in; + struct tenstorrent_pin_pages_out out; }; #endif diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index bacfb832..86de3e29 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -13,27 +13,37 @@ #include "device/tt_device.h" class tt_MockupDevice : public tt_device { - public: +public: tt_MockupDevice(const std::string& sdesc_path) : tt_device(sdesc_path) { soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; } + virtual ~tt_MockupDevice() {} // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors() override { return soc_descriptor_per_chip; } + void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) override {} + void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) override {} + void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) override {} - void set_driver_eth_interface_params( - const tt_driver_eth_interface_params& eth_interface_params_) override {} + + void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) override 
{} + void start_device(const tt_device_params& device_params) override {} + void assert_risc_reset() override {} + void deassert_risc_reset() override {} + void deassert_risc_reset_at_core(tt_cxy_pair core) override {} + void assert_risc_reset_at_core(tt_cxy_pair core) override {} + void close_device() override {} // Runtime Functions @@ -43,10 +53,13 @@ class tt_MockupDevice : public tt_device { tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) override {} + void read_from_device( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) override {} + void write_to_sysmem( const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) override {} + void read_from_sysmem( void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) override {} @@ -54,10 +67,12 @@ class tt_MockupDevice : public tt_device { const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, @@ -66,27 +81,35 @@ class tt_MockupDevice : public tt_device { void wait_for_non_mmio_flush() override {} // Misc. 
Functions to Query/Set Device State - std::unordered_map get_harvesting_masks_for_soc_descriptors() override { - return {{0, 0}}; - } + std::unordered_map get_harvesting_masks_for_soc_descriptors() override { return {{0, 0}}; } + static std::vector detect_available_device_ids() { return {0}; }; + std::set get_target_remote_device_ids() override { return target_remote_chips; } + std::map get_clocks() override { return {{0, 0}}; } + void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const override { return nullptr; } + std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; } + std::uint32_t get_num_dram_channels(std::uint32_t device_id) override { return get_soc_descriptor(device_id).get_num_dram_channels(); }; + std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return get_soc_descriptor(device_id).dram_bank_size; } + std::uint32_t get_num_host_channels(std::uint32_t device_id) override { return 1; } + std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return 0; } + std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) override { return 0; } - private: +private: std::vector archs_in_cluster = {}; std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 4cd3ab79..7f5627b2 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -4,24 +4,24 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include "pci_device.hpp" + +#include // for ::open +#include // for PCI_SLOT, PCI_FUNC +#include // for ioctl +#include // for mmap, munmap +#include // for ::close + #include -#include // for memcpy +#include // for memcpy #include -#include // for ::open -#include // for ::close -#include // for ioctl -#include // for mmap, munmap -#include // for PCI_SLOT, PCI_FUNC - -#include "pci_device.hpp" 
-#include "ioctl.h" -#include "ioctl.h" -#include "device/tt_arch_types.h" -#include "device/driver_atomics.h" -#include "device/architecture_implementation.h" #include "common/assert.hpp" #include "common/logger.hpp" +#include "device/architecture_implementation.h" +#include "device/driver_atomics.h" +#include "device/tt_arch_types.h" +#include "ioctl.h" static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; @@ -29,19 +29,23 @@ static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; // TODO: we'll have to rethink this when KMD takes control of the inbound PCIe // TLB windows and there is no longer a pre-defined WC/UC split. -static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156 << 20) + (10 << 21) + (18 << 24); // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC -static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; +static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188 << 21; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; template static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) { - const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", - device_info.pci_domain, device_info.pci_bus, - device_info.pci_device, device_info.pci_function, attribute_name); + const auto sysfs_path = fmt::format( + "/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", + device_info.pci_domain, + device_info.pci_bus, + device_info.pci_device, + device_info.pci_function, + attribute_name); std::ifstream attribute_file(sysfs_path); std::string value_str; T value; @@ -66,8 +70,7 @@ static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribu return value; } -static PciDeviceInfo read_device_info(int fd) -{ +static PciDeviceInfo read_device_info(int fd) { 
tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -83,11 +86,11 @@ static PciDeviceInfo read_device_info(int fd) } static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) { - if (pcie_device_id == GS_PCIE_DEVICE_ID){ + if (pcie_device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ + } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01) { return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == BH_PCIE_DEVICE_ID){ + } else if (pcie_device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } else { TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); @@ -113,28 +116,29 @@ inline void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) if (dest_misalignment != 0) { // Read-modify-write for the first dest element. - dp = reinterpret_cast(dest_addr - dest_misalignment); + dp = reinterpret_cast(dest_addr - dest_misalignment); copy_t tmp = *dp; auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); + std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); num_bytes -= leading_len; src = static_cast(src) + leading_len; *dp++ = tmp; } else { - dp = static_cast(dest); + dp = static_cast(dest); } // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); + const copy_t *sp = static_cast(src); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer, again RMW on the destination. 
auto trailing_len = num_bytes % sizeof(copy_t); @@ -157,7 +161,7 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte unsigned int src_misalignment = src_addr % sizeof(copy_t); if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); + sp = reinterpret_cast(src_addr - src_misalignment); copy_t tmp = *sp++; @@ -167,15 +171,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte dest = static_cast(dest) + leading_len; } else { - sp = static_cast(src); + sp = static_cast(src); } // Copy the source-aligned middle. copy_t *dp = static_cast(dest); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer. auto trailing_len = num_bytes % sizeof(copy_t); @@ -186,17 +191,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } tt::ARCH PciDeviceInfo::get_arch() const { - if (this->device_id == GS_PCIE_DEVICE_ID){ + if (this->device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; } else if (this->device_id == WH_PCIE_DEVICE_ID) { return tt::ARCH::WORMHOLE_B0; - } else if (this->device_id == BH_PCIE_DEVICE_ID){ + } else if (this->device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } return tt::ARCH::Invalid; } - /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; std::string path = "/dev/tenstorrent/"; @@ -204,7 +208,7 @@ tt::ARCH PciDeviceInfo::get_arch() const { if (!std::filesystem::exists(path)) { return device_ids; } - for (const auto& entry : std::filesystem::directory_iterator(path)) { + for (const auto &entry : std::filesystem::directory_iterator(path)) { std::string filename = entry.path().filename().string(); // TODO: this will skip any device that has a non-numeric name, which @@ -228,28 +232,29 @@ tt::ARCH PciDeviceInfo::get_arch() const { try { infos[n] = 
read_device_info(fd); - } catch (...) {} + } catch (...) { + } close(fd); } return infos; } -PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) - : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) - , pci_device_num(pci_device_number) - , logical_id(logical_device_id) - , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)) - , info(read_device_info(pci_device_file_desc)) - , numa_node(read_sysfs(info, "numa_node")) - , revision(read_sysfs(info, "revision")) - , arch(detect_arch(info.device_id, revision)) - , architecture_implementation(tt::umd::architecture_implementation::create(arch)) -{ +PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : + device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), + pci_device_num(pci_device_number), + logical_id(logical_device_id), + pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)), + info(read_device_info(pci_device_file_desc)), + numa_node(read_sysfs(info, "numa_node")), + revision(read_sysfs(info, "revision")), + arch(detect_arch(info.device_id, revision)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; } mappings; + memset(&mappings, 0, sizeof(mappings)); mappings.query_mappings.in.output_mapping_count = 8; @@ -293,7 +298,9 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar4_wc_mapping = mappings.mapping_array[i]; } - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + log_debug( + LogSiliconDriver, + "BAR mapping id {} base {} size {}", mappings.mapping_array[i].mapping_id, (void *)mappings.mapping_array[i].mapping_base, mappings.mapping_array[i].mapping_size); @@ -308,7 +315,8 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Attempt WC mapping first so we can fall back to all-UC if it fails. 
if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); + bar0_wc = mmap( + NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); if (bar0_wc == MAP_FAILED) { bar0_wc_size = 0; bar0_wc = nullptr; @@ -325,7 +333,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar0_uc_offset = 0; } - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset); + bar0_uc = mmap( + NULL, + bar0_uc_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar0_uc_mapping.mapping_base + bar0_uc_offset); if (bar0_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num)); @@ -342,22 +356,34 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) system_reg_mapping_size = bar4_uc_mapping.mapping_size; - system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base); + system_reg_mapping = mmap( + NULL, + bar4_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_uc_mapping.mapping_base); if (system_reg_mapping == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num)); } - system_reg_start_offset = (512 - 16) * 1024*1024; - system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(arch == tt::ARCH::BLACKHOLE) { + system_reg_start_offset = (512 - 16) * 1024 * 1024; + system_reg_offset_adjust = (512 - 32) * 1024 * 1024; + } else if (arch == tt::ARCH::BLACKHOLE) { if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { throw 
std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num)); } // Using UnCachable memory mode. This is used for accessing registers on Blackhole. bar2_uc_size = bar2_uc_mapping.mapping_size; - bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base); + bar2_uc = mmap( + NULL, + bar2_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar2_uc_mapping.mapping_base); if (bar2_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num)); @@ -370,7 +396,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. // WC doesn't guarantee write ordering but has better performance. bar4_wc_size = bar4_wc_mapping.mapping_size; - bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base); + bar4_wc = mmap( + NULL, + bar4_wc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_wc_mapping.mapping_base); if (bar4_wc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num)); @@ -390,8 +422,8 @@ PCIDevice::~PCIDevice() { // essential for correctness then it needs to move to the driver. 
uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); } close(pci_device_file_desc); @@ -417,8 +449,8 @@ PCIDevice::~PCIDevice() { } } -template -T* PCIDevice::get_register_address(uint32_t register_offset) { +template +T *PCIDevice::get_register_address(uint32_t register_offset) { // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole). // Should clarify this interface void *reg_mapping; @@ -431,10 +463,10 @@ T* PCIDevice::get_register_address(uint32_t register_offset) { register_offset -= bar0_uc_offset; reg_mapping = bar0_uc; } - return reinterpret_cast(static_cast(reg_mapping) + register_offset); + return reinterpret_cast(static_cast(reg_mapping) + register_offset); } -void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { +void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr) { void *dest = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -451,7 +483,7 @@ void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_ } } -void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { +void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr) { void *src = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -468,7 +500,7 @@ void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buff } if (num_bytes >= sizeof(std::uint32_t)) { - detect_hang_read(*reinterpret_cast(dest)); + detect_hang_read(*reinterpret_cast(dest)); } } @@ -481,14
+513,14 @@ void PCIDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_ void PCIDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { volatile uint32_t *dest = get_register_address(byte_addr); - const uint32_t *src = reinterpret_cast(data); + const uint32_t *src = reinterpret_cast(data); write_regs(dest, src, word_len); } void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { const volatile uint32_t *src = get_register_address(byte_addr); - uint32_t *dest = reinterpret_cast(data); + uint32_t *dest = reinterpret_cast(data); while (word_len-- != 0) { uint32_t temp = *src++; @@ -496,29 +528,34 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { } } -void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){ - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); +void PCIDevice::write_tlb_reg( + uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size) { + log_assert( + (tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), + "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); volatile uint64_t *dest_qw = get_register_address(byte_addr); - volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr + 8); #if defined(__ARM_ARCH) || defined(__riscv) // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory + // accesses. 
Insert an explicit full memory barrier for ARM. Do the same for RISC-V. tt_driver_atomics::mfence(); #endif *dest_qw = value_lower; if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); + uint32_t *p_value_upper = reinterpret_cast(&value_upper); *dest_extra_dw = p_value_upper[0]; } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB + // register. } bool PCIDevice::is_hardware_hung() { - volatile const void *addr = reinterpret_cast(bar0_uc) + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); + volatile const void *addr = reinterpret_cast(bar0_uc) + + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - + bar0_uc_offset; + std::uint32_t scratch_data = *reinterpret_cast(addr); return (scratch_data == c_hang_read_value); } @@ -532,52 +569,91 @@ void PCIDevice::detect_hang_read(std::uint32_t data_read) { } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { auto architecture_implementation = get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } - log_trace(LogSiliconDriver, "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast = {}, ordering = {}", - tlb_index, start.x, start.y, end.x, end.y, address, multicast, (int)ordering); + log_trace( + LogSiliconDriver, + "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast " + "= {}, ordering = {}", + tlb_index, + start.x, + start.y, + end.x, + end.y, + address, + multicast, + (int)ordering); tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start); auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end); - uint32_t tlb_address = address / tlb_config.size; - uint32_t local_address = address % tlb_config.size; - uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); - uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); - - std::pair tlb_data = tt::umd::tlb_data { - .local_offset = tlb_address, - .x_end = static_cast(translated_end_coords.x), - .y_end = static_cast(translated_end_coords.y), - .x_start = 
static_cast(translated_start_coords.x), - .y_start = static_cast(translated_start_coords.y), - .mcast = multicast, - .ordering = ordering, - // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. - // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. - // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true, - }.apply_offset(tlb_config.offset); - - log_debug(LogSiliconDriver, "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} tlb_cfg_reg: 0x{:x}", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); + uint32_t tlb_address = address / tlb_config.size; + uint32_t local_address = address % tlb_config.size; + uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); + uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); + + std::pair tlb_data = + tt::umd::tlb_data{ + .local_offset = tlb_address, + .x_end = static_cast(translated_end_coords.x), + .y_end = static_cast(translated_end_coords.y), + .x_start = static_cast(translated_start_coords.x), + .y_start = static_cast(translated_start_coords.y), + .mcast = multicast, + .ordering = ordering, + // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. + // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be + // the same TLB. Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. + .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, + } + .apply_offset(tlb_config.offset); + + log_debug( + LogSiliconDriver, + "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} " + "tlb_cfg_reg: 0x{:x}", + tlb_index, + tlb_config.index_offset, + tlb_config.size / (1024 * 1024), + tlb_base, + tlb_cfg_reg); write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); - return { tlb_base + local_address, tlb_config.size - local_address }; + return {tlb_base + local_address, tlb_config.size - local_address}; } -dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { return set_dynamic_tlb(tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering); } -tt::umd::architecture_implementation* PCIDevice::get_architecture_implementation() const {return architecture_implementation.get();} \ No newline at end of file +tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation() const { + return architecture_implementation.get(); +} \ No newline at end of file diff 
--git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index c79c1089..3f8f604c 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -12,32 +12,33 @@ #include #include -#include "device/tt_xy_pair.h" +#include "device/tlb.h" #include "device/tt_arch_types.h" #include "device/tt_cluster_descriptor_types.h" -#include "device/tlb.h" +#include "device/tt_xy_pair.h" // TODO: this is used up in tt_silicon_driver.cpp but that logic ought to be // lowered into the PCIDevice class since it is specific to PCIe cards. // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; -// TODO: this is a bit of a hack... something to revisit when we formalize an +// TODO: this is a bit of a hack... something to revisit when we formalize an // abstraction for IO. // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; constexpr unsigned int c_hang_read_value = 0xffffffffu; -namespace tt::umd { class architecture_implementation; } +namespace tt::umd { +class architecture_implementation; +} struct dynamic_tlb { - uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. - uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. + uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. + uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. 
}; -struct PciDeviceInfo -{ +struct PciDeviceInfo { uint16_t vendor_id; uint16_t device_id; uint16_t pci_domain; @@ -51,14 +52,14 @@ struct PciDeviceInfo }; class PCIDevice { - const std::string device_path; // Path to character device: /dev/tenstorrent/N - const int pci_device_num; // N in /dev/tenstorrent/N - const int logical_id; // Unique identifier for each device in entire network topology - const int pci_device_file_desc; // Character device file descriptor - const PciDeviceInfo info; // PCI device info - const int numa_node; // -1 if non-NUMA - const int revision; // PCI revision value from sysfs - const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole + const std::string device_path; // Path to character device: /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N + const int logical_id; // Unique identifier for each device in entire network topology + const int pci_device_file_desc; // Character device file descriptor + const PciDeviceInfo info; // PCI device info + const int numa_node; // -1 if non-NUMA + const int revision; // PCI revision value from sysfs + const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole std::unique_ptr architecture_implementation; public: @@ -77,7 +78,7 @@ class PCIDevice { * * Opens the character device file descriptor, reads device information from * sysfs, and maps device memory region(s) into the process address space. - * + * * @param pci_device_number N in /dev/tenstorrent/N * @param logical_device_id unique identifier for this device in the network topology */ @@ -89,8 +90,8 @@ class PCIDevice { */ ~PCIDevice(); - PCIDevice(const PCIDevice&) = delete; // copy - void operator=(const PCIDevice&) = delete; // copy assignment + PCIDevice(const PCIDevice &) = delete; // copy + void operator=(const PCIDevice &) = delete; // copy assignment /** * @return PCI device info @@ -149,21 +150,39 @@ class PCIDevice { // NOC endpoints. 
Probably worth waiting for the KMD to start owning the // resource management aspect of these PCIe->NOC mappings (the "TLBs") // before doing too much work here... - void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); - void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr); void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); // TLB related functions. // TODO: These are architecture specific, and will be moved out of the class. - void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - dynamic_tlb set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - - tt::umd::architecture_implementation* get_architecture_implementation() const; + void write_tlb_reg( + uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + 
std::uint64_t ordering); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + dynamic_tlb set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + + tt::umd::architecture_implementation *get_architecture_implementation() const; void detect_hang_read(uint32_t data_read = c_hang_read_value); public: @@ -186,8 +205,8 @@ class PCIDevice { // and simplify the code. void *system_reg_mapping = nullptr; size_t system_reg_mapping_size; - uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. 
uint32_t read_checking_offset; @@ -195,6 +214,5 @@ class PCIDevice { bool is_hardware_hung(); template - T* get_register_address(uint32_t register_offset); + T *get_register_address(uint32_t register_offset); }; - diff --git a/device/simulation/deprecated/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp index 3e64c15e..2073ff41 100644 --- a/device/simulation/deprecated/tt_emulation_device.cpp +++ b/device/simulation/deprecated/tt_emulation_device.cpp @@ -1,190 +1,228 @@ -#include +#include "tt_emulation_device.h" + #include +#include #include "common/logger.hpp" #include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" #include "tt_emu_zemi3_wrapper.h" - tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + // create just a default one, we do not have cluster anyway + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - log_info(tt::LogEmulationDriver, "Created Emulation Device "); + log_info(tt::LogEmulationDriver, "Created Emulation Device "); } tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); + ndesc.reset(); + delete tt_zebu_wrapper_inst; + log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); } - + void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, 
core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); + const uint32_t size = static_cast(data.size()); + tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); } std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); + std::vector data(size); + tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - return data; + return data; } - void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); + tt_zebu_wrapper_inst->zebu_start(); + tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); + log_info(tt::LogEmulationDriver, "Started Emulation Device "); } void tt_emulation_device::deassert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_deassert(); + log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); } void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_assert(); + log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); } void 
tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); + tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); } void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); + tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); } - - void tt_emulation_device::close_device() { log_info(tt::LogEmulationDriver, "Closing Emulation Device "); tt_zebu_wrapper_inst->zebu_finish(); } -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/ ) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); + log_info(tt::LogEmulationDriver, "Starting Emulation Device "); +} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == + // rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // MT: Iterate through all the worker cores for bcast: + // if (get_soc_descriptor(0)->is_worker_core(core.first)) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
+ // differentiate which bcast pattern to use based on exclude columns + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // Detect DRAM bcast + if (get_soc_descriptor(0)->is_dram_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } else { + if (get_soc_descriptor(0)->is_worker_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } + } } - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
- // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) { + std::vector vec = base_vec; + uint32_t byte_increment = 4 * vec.size(); + for (uint32_t i = 0; i < unroll_count; ++i) { + vec[0] = i; // slot id for debug + uint64_t offset_addr = base_addr + i * byte_increment; + write_to_device(vec, core, offset_addr, tlb_to_use); } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } } -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t 
size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + std::vector byte_data(vec.size() * sizeof(uint32_t)); + std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - write(core, addr, byte_data); + write(core, addr, byte_data); } -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { + std::vector byte_data = read(core, addr, size); + // Verify that the received byte data can be converted to uint32_t + // if (byte_data.size() % sizeof(uint32_t) != 0) { + // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); + // } -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); + vec.clear(); + vec.resize(byte_data.size() / sizeof(uint32_t)); + std::memcpy(vec.data(), byte_data.data(), byte_data.size()); } void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); + return {}; } std::set 
tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); + return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { dram_address_params = dram_address_params_; } + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} +bool tt_emulation_device::noc_translation_en() { return false; } -std::map tt_emulation_device::get_clocks() { - return std::map(); +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } +std::map tt_emulation_device::get_clocks() { return std::map(); } - +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { + l1_address_params = l1_address_params_; +} diff --git a/device/simulation/deprecated/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h index fb2b5e0d..f3f932a6 100644 --- 
a/device/simulation/deprecated/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -9,63 +9,96 @@ #include #include #include + +#include "tt_device.h" #include "tt_soc_descriptor.h" #include "tt_xy_pair.h" -#include "tt_device.h" // use forward declaration here so we do not need to include tt_zebu_wrapper.h class tt_zebu_wrapper; class tt_emulation_device : public tt_device { public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); + virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care + tt_emulation_device(const std::string& sdesc_path); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool 
skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core(tt_cxy_pair core); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, 
uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use); // See Versim Implementation + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); + virtual std::unordered_map& get_virtual_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set get_target_mmio_device_ids(); + virtual std::set get_target_remote_device_ids(); + virtual ~tt_emulation_device(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual void 
set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map get_clocks(); +private: + tt_device_l1_address_params l1_address_params; + std::shared_ptr ndesc; + tt_device_dram_address_params dram_address_params; + // zebu wrapper, provides interface to zebu emulator device through axi and command transactors + tt_zebu_wrapper* tt_zebu_wrapper_inst = NULL; - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - + // These functions implement the "protocol" between the RTL simulation and the UMD + void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); + std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); }; - diff --git a/device/simulation/deprecated/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp index 33fc3c90..db9ba2cc 100644 --- a/device/simulation/deprecated/tt_emulation_stub.cpp +++ b/device/simulation/deprecated/tt_emulation_stub.cpp @@ -1,20 +1,18 @@ -#include #include +#include #include "common/logger.hpp" #include "tt_emulation_device.h" tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); + throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); } - tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} +void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, 
const std::vector& data) {} +std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { return {}; } void tt_emulation_device::start_device(const tt_device_params& device_params) {} @@ -28,46 +26,93 @@ void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} void tt_emulation_device::close_device() {} -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool 
no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/) {} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) {} + +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write){}; + +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} + +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} // ------------------------- // Not sure how to implement these functions below, leaving them blank/default for now void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } -std::set 
tt_emulation_device::get_target_mmio_device_ids() {return {};} +std::set tt_emulation_device::get_target_mmio_device_ids() { return {}; } -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} +std::set tt_emulation_device::get_target_remote_device_ids() { return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} -std::map tt_emulation_device::get_clocks() {return std::map();} +bool tt_emulation_device::noc_translation_en() { return false; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; +} +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; +} +std::map tt_emulation_device::get_clocks() { return std::map(); } +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} diff --git a/device/simulation/deprecated/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp index e7ac7506..b59ffc08 100644 --- a/device/simulation/deprecated/tt_versim_device.cpp +++ 
b/device/simulation/deprecated/tt_versim_device.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 - - -#include "tt_device.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include #include +#include #include #include +#include "common/logger.hpp" +#include "device/driver_atomics.h" +#include "tt_device.h" #include "yaml-cpp/yaml.h" // TODO: Remove dependency on command_assembler + soc @@ -19,112 +17,134 @@ #include "device/tt_cluster_descriptor.h" namespace CA = CommandAssembler; - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; +void translate_soc_descriptor_to_ca_soc(CA::Soc& soc, const tt_SocDescriptor soc_descriptor) { + for (auto& core : soc_descriptor.cores) { + CA::SocNocNode node; + CA::xy_pair CA_coord(core.first.x, core.first.y); + node.noc_coord = CA_coord; + node.memory_size = core.second.l1_size; + switch (core.second.type) { + case CoreType::ARC: + node.arc = true; + break; + case CoreType::DRAM: { + node.dram = true; +#ifdef EN_DRAM_ALIAS + node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); +#endif + } break; + case 
CoreType::ETH: + node.eth = true; + break; + case CoreType::PCIE: + node.pcie = true; + break; + case CoreType::WORKER: + node.worker = true; + break; + case CoreType::HARVESTED: + node.harvested = true; + break; + case CoreType::ROUTER_ONLY: + node.router_only = true; + break; + default: + std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; + break; + } + soc.SetNodeProperties(node.noc_coord, node); } - soc.SetNodeProperties(node.noc_coord, node); - } } //////// // Device Versim //////// +#include + #include "device.h" #include "sim_interactive.h" -#include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + if (ndesc_path == "") { + ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + } else { + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + } } -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); +std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { + 
return soc_descriptor_per_chip; } -void tt_VersimDevice::close_device() { - stop(); +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } + +void tt_VersimDevice::start_device(const tt_device_params& device_params) { + bool no_checkers = true; + std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0)->grid_size); + start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); } +void tt_VersimDevice::close_device() { stop(); } + void tt_VersimDevice::start( std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; +) { + std::cout << "Start Versim Device " << std::endl; + std::string device_descriptor_dir = "./"; - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } + std::optional vcd_suffix; + if (dump_cores.size() > 0) { + vcd_suffix = "core_dump.vcd"; + } - std::vector vcd_cores; + std::vector vcd_cores; - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. mainly bypasses arch_configs etc from llir. We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End + // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core + // interface. mainly bypasses arch_configs etc from llir. 
We can populate soc directly + // MT: have to preserve ca_soc_descriptor object since versim references it at runtime + CA::xy_pair CA_grid_size( + (soc_descriptor_per_chip.begin()->second).grid_size.x, (soc_descriptor_per_chip.begin()->second).grid_size.y); + // CA::Soc ca_soc_manager(CA_grid_size); + std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); + translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin()->second)); + // TODO: End - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); + std::cout << "Versim Device: turn_on_device "; + std::vector trisc_sizes = { + static_cast(l1_address_params.trisc0_size), + static_cast(l1_address_params.trisc1_size), + static_cast(l1_address_params.trisc2_size)}; + std::unique_ptr versim_unique = versim::turn_on_device( + CA_grid_size, + *p_ca_soc_manager_unique, + plusargs, + vcd_suffix, + dump_cores, + no_checkers, + l1_address_params.trisc_base, + trisc_sizes); + versim = versim_unique.release(); - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); + std::cout << "Versim Device: write info to tvm db " << std::endl; + versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); + versim::build_and_connect_tvm_phase(); - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); + versim->spin_threads(*p_ca_soc_manager_unique, false); + versim::assert_reset(*versim); - p_ca_soc_manager = 
(void*)(p_ca_soc_manager_unique.release()); + p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - std::cout << "Versim Device: Done start " << std::endl; + std::cout << "Versim Device: Done start " << std::endl; } -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} +tt_VersimDevice::~tt_VersimDevice() { ndesc.reset(); } // bool tt_VersimDevice::run() { // std::cout << "Versim Device: Run " << std::endl; @@ -136,165 +156,218 @@ tt_VersimDevice::~tt_VersimDevice () { // } void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); + std::cout << "Versim Device: Deassert risc resets start" << std::endl; + versim::handle_resetting_triscs(*versim); + std::cout << "Versim Device: Start main loop " << std::endl; + versim::startup_versim_main_loop(*versim); } void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) { - // This function deasserts reset on the full versim device (don't need core level granularity for versim) - deassert_risc_reset(); + // This function deasserts reset on the full versim device (don't need core level granularity for versim) + deassert_risc_reset(); } void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); + std::cout << "Pause all the cores" << std::endl; + versim::pause(*versim); - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); + std::cout << "Wait for cores to go to paused state" << std::endl; + versim::sleep_wait_for_paused(*versim); - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); + std::cout << "Assert riscv reset" << std::endl; + versim::assert_riscv_reset(*versim); } void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { 
- // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); + // This function asserts reset on the full versim device (don't need core level granularity for versim) + assert_risc_reset(); } -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - CommandAssembler::memory CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + uint32_t byte_increment = vec.size() * 4; + for (int i = 0; i < unroll_count; i++) { + vec[0] = i; // slot id for debug + write_to_device(vec, core, addr + i * byte_increment, tlb_to_use); + } } -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr 
+ size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +void tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) { + std::vector mem_vector(mem_ptr, mem_ptr + len); + rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); +} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Write vector at target core {}, address: {}", + get_sim_time(*versim), + core.str(), + addr); + + bool aligned_32B = (soc_descriptor_per_chip.begin()->second).cores.at(core).type == CoreType::DRAM; + // MT: Remove these completely + CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::memory CA_tensor_memory(addr, vec); + + nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); + + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core 
: get_soc_descriptor(0)->cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } } -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips + // Do nothing, since Versim does not simulate non-mmio mapped chips } -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, 
const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - vec = result; + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + vec = result; } -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + 
addr, + size); + log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + memcpy(mem_ptr, result.data(), result.size() * sizeof(uint32_t)); } -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { + // No translation is performed + return; } std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } std::set tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { +bool versim_check_dram_core_exists( + const std::vector>& dram_core_channels, tt_xy_pair target_core) { bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; + for (const auto& dram_cores_in_channel : dram_core_channels) { + for (const auto& dram_core : dram_cores_in_channel) { + if (dram_core.x == target_core.x && dram_core.y == target_core.y) { + return true; + } } - } } return false; } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + 
std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } + int tt_VersimDevice::detect_number_of_chips() { return 1; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } // Meant to breakout running functions for simulator bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); + std::cout << "Versim Device: Stop " << std::endl; + + versim::turn_off_device(*versim); + versim->shutdown(); + // Force free of all versim cores + for (auto x = 0; x < versim->grid_size.x; x++) { + for (auto y = 0; y < versim->grid_size.y; y++) { + delete versim->core_grid.at(x).at(y); + } } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; + std::cout << "Versim Device: Stop completed " << std::endl; + delete versim; + return true; } -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} +std::map tt_VersimDevice::get_clocks() { return std::map(); } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; @@ -305,11 +378,11 @@ void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_addres } std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); + return get_soc_descriptor(device_id)->get_num_dram_channels(); } std::uint64_t 
tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id)->dram_bank_size; // Space per channel is identical for now } std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h index 087b7336..5cfb1ea7 100644 --- a/device/simulation/deprecated/tt_versim_device.h +++ b/device/simulation/deprecated/tt_versim_device.h @@ -11,42 +11,91 @@ #include "tt_xy_pair.h" class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; + +namespace nuapi { +namespace device { +template +class Simulator; } +} // namespace nuapi + +namespace versim { +struct VersimSimulatorState; +using VersimSimulator = nuapi::device::Simulator; +} // namespace versim /** * @brief Versim Backend Class, derived from the tt_device class * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
-*/ -class tt_VersimDevice: public tt_device -{ - public: + */ +class tt_VersimDevice : public tt_device { +public: virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path); virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); virtual void close_device(); virtual void deassert_risc_reset(); virtual void deassert_risc_reset_at_core(tt_cxy_pair core); virtual void assert_risc_reset(); virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, 
tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + virtual void rolled_write_to_device( + std::vector& vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use); + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t size_in_bytes, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const 
std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); virtual bool noc_translation_en(); @@ -57,12 +106,13 @@ class tt_VersimDevice: public tt_device virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); static int detect_number_of_chips(); - virtual std::map get_clocks(); + virtual std::map get_clocks(); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: + +private: bool stop(); tt_device_l1_address_params l1_address_params; tt_device_dram_address_params dram_address_params; diff --git a/device/simulation/deprecated/tt_versim_stub.cpp b/device/simulation/deprecated/tt_versim_stub.cpp index 27c69f80..1a0e5cc3 100644 --- a/device/simulation/deprecated/tt_versim_stub.cpp +++ b/device/simulation/deprecated/tt_versim_stub.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 - -#include "tt_device.h" - -#include #include +#include #include #include 
-tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); +#include "tt_device.h" + +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); } -tt_VersimDevice::~tt_VersimDevice () {} +tt_VersimDevice::~tt_VersimDevice() {} std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); @@ -22,23 +21,71 @@ std::unordered_map& tt_VersimDevice::get_virtual_so } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } + int tt_VersimDevice::detect_number_of_chips() { return 0; } -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} +void tt_VersimDevice::start_device(const tt_device_params& device_params) {} + void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, 
uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t len, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) {} + void tt_VersimDevice::wait_for_non_mmio_flush() {} -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& 
cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} void tt_VersimDevice::start( std::vector plusargs, @@ -49,36 +96,48 @@ void tt_VersimDevice::start( ) {} void tt_VersimDevice::deassert_risc_reset() {} + void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {} + void tt_VersimDevice::assert_risc_reset() {} + void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c){}; + // void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} +std::set tt_VersimDevice::get_target_mmio_device_ids() { return {}; } + +std::set tt_VersimDevice::get_target_remote_device_ids() { return {}; } bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; + const std::vector>& dram_core_channels, tt_xy_pair target_core) { + return false; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map 
tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { + return std::unordered_map(); +} bool tt_VersimDevice::stop() { return true; } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} + void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} +std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { return 0; } + +std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } + +std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { return 0; } -std::map tt_VersimDevice::get_clocks() {return std::map();} +std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} +std::map tt_VersimDevice::get_clocks() { return std::map(); } +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index 9b0457d4..59bb2dfd 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -4,43 +4,44 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "tt_simulation_device.h" + +#include +#include + #include +#include 
#include #include -#include -#include - -#include "common/logger.hpp" #include "common/assert.hpp" +#include "common/logger.hpp" #include "device/driver_atomics.h" #include "device/tt_cluster_descriptor.h" - -#include "tt_simulation_device.h" #include "tt_simulation_device_generated.h" -flatbuffers::FlatBufferBuilder create_flatbuffer(DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_=0){ +flatbuffers::FlatBufferBuilder create_flatbuffer( + DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_ = 0) { flatbuffers::FlatBufferBuilder builder; auto data = builder.CreateVector(vec); auto core = tt_vcs_core(core_.x, core_.y); - uint64_t size = size_ == 0 ? size = vec.size()*sizeof(uint32_t) : size = size_; + uint64_t size = size_ == 0 ? size = vec.size() * sizeof(uint32_t) : size = size_; auto device_cmd = CreateDeviceRequestResponse(builder, rw, data, &core, addr, size); builder.Finish(device_cmd); return builder; } -void print_flatbuffer(const DeviceRequestResponse *buf){ +void print_flatbuffer(const DeviceRequestResponse* buf) { std::vector data_vec(buf->data()->begin(), buf->data()->end()); uint64_t addr = buf->address(); uint32_t size = buf->size(); tt_cxy_pair core = {0, buf->core()->x(), buf->core()->y()}; - + std::stringstream ss; ss << std::hex << reinterpret_cast(addr); std::string addr_hex = ss.str(); log_info(tt::LogEmulationDriver, "{} bytes @ address {} in core ({}, {})", size, addr_hex, core.x, core.y); - for(int i = 0; i < data_vec.size(); i++){ + for (int i = 0; i < data_vec.size(); i++) { std::ios_base::fmtflags save = std::cout.flags(); std::cout << "0x" << std::hex << std::setw(8) << std::setfill('0') << data_vec[i] << " "; std::cout.flags(save); @@ -48,14 +49,14 @@ void print_flatbuffer(const DeviceRequestResponse *buf){ std::cout << std::endl; } -tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(sdesc_path){ 
+tt_SimulationDevice::tt_SimulationDevice(const std::string& sdesc_path) : tt_device(sdesc_path) { log_info(tt::LogEmulationDriver, "Instantiating simulation device"); soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; - + // Start VCS simulator in a separate process TT_ASSERT(std::getenv("TT_REMOTE_EXE"), "TT_REMOTE_EXE not set, please provide path to the VCS binary"); - uv_loop_t *loop = uv_default_loop(); + uv_loop_t* loop = uv_default_loop(); uv_process_t child_p; uv_process_options_t child_options = {0}; @@ -69,14 +70,12 @@ tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_dev log_info(tt::LogEmulationDriver, "Simulator process spawned with PID: {}", child_p.pid); } - uv_unref((uv_handle_t *) &child_p); + uv_unref((uv_handle_t*)&child_p); uv_run(loop, UV_RUN_DEFAULT); uv_loop_close(loop); } -tt_SimulationDevice::~tt_SimulationDevice() { - close_device(); -} +tt_SimulationDevice::~tt_SimulationDevice() { close_device(); } // Setup/Teardown Functions std::unordered_map& tt_SimulationDevice::get_virtual_soc_descriptors() { @@ -99,11 +98,11 @@ void tt_SimulationDevice::set_driver_eth_interface_params(const tt_driver_eth_in eth_interface_params = eth_interface_params_; } -void tt_SimulationDevice::start_device(const tt_device_params &device_params) { - void *buf_ptr = nullptr; +void tt_SimulationDevice::start_device(const tt_device_params& device_params) { + void* buf_ptr = nullptr; host.start_host(); - + log_info(tt::LogEmulationDriver, "Waiting for ack msg from remote..."); size_t buf_size = host.recv_from_device(&buf_ptr); auto buf = GetDeviceRequestResponse(buf_ptr); @@ -114,8 +113,9 @@ void tt_SimulationDevice::start_device(const tt_device_params &device_params) { void tt_SimulationDevice::assert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending assert_risc_reset signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 
0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); @@ -124,20 +124,25 @@ void tt_SimulationDevice::assert_risc_reset() { void tt_SimulationDevice::deassert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset' signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); host.send_to_device(wr_buffer_ptr, wr_buffer_size); } void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core) { - log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); + log_info( + tt::LogEmulationDriver, + "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); deassert_risc_reset(); } void tt_SimulationDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - log_info(tt::LogEmulationDriver, "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); + log_info( + tt::LogEmulationDriver, + "Sending 'assert_risc_reset_at_core'.. 
(Not implemented, defaulting to 'assert_risc_reset' instead)"); assert_risc_reset(); } @@ -149,19 +154,21 @@ void tt_SimulationDevice::close_device() { } // Runtime Functions -void tt_SimulationDevice::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +void tt_SimulationDevice::write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { log_info(tt::LogEmulationDriver, "Device writing"); std::vector data((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size_in_bytes / sizeof(uint32_t)); auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_WRITE, data, core, addr); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); - - print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print + + print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - void *rd_resp; +void tt_SimulationDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + void* rd_resp; // Send read request auto rd_req_buf = create_flatbuffer(DEVICE_COMMAND_READ, {0}, core, addr, size); @@ -171,50 +178,49 @@ void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint size_t rd_rsp_sz = host.recv_from_device(&rd_resp); auto rd_resp_buf = GetDeviceRequestResponse(rd_resp); - if (addr != 0x40){ + if (addr != 0x40) { log_info(tt::LogEmulationDriver, "Device reading vec"); - print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam + print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam } 
std::memcpy(mem_ptr, rd_resp_buf->data()->data(), rd_resp_buf->data()->size() * sizeof(uint32_t)); nng_free(rd_resp, rd_rsp_sz); } void tt_SimulationDevice::wait_for_non_mmio_flush() {} + void tt_SimulationDevice::wait_for_non_mmio_flush(const chip_id_t chip) {} -void tt_SimulationDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} // Misc. 
Functions to Query/Set Device State std::unordered_map tt_SimulationDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } -std::vector tt_SimulationDevice::detect_available_device_ids() { - return {0}; -} +std::vector tt_SimulationDevice::detect_available_device_ids() { return {0}; } -std::set tt_SimulationDevice::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set tt_SimulationDevice::get_target_remote_device_ids() { return target_remote_chips; } -std::map tt_SimulationDevice::get_clocks() { - return {{0, 0}}; -} +std::map tt_SimulationDevice::get_clocks() { return {{0, 0}}; } -void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { return nullptr; } std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } @@ -224,12 +230,11 @@ std::uint32_t tt_SimulationDevice::get_num_dram_channels(std::uint32_t device_id } std::uint64_t tt_SimulationDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } -std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { - return 1; -} +std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { return 1; } + +std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {return 0;} +std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { return 0; } diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index 95bdec82..a5eb0c85 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -10,23 +10,23 @@ #include #include -#include "device/tt_device.h" #include "device/simulation/tt_simulation_host.hpp" +#include "device/tt_device.h" -class tt_SimulationDevice: public tt_device { - public: - tt_SimulationDevice(const std::string &sdesc_path); +class tt_SimulationDevice : public tt_device { +public: + tt_SimulationDevice(const std::string& sdesc_path); ~tt_SimulationDevice(); tt_SimulationHost host; - //Setup/Teardown Functions + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void 
set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void start_device(const tt_device_params &device_params); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); virtual void deassert_risc_reset_at_core(tt_cxy_pair core); @@ -34,22 +34,27 @@ class tt_SimulationDevice: public tt_device { virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& 
cores = {}); // Misc. Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); static std::vector detect_available_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -57,7 +62,7 @@ class tt_SimulationDevice: public tt_device { virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - private: +private: // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; diff --git a/device/simulation/tt_simulation_host.cpp b/device/simulation/tt_simulation_host.cpp index ed9cf7e9..309bb7be 100644 --- a/device/simulation/tt_simulation_host.cpp +++ b/device/simulation/tt_simulation_host.cpp @@ -2,19 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include +#include "tt_simulation_host.hpp" #include #include -#include "common/logger.hpp" +#include +#include +#include +#include +#include +#include + #include "common/assert.hpp" -#include "tt_simulation_host.hpp" +#include "common/logger.hpp" tt_SimulationHost::tt_SimulationHost() { // Initialize socket and dialer @@ -64,7 +65,7 @@ void tt_SimulationHost::start_host() { void tt_SimulationHost::send_to_device(uint8_t *buf, size_t buf_size) { int rv; 
log_debug(tt::LogEmulationDriver, "Sending messsage to remote.."); - + void *msg = nng_alloc(buf_size); std::memcpy(msg, buf, buf_size); diff --git a/device/simulation/tt_simulation_host.hpp b/device/simulation/tt_simulation_host.hpp index 6de18a04..26897a44 100644 --- a/device/simulation/tt_simulation_host.hpp +++ b/device/simulation/tt_simulation_host.hpp @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include #include #include +#include #include "device/tt_xy_pair.h" @@ -20,6 +20,7 @@ class tt_SimulationHost { void start_host(); void send_to_device(uint8_t *buf, size_t buf_size); size_t recv_from_device(void **data_ptr); + private: std::unique_ptr host_socket; std::unique_ptr host_dialer; diff --git a/device/tlb.h b/device/tlb.h index 3e8fb826..30094202 100644 --- a/device/tlb.h +++ b/device/tlb.h @@ -8,8 +8,8 @@ #include #include -#include #include +#include namespace tt::umd { @@ -41,10 +41,10 @@ struct tlb_data { // Orderings static constexpr uint64_t Relaxed = 0; - static constexpr uint64_t Strict = 1; - static constexpr uint64_t Posted = 2; + static constexpr uint64_t Strict = 1; + static constexpr uint64_t Posted = 2; - bool check(const tlb_offsets & offset) const; + bool check(const tlb_offsets &offset) const; std::pair apply_offset(const tlb_offsets &offset) const; }; diff --git a/device/tt_arch_types.h b/device/tt_arch_types.h index 8a7c5dba..c165bf1b 100644 --- a/device/tt_arch_types.h +++ b/device/tt_arch_types.h @@ -17,4 +17,4 @@ enum class ARCH { BLACKHOLE = 3, Invalid = 0xFF, }; -} +} // namespace tt diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 558fb0ab..8c1472f1 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -2,22 +2,23 @@ // // SPDX-License-Identifier: Apache-2.0 - #include "tt_cluster_descriptor.h" #include #include -#include +#include #include "common/logger.hpp" -#include "yaml-cpp/yaml.h" - 
#include "fmt/core.h" +#include "yaml-cpp/yaml.h" using namespace tt; -bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { + +bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { return this->ethernet_connections.find(local_chip) != this->ethernet_connections.end() && - this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != this->ethernet_connections.at(local_chip).end(); + this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != + this->ethernet_connections.at(local_chip).end(); } std::tuple tt_ClusterDescriptor::get_chip_and_channel_of_remote_ethernet_core( @@ -38,10 +39,14 @@ std::tuple tt_ClusterDescriptor::get_chip_and_cha } } -// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how extensively router needs to use it -std::vector> tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const { +// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how +// extensively router needs to use it +std::vector> +tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips( + const chip_id_t &first, const chip_id_t &second) const { std::vector> directly_connected_channels = {}; - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { + if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || + this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { return {}; } @@ -58,9 +63,7 @@ bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t chip_id) const { return this->chips_with_mmio.find(chip_id) != 
this->chips_with_mmio.end(); } -bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { - return !is_chip_mmio_capable(chip_id); -} +bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { return !is_chip_mmio_capable(chip_id); } // given two coordinates, finds the number of hops between the two chips // it assumes that shelves are connected in x-dim and racks are connected in y-dim @@ -69,11 +72,19 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { // then once a chip on the same shelf&rack is found, // the distance from this chip to either location_a or location_b is just x&y dim difference. // the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference -int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const { - - log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b)); +int tt_ClusterDescriptor::get_ethernet_link_coord_distance( + const eth_coord_t &location_a, const eth_coord_t &location_b) const { + log_trace( + LogSiliconDriver, + "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + std::get<2>(location_b), + std::get<3>(location_b)); // eth_coord_t: x, y, rack, shelf @@ -93,166 +104,236 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo int y_distance = std::abs(y_a - y_b); // move along y-dim to exit from the shelf to go to a higher shelf - if(shelf_b > shelf_a) { + if (shelf_b > shelf_a) { // this is 
already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); // for each shelf-to-shelf connection at y_a, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && std::get<2>(exit_shelf) == rack_a, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && + std::get<2>(exit_shelf) == rack_a, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy 
systems - log_assert(std::get<3>(next_shelf) == (shelf_a+1) && std::get<2>(next_shelf) == rack_a, + log_assert( + std::get<3>(next_shelf) == (shelf_a + 1) && std::get<2>(next_shelf) == rack_a, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_b); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + std::get<2>(location_b), + std::get<3>(location_b), + distance); return distance; - } - else if(shelf_a > shelf_b) { - + } else if (shelf_a > shelf_b) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - 
if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many") + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many") - // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min - int distance = std::numeric_limits::max(); + // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min + int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && std::get<2>(exit_shelf) == rack_b, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && + std::get<2>(exit_shelf) == rack_b, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(std::get<3>(next_shelf) == (shelf_b+1) && std::get<2>(next_shelf) == rack_b, + log_assert( + std::get<3>(next_shelf) == (shelf_b + 1) && std::get<2>(next_shelf) == rack_b, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int 
distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_a); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + std::get<2>(location_b), + std::get<3>(location_b), + distance); return distance; } // move along y-dim to exit from the shelf to go to a higher shelf - if(rack_b > rack_a) { - + if (rack_b > rack_a) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == + galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) { return std::numeric_limits::max(); } - 
const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); // for each rack-to-rack connection at x_a, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && std::get<2>(next_rack) == (rack_a+1), + log_assert( + std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && + std::get<2>(next_rack) == (rack_a + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_b); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - 
log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + std::get<2>(location_b), + std::get<3>(location_b), + distance); return distance; - } - else if(rack_a > rack_b) { - + } else if (rack_a > rack_b) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == + galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); // for each rack-to-rack connection 
at x_a, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && std::get<2>(next_rack) == (rack_b+1), + log_assert( + std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && + std::get<2>(next_rack) == (rack_b + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_a); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + 
std::get<2>(location_b), + std::get<3>(location_b), + distance); return distance; } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), x_distance + y_distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + std::get<0>(location_a), + std::get<1>(location_a), + std::get<2>(location_a), + std::get<3>(location_a), + std::get<0>(location_b), + std::get<1>(location_b), + std::get<2>(location_b), + std::get<3>(location_b), + x_distance + y_distance); // on same shelf/rack, the distance is just x+y difference return x_distance + y_distance; @@ -260,14 +341,13 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo // Returns the closest mmio chip to the given chip chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t chip) { - log_debug(LogSiliconDriver, "get_closest_mmio_chip to chip{}", chip); if (this->is_chip_mmio_capable(chip)) { return chip; } - if(closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { + if (closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { return closest_mmio_chip_cache[chip]; } @@ -279,7 +359,14 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch const chip_id_t &mmio_chip = pair.first; eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip); - log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, std::get<0>(mmio_eth_coord), std::get<1>(mmio_eth_coord), std::get<2>(mmio_eth_coord), std::get<3>(mmio_eth_coord)); + log_debug( + LogSiliconDriver, + "Checking chip{} at ({}, {}, {}, {})", + mmio_chip, + std::get<0>(mmio_eth_coord), + std::get<1>(mmio_eth_coord), + std::get<2>(mmio_eth_coord), + 
std::get<3>(mmio_eth_coord)); int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord); if (distance < min_distance) { @@ -287,7 +374,8 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch closest_chip = mmio_chip; } } - log_assert(min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); + log_assert( + min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); log_assert(is_chip_mmio_capable(closest_chip), "Closest MMIO chip must be MMIO capable"); @@ -298,12 +386,14 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch return closest_chip; } -std::unique_ptr tt_ClusterDescriptor::create_from_yaml(const std::string &cluster_descriptor_file_path) { +std::unique_ptr tt_ClusterDescriptor::create_from_yaml( + const std::string &cluster_descriptor_file_path) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); std::ifstream fdesc(cluster_descriptor_file_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); + throw std::runtime_error(fmt::format( + "Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); } fdesc.close(); @@ -319,22 +409,31 @@ std::unique_ptr tt_ClusterDescriptor::create_from_yaml(con } std::unique_ptr tt_ClusterDescriptor::create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids) { + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); // Some users need not care about physical ids, can provide empty set. - auto use_physical_ids = physical_mmio_device_ids.size() ? 
true : false; - auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. - auto num_available_physical_devices = physical_mmio_device_ids.size(); - auto required_physical_devices = largest_workload_logical_device_id + 1; - - log_debug(tt::LogSiliconDriver, "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} required_physical_devices: {}", - __FUNCTION__, use_physical_ids, largest_workload_logical_device_id, num_available_physical_devices, required_physical_devices); - - log_assert(!use_physical_ids || num_available_physical_devices >= required_physical_devices, + auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; + auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. + auto num_available_physical_devices = physical_mmio_device_ids.size(); + auto required_physical_devices = largest_workload_logical_device_id + 1; + + log_debug( + tt::LogSiliconDriver, + "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} " + "required_physical_devices: {}", + __FUNCTION__, + use_physical_ids, + largest_workload_logical_device_id, + num_available_physical_devices, + required_physical_devices); + + log_assert( + !use_physical_ids || num_available_physical_devices >= required_physical_devices, "Insufficient silicon devices. Workload requires device_id: {} (ie. 
{} devices) but only {} present", - largest_workload_logical_device_id, required_physical_devices, num_available_physical_devices); + largest_workload_logical_device_id, + required_physical_devices, + num_available_physical_devices); // All Grayskull devices are MMIO mapped so physical_mmio_device_ids correspond to all available devices for (auto &logical_id : logical_mmio_device_ids) { @@ -343,8 +442,10 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull desc->all_chips.insert(logical_id); eth_coord_t chip_location{logical_id, 0, 0, 0}; desc->chip_locations.insert({logical_id, chip_location}); - desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = logical_id; - log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); + desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)] + [std::get<0>(chip_location)] = logical_id; + log_debug( + tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); } desc->enable_all_devices(); @@ -352,7 +453,8 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor( + YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { log_assert(connected_endpoints.IsSequence(), "Invalid YAML"); @@ -385,7 +487,13 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:"); for (const auto &[chip, chan_to_chip_chan_map] : 
desc.ethernet_connections) { for (const auto &[chan, chip_and_chan] : chan_to_chip_chan_map) { - log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, std::get<0>(chip_and_chan), std::get<1>(chip_and_chan)); + log_debug( + LogSiliconDriver, + "\tchip: {}, chan: {} <--> chip: {}, chan: {}", + chip, + chan, + std::get<0>(chip_and_chan), + std::get<1>(chip_and_chan)); } } @@ -407,47 +515,57 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto int highest_rack_id = 0; // shelves and racks can be connected at different chip coordinates - // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is - // this is used in get_ethernet_link_coord_distance to find the distance between two chips + // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip + // on the other shelf/rack is this is used in get_ethernet_link_coord_distance to find the distance between two + // chips for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) { highest_shelf_id = std::max(highest_shelf_id, std::get<3>(chip_eth_coord)); highest_rack_id = std::max(highest_rack_id, std::get<2>(chip_eth_coord)); // iterate over all neighbors - if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { - continue; // chip has no eth connections + if (desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { + continue; // chip has no eth connections } for (const auto &[chan, chip_and_chan] : desc.ethernet_connections.at(chip_id)) { const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan); eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip); // shelves are connected in x-dim - if(std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) { - eth_coord_t higher_shelf_coord = std::get<3>(neighbor_eth_coord) > 
std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_shelf_coord = std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; + if (std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) { + eth_coord_t higher_shelf_coord = + std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_shelf_coord = + std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; int lower_shelf_id = std::get<3>(lower_shelf_coord); int lower_shelf_y = std::get<1>(lower_shelf_coord); - auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; + auto &galaxy_shelf_exit_chip_coords_per_y_dim = + desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; log_assert( - galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == galaxy_shelf_exit_chip_coords_per_y_dim.end() || - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, + galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == + galaxy_shelf_exit_chip_coords_per_y_dim.end() || + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, "Expected a single exit chip on each shelf row"); galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord = lower_shelf_coord; - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(higher_shelf_coord); + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert( + higher_shelf_coord); } // racks are connected in y-dim - if(std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) { - eth_coord_t higher_rack_coord = std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? 
neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_rack_coord = std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; + if (std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) { + eth_coord_t higher_rack_coord = + std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_rack_coord = + std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; int lower_rack_id = std::get<2>(lower_rack_coord); int lower_rack_x = std::get<0>(lower_rack_coord); - auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; + auto &galaxy_rack_exit_chip_coords_per_x_dim = + desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; log_assert( - galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == galaxy_rack_exit_chip_coords_per_x_dim.end() || - galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, + galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == + galaxy_rack_exit_chip_coords_per_x_dim.end() || + galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, "Expected a single exit chip on each rack column"); galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord = lower_rack_coord; galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].destination_chip_coords.insert(higher_rack_coord); @@ -458,23 +576,36 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every shelf (except the highest in id) is found in galaxy_shelves_exit_chip_coords_per_y_dim // this means that we expect the shelves to be connected linearly in a daisy-chain fashion. 
// shelf0->shelf1->shelf2->...->shelfN - for(int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { - log_assert(desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), - "Expected shelf {} to be connected to the next shelf", shelf_id); + for (int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { + log_assert( + desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != + desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), + "Expected shelf {} to be connected to the next shelf", + shelf_id); } // this prints the exit chip coordinates for each shelf // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[shelf, shelf_exit_chip_coords_per_y_dim] : desc.galaxy_shelves_exit_chip_coords_per_y_dim) { for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) { - log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", - shelf, y_dim, - std::get<0>(shelf_exit_chip_coords.source_chip_coord), std::get<1>(shelf_exit_chip_coords.source_chip_coord), - std::get<2>(shelf_exit_chip_coords.source_chip_coord), std::get<3>(shelf_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", + shelf, + y_dim, + std::get<0>(shelf_exit_chip_coords.source_chip_coord), + std::get<1>(shelf_exit_chip_coords.source_chip_coord), + std::get<2>(shelf_exit_chip_coords.source_chip_coord), + std::get<3>(shelf_exit_chip_coords.source_chip_coord)); for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) { // print shelf_exit_chip_coord in the format: (x, y, rack, shelf) - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + 
LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + std::get<0>(destination_chip_coord), + std::get<1>(destination_chip_coord), + std::get<2>(destination_chip_coord), + std::get<3>(destination_chip_coord)); } } } @@ -482,21 +613,35 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // verify that every rack (except the highest in id) is found in galaxy_racks_exit_chip_coords_per_x_dim // this means that we expect the racks to be connected linearly in a daisy-chain fashion. // rack0->rack1->rack2->...->rackN - for(int rack_id = 0; rack_id < highest_rack_id; rack_id++) { - log_assert(desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), - "Expected rack {} to be connected to the next rack", rack_id); + for (int rack_id = 0; rack_id < highest_rack_id; rack_id++) { + log_assert( + desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != + desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), + "Expected rack {} to be connected to the next rack", + rack_id); } // this prints the exit chip coordinates for each rack // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) { for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) { - log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim, - std::get<0>(rack_exit_chip_coords.source_chip_coord), std::get<1>(rack_exit_chip_coords.source_chip_coord), - std::get<2>(rack_exit_chip_coords.source_chip_coord), std::get<3>(rack_exit_chip_coords.source_chip_coord)); + log_debug( + LogSiliconDriver, + "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", + rack, + x_dim, + std::get<0>(rack_exit_chip_coords.source_chip_coord), + std::get<1>(rack_exit_chip_coords.source_chip_coord), + 
std::get<2>(rack_exit_chip_coords.source_chip_coord), + std::get<3>(rack_exit_chip_coords.source_chip_coord)); for (const auto &destination_chip_coord : rack_exit_chip_coords.destination_chip_coords) { - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + std::get<0>(destination_chip_coord), + std::get<1>(destination_chip_coord), + std::get<2>(destination_chip_coord), + std::get<3>(destination_chip_coord)); } } } @@ -509,19 +654,19 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4"); eth_coord_t chip_location{ chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; - + desc.chip_locations.insert({chip_id, chip_location}); - desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id; + desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)] + [std::get<0>(chip_location)] = chip_id; desc.all_chips.insert(chip_id); } - - for(const auto& chip : yaml["chips_with_mmio"]) { - if(chip.IsMap()) { + + for (const auto &chip : yaml["chips_with_mmio"]) { + if (chip.IsMap()) { const auto &chip_map = chip.as>(); const auto &chips = chip_map.begin(); desc.chips_with_mmio.insert({chips->first, chips->second}); - } - else { + } else { const auto &chip_val = chip.as(); desc.chips_with_mmio.insert({chip_val, chip_val}); } @@ -538,8 +683,8 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y std::get<3>(chip_location)); } - if (yaml["boardtype"]) { - for (const auto& chip_board_type : 
yaml["boardtype"].as>()) { + if (yaml["boardtype"]) { + for (const auto &chip_board_type : yaml["boardtype"].as>()) { auto &chip = chip_board_type.first; BoardType board_type; if (chip_board_type.second == "n150") { @@ -554,15 +699,15 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y desc.chip_board_type.insert({chip, board_type}); } } else { - for (const auto& chip: desc.all_chips) { + for (const auto &chip : desc.all_chips) { desc.chip_board_type.insert({chip, BoardType::DEFAULT}); } } } void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc) { - if(yaml["harvesting"]) { - for (const auto& chip_node : yaml["harvesting"].as>()) { + if (yaml["harvesting"]) { + for (const auto &chip_node : yaml["harvesting"].as>()) { chip_id_t chip = chip_node.first; auto harvesting_info = chip_node.second; desc.noc_translation_enabled.insert({chip, harvesting_info["noc_translation"].as()}); @@ -571,9 +716,7 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::enable_all_devices() { - this->enabled_active_chips = this->all_chips; -} +void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { for (const auto &chip : this->all_chips) { @@ -583,8 +726,10 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { } } -std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { - auto eth_connections = std::unordered_map > >(); +std::unordered_map>> +tt_ClusterDescriptor::get_ethernet_connections() const { + auto eth_connections = std:: + unordered_map>>(); for (const auto &[chip, channel_mapping] : this->ethernet_connections) { if (this->enabled_active_chips.find(chip) != this->enabled_active_chips.end()) { @@ -613,7 +758,8 @@ std::unordered_map tt_ClusterDescriptor::get_chip_locati chip_id_t 
tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) { // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology. - // See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png + // See: + // https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png int x = std::get<0>(get_chip_locations().at(virtual_coord)); int y = std::get<1>(get_chip_locations().at(virtual_coord)); return 8 * x + y; @@ -632,9 +778,7 @@ std::unordered_map tt_ClusterDescriptor::get_chips_with_mm return chips_map; } -std::unordered_set tt_ClusterDescriptor::get_all_chips() const { - return this->enabled_active_chips; -} +std::unordered_set tt_ClusterDescriptor::get_all_chips() const { return this->enabled_active_chips; } std::unordered_map tt_ClusterDescriptor::get_harvesting_info() const { return harvesting_masks; @@ -651,10 +795,11 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - BoardType board_type = this->chip_board_type.at(chip_id); - return board_type; + BoardType board_type = this->chip_board_type.at(chip_id); + return board_type; } -std::unordered_map> tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { +std::unordered_map> tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() + const { return chips_grouped_by_closest_mmio; } diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index ea8d0f52..ef99c574 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -4,23 +4,24 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once -#include "device/tt_xy_pair.h" - #include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include #include -#include + #include 
"device/tt_cluster_descriptor_types.h" +#include "device/tt_xy_pair.h" -namespace YAML { class Node; } +namespace YAML { +class Node; +} enum BoardType : uint32_t { N150 = 0, @@ -30,80 +31,82 @@ enum BoardType : uint32_t { }; class tt_ClusterDescriptor { - - private: - int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; - - protected: - - std::unordered_map > > ethernet_connections; - std::unordered_map chip_locations; - // reverse map: rack/shelf/y/x -> chip_id - std::map > > > coords_to_chip_ids; - std::unordered_map chips_with_mmio; - std::unordered_set all_chips; - std::unordered_map noc_translation_enabled = {}; - std::unordered_map harvesting_masks = {}; - std::unordered_set enabled_active_chips; - std::unordered_map closest_mmio_chip_cache = {}; - std::unordered_map chip_board_type = {}; - std::unordered_map> chips_grouped_by_closest_mmio; - - // one-to-many chip connections - struct Chip2ChipConnection { - eth_coord_t source_chip_coord; - std::unordered_set destination_chip_coords; - }; - - // shelf_id -> y dim -> list of chip2chip connections between different shelves - // assumption is that on every row of the shelf there is a chip that is connected to the other shelf - // there could be one-to-many connections between shelves, i.e. 
one chip is connected to multiple chips on the other shelf (in case of nebula->galaxy) - std::unordered_map > galaxy_shelves_exit_chip_coords_per_y_dim = {}; - // rack_id -> x dim -> list of chip2chip connections between different racks - // assumption is that on every row of the rack there is a chip that is connected to the other rack - std::unordered_map > galaxy_racks_exit_chip_coords_per_x_dim = {}; - - static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); - - void fill_chips_grouped_by_closest_mmio(); - - public: - tt_ClusterDescriptor() = default; - tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default; - - /* - * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument ordering when calling the function - * An empty result implies that the two chips do not share any direct connection - */ - std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - - bool is_chip_mmio_capable(const chip_id_t chip_id) const; - bool is_chip_remote(const chip_id_t chip_id) const; - chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); - chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); - static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); - static std::unique_ptr create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids); - - std::unordered_map get_harvesting_info() const; - std::unordered_map get_noc_translation_table_en() const; - std::unordered_map get_chip_locations() const; - std::unordered_map > > get_ethernet_connections() const; - std::unordered_map get_chips_with_mmio() 
const; - std::unordered_set get_all_chips() const; - std::size_t get_number_of_chips() const; - std::unordered_map> get_chips_grouped_by_closest_mmio() const; - - int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; - - BoardType get_board_type(chip_id_t chip_id) const; - - bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - - void enable_all_devices(); - +private: + int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; + +protected: + std::unordered_map>> + ethernet_connections; + std::unordered_map chip_locations; + // reverse map: rack/shelf/y/x -> chip_id + std::map>>> coords_to_chip_ids; + std::unordered_map chips_with_mmio; + std::unordered_set all_chips; + std::unordered_map noc_translation_enabled = {}; + std::unordered_map harvesting_masks = {}; + std::unordered_set enabled_active_chips; + std::unordered_map closest_mmio_chip_cache = {}; + std::unordered_map chip_board_type = {}; + std::unordered_map> chips_grouped_by_closest_mmio; + + // one-to-many chip connections + struct Chip2ChipConnection { + eth_coord_t source_chip_coord; + std::unordered_set destination_chip_coords; + }; + + // shelf_id -> y dim -> list of chip2chip connections between different shelves + // assumption is that on every row of the shelf there is a chip that is connected to the other shelf + // there could be one-to-many connections between shelves, i.e. 
one chip is connected to multiple chips on the other + // shelf (in case of nebula->galaxy) + std::unordered_map> galaxy_shelves_exit_chip_coords_per_y_dim = + {}; + // rack_id -> x dim -> list of chip2chip connections between different racks + // assumption is that on every row of the rack there is a chip that is connected to the other rack + std::unordered_map> galaxy_racks_exit_chip_coords_per_x_dim = {}; + + static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); + + void fill_chips_grouped_by_closest_mmio(); + +public: + tt_ClusterDescriptor() = default; + tt_ClusterDescriptor(const tt_ClusterDescriptor &) = default; + + /* + * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument + * ordering when calling the function An empty result implies that the two chips do not share any direct connection + */ + std::vector> + get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; + + bool is_chip_mmio_capable(const chip_id_t chip_id) const; + bool is_chip_remote(const chip_id_t chip_id) const; + chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); + chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); + static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); + static std::unique_ptr create_for_grayskull_cluster( + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); + + std::unordered_map get_harvesting_info() const; + std::unordered_map get_noc_translation_table_en() const; + std::unordered_map get_chip_locations() const; + std::unordered_map>> + get_ethernet_connections() const; + std::unordered_map 
get_chips_with_mmio() const; + std::unordered_set get_all_chips() const; + std::size_t get_number_of_chips() const; + std::unordered_map> get_chips_grouped_by_closest_mmio() const; + + int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; + + BoardType get_board_type(chip_id_t chip_id) const; + + bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + std::tuple get_chip_and_channel_of_remote_ethernet_core( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + + void enable_all_devices(); }; diff --git a/device/tt_cluster_descriptor_types.h b/device/tt_cluster_descriptor_types.h index 6508a08d..841d80a2 100644 --- a/device/tt_cluster_descriptor_types.h +++ b/device/tt_cluster_descriptor_types.h @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#pragma once +#pragma once #include @@ -15,13 +15,11 @@ using eth_coord_t = std::tuple; // x, y, rack, shelf namespace std { template <> struct hash { - std::size_t operator()(eth_coord_t const &c) const { - std::size_t seed = 0; - seed = std::hash()(std::get<0>(c)) << 48 | - std::hash()(std::get<1>(c)) << 32 | - std::hash()(std::get<2>(c)) << 16 | - std::hash()(std::get<3>(c)); - return seed; - } + std::size_t operator()(eth_coord_t const &c) const { + std::size_t seed = 0; + seed = std::hash()(std::get<0>(c)) << 48 | std::hash()(std::get<1>(c)) << 32 | + std::hash()(std::get<2>(c)) << 16 | std::hash()(std::get<3>(c)); + return seed; + } }; -} +} // namespace std diff --git a/device/tt_device.cpp b/device/tt_device.cpp index 9d974936..b7c3590e 100644 --- a/device/tt_device.cpp +++ b/device/tt_device.cpp @@ -2,30 +2,32 @@ // // SPDX-License-Identifier: Apache-2.0 - #ifdef TT_DEBUG_LOGGING -#define DEBUG_LOG(str) do { std::cout << str << std::endl; } while( false ) +#define DEBUG_LOG(str) \ + do { \ + std::cout << str << std::endl; \ + } while (false) #else #define DEBUG_LOG(str) ((void)0) #endif #include 
"tt_device.h" -#include "device/tt_cluster_descriptor_types.h" -#include + #include +#include #include -#include #include +#include + +#include "device/tt_cluster_descriptor_types.h" #include "yaml-cpp/yaml.h" //////// // Device base //////// -tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) { -} +tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) {} -tt_device::~tt_device() { -} +tt_device::~tt_device() {} const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); diff --git a/device/tt_device.h b/device/tt_device.h index f5749ea5..60d8bf5a 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -8,20 +8,19 @@ #include #include #include +#include #include #include #include -#include -#include "tt_soc_descriptor.h" -#include "tt_xy_pair.h" -#include "tt_silicon_driver_common.hpp" -#include "device/tt_cluster_descriptor_types.h" #include "device/tlb.h" +#include "device/tt_cluster_descriptor_types.h" #include "device/tt_io.hpp" - -#include "pcie/pci_device.hpp" #include "fmt/core.h" +#include "pcie/pci_device.hpp" +#include "tt_silicon_driver_common.hpp" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" using TLB_DATA = tt::umd::tlb_data; @@ -30,29 +29,32 @@ using TLB_DATA = tt::umd::tlb_data; tt::ARCH detect_arch(int pci_device_num); tt::ARCH detect_arch(); -namespace boost::interprocess{ - class named_mutex; +namespace boost::interprocess { +class named_mutex; } class tt_ClusterDescriptor; -enum tt_DevicePowerState { - BUSY, - SHORT_IDLE, - LONG_IDLE -}; +enum tt_DevicePowerState { BUSY, SHORT_IDLE, LONG_IDLE }; enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, }; -inline std::ostream &operator <<(std::ostream &os, const tt_DevicePowerState power_state) { +inline std::ostream& operator<<(std::ostream& os, const tt_DevicePowerState power_state) { switch (power_state) { - case tt_DevicePowerState::BUSY: os << "Busy"; break; - case 
tt_DevicePowerState::SHORT_IDLE: os << "SHORT_IDLE"; break; - case tt_DevicePowerState::LONG_IDLE: os << "LONG_IDLE"; break; - default: throw ("Unknown DevicePowerState"); + case tt_DevicePowerState::BUSY: + os << "Busy"; + break; + case tt_DevicePowerState::SHORT_IDLE: + os << "SHORT_IDLE"; + break; + case tt_DevicePowerState::LONG_IDLE: + os << "LONG_IDLE"; + break; + default: + throw("Unknown DevicePowerState"); } return os; } @@ -119,20 +121,22 @@ struct tt_version { std::uint16_t major = 0xffff; std::uint8_t minor = 0xff; std::uint8_t patch = 0xff; + tt_version() {} + tt_version(std::uint16_t major_, std::uint8_t minor_, std::uint8_t patch_) { major = major_; minor = minor_; patch = patch_; } + tt_version(std::uint32_t version) { major = (version >> 16) & 0xff; minor = (version >> 12) & 0xf; patch = version & 0xfff; } - std::string str() const { - return fmt::format("{}.{}.{}", major, minor, patch); - } + + std::string str() const { return fmt::format("{}.{}.{}", major, minor, patch); } }; struct tt_device_params { @@ -143,29 +147,32 @@ struct tt_device_params { bool init_device = true; bool early_open_device = false; int aiclk = 0; + // The command-line input for vcd_dump_cores can have the following format: // {"*-2", "1-*", "*-*", "1-2"} // '*' indicates we must dump all the cores in that dimension. // This function takes the vector above and unrolles the coords with '*' in one or both dimensions. std::vector unroll_vcd_dump_cores(tt_xy_pair grid_size) const { std::vector unrolled_dump_core; - for (auto &dump_core: vcd_dump_cores) { + for (auto& dump_core : vcd_dump_cores) { // If the input is a single *, then dump all cores. 
if (dump_core == "*") { for (size_t x = 0; x < grid_size.x; x++) { - for (size_t y = 0; y < grid_size.y; y++) { - std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { - unrolled_dump_core.push_back(current_core_coord); + for (size_t y = 0; y < grid_size.y; y++) { + std::string current_core_coord = fmt::format("{}-{}", x, y); + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { + unrolled_dump_core.push_back(current_core_coord); + } } } - } continue; } // Each core coordinate must contain three characters: "core.x-core.y". assert(dump_core.size() <= 5); size_t delimiter_pos = dump_core.find('-'); - assert (delimiter_pos != std::string::npos); // y-dim should exist in core coord. + assert(delimiter_pos != std::string::npos); // y-dim should exist in core coord. std::string core_dim_x = dump_core.substr(0, delimiter_pos); size_t core_dim_y_start = delimiter_pos + 1; @@ -175,7 +182,9 @@ struct tt_device_params { for (size_t x = 0; x < grid_size.x; x++) { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -183,14 +192,16 @@ struct tt_device_params { } else if (core_dim_x == "*") { for (size_t x = 0; x < grid_size.x; x++) { std::string current_core_coord = fmt::format("{}-{}", x, core_dim_y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), 
std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } } else if (core_dim_y == "*") { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", core_dim_x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -202,10 +213,9 @@ struct tt_device_params { } std::vector expand_plusargs() const { - std::vector all_plusargs { + std::vector all_plusargs{ fmt::format("+enable_perf_scoreboard={}", enable_perf_scoreboard), - fmt::format("+register_monitor={}", register_monitor) - }; + fmt::format("+register_monitor={}", register_monitor)}; all_plusargs.insert(all_plusargs.end(), plusargs.begin(), plusargs.end()); @@ -218,18 +228,18 @@ struct tt_device_params { * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. - */ -class tt_device -{ - public: + */ +class tt_device { +public: tt_device(const std::string& sdesc_path); virtual ~tt_device(); + // Setup/Teardown Functions /** * Set L1 Address Map parameters used by UMD to communicate with the TT Device. 
* * @param l1_address_params_ All the L1 parameters required by UMD - */ + */ virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { throw std::runtime_error("---- tt_device::set_device_l1_address_params is not implemented\n"); } @@ -242,9 +252,9 @@ class tt_device * Set Host Address Map parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param host_address_params_ All the Host Address space parameters required by UMD. - */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_host_address_params( + const tt_driver_host_address_params& host_address_params_) { throw std::runtime_error("---- tt_device::set_driver_host_address_params is not implemented\n"); } @@ -252,7 +262,7 @@ class tt_device * Set ERISC Firmware parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param eth_interface_params_ All the Ethernet Firmware parameters required by UMD. - */ + */ virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) { throw std::runtime_error("---- tt_device::set_driver_eth_interface_params is not implemented\n"); } @@ -265,8 +275,13 @@ class tt_device * @param tlb_index TLB id that will be programmed. * @param address Start address TLB is mapped to. * @param ordering Ordering mode for the TLB. 
- */ - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Relaxed) { + */ + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Relaxed) { throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); } @@ -275,45 +290,51 @@ class tt_device * * @param fallback_tlb Dynamic TLB being targeted. * @param ordering Ordering mode for the TLB. - */ + */ virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted) { throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n"); } - + /** - * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). + * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per + * core). * * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. */ - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } /** - * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to use a subset of cores from the active_eth_cores_per_chip set for all host->cluster - * non-MMIO transfers. If this function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). - * If default behaviour is not desired, this function must be called for all MMIO devices. + * Pass in ethernet cores with active links for a specific MMIO chip. 
When called, this function will force UMD to + * use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this + * function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). If + * default behaviour is not desired, this function must be called for all MMIO devices. * * @param mmio_chip Device being targeted. * @param active_eth_cores_per_chip The active ethernet cores for this chip. */ - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { - throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + throw std::runtime_error( + "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. + * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize + * iATUs for PCIe devices and ethernet queues for remote chips. * * @param device_params Object specifying initialization configuration. */ - virtual void start_device(const tt_device_params &device_params) { + virtual void start_device(const tt_device_params& device_params) { throw std::runtime_error("---- tt_device::start_device is not implemented\n"); } /** * Broadcast deassert soft Tensix Reset to the entire device (to be done after start_device is called). 
- */ + */ virtual void deassert_risc_reset() { throw std::runtime_error("---- tt_device::deassert_risc_reset is not implemented\n"); } @@ -322,14 +343,14 @@ class tt_device * Send a soft deassert reset signal to a single tensix core. * * @param core Chip and core being targeted. - */ + */ virtual void deassert_risc_reset_at_core(tt_cxy_pair core) { throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); } /** * Broadcast assert soft Tensix Reset to the entire device. - */ + */ virtual void assert_risc_reset() { throw std::runtime_error("---- tt_device::assert_risc_reset is not implemented\n"); } @@ -338,7 +359,7 @@ class tt_device * Send a soft assert reset signal to a single tensix core. * * @param core Chip and core being targeted. - */ + */ virtual void assert_risc_reset_at_core(tt_cxy_pair core) { throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); } @@ -346,17 +367,15 @@ class tt_device /** * To be called at the end of a run. * Set power state to idle, assert tensix reset at all cores. - */ - virtual void close_device() { - throw std::runtime_error("---- tt_device::close_device is not implemented\n"); - } + */ + virtual void close_device() { throw std::runtime_error("---- tt_device::close_device is not implemented\n"); } // Runtime functions /** * Non-MMIO (ethernet) barrier. - * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding with the next one. - * This will be applied to all chips in the cluster. - */ + * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding + * with the next one. This will be applied to all chips in the cluster. + */ virtual void wait_for_non_mmio_flush() { throw std::runtime_error("---- tt_device::wait_for_non_mmio_flush is not implemented\n"); } @@ -378,12 +397,20 @@ class tt_device * @param addr Address to write to. 
* @param tlb_to_use Specifies fallback/dynamic TLB to use. */ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb) { + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb) { throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } @@ -396,44 +423,54 @@ class tt_device * @param size Number of bytes to read. * @param fallback_tlb Specifies fallback/dynamic TLB to use. */ - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); } /** * Write uint32_t vector to specified address and channel on host (defined for Silicon). - * + * * @param vec Data to write. * @param addr Address to write to. * @param channel Host channel to target. * @param src_device_id Chip to target. 
*/ - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::write_to_sysmem is not implemented\n"); } - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { + + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::read_from_sysmem is not implemented\n"); } - virtual void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::l1_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } // Misc. Functions to Query/Set Device State /** - * Query post harvesting SOC descriptors from UMD in virtual coordinates. + * Query post harvesting SOC descriptors from UMD in virtual coordinates. * These descriptors should be used for looking up cores that are passed into UMD APIs. 
*/ virtual std::unordered_map& get_virtual_soc_descriptors() { throw std::runtime_error("---- tt_device:get_virtual_soc_descriptors is not implemented\n"); } - + /** * Determine if UMD performed harvesting on SOC descriptors. */ @@ -441,18 +478,18 @@ class tt_device throw std::runtime_error("---- tt_device:using_harvested_soc_descriptors is not implemented\n"); return 0; } - + /** * Get harvesting masks for all chips/SOC Descriptors in the cluster. * Each mask represents a map of enabled (0) and disabled (1) rows on a specific chip (in NOC0 Coordinateds). - */ + */ virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } /** * Issue message to device, meant to be picked up by ARC firmware. - * + * * @param logical_device_id Chip to target. * @param msg_code Specifies type of ARC message. * @param wait_for_done Block until ARC responds. @@ -461,8 +498,16 @@ class tt_device * @param timeout Timeout on ARC. * @param return3 Return value from ARC. * @param return4 Return value from ARC. - */ - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr) { + */ + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr) { throw std::runtime_error("---- tt_device::arc_msg is not implemented\n"); } @@ -472,28 +517,28 @@ class tt_device * @param device_id Chip to target. * @param r Row coordinate. * @param c Column coordinate. 
- */ - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { + */ + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { throw std::runtime_error("---- tt_device::translate_to_noc_table_coords is not implemented\n"); } /** * Get the total number of chips in the cluster based on the network descriptor. - */ + */ virtual int get_number_of_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_number_of_chips_in_cluster is not implemented\n"); } /** * Get the logical ids for all chips in the cluster - */ + */ virtual std::unordered_set get_all_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_all_chips_in_cluster is not implemented\n"); } /** * Get cluster descriptor object being used in UMD instance. - */ + */ virtual tt_ClusterDescriptor* get_cluster_description() { throw std::runtime_error("---- tt_device::get_cluster_description is not implemented\n"); } @@ -515,9 +560,9 @@ class tt_device /** * Get clock frequencies for all MMIO devices targeted by UMD. */ - virtual std::map get_clocks() { + virtual std::map get_clocks() { throw std::runtime_error("---- tt_device::get_clocks is not implemented\n"); - return std::map(); + return std::map(); } virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { @@ -535,7 +580,7 @@ class tt_device * Query number of DRAM channels on a specific device. * * @param device_id Logical device id to query. - */ + */ virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_dram_channels is not implemented\n"); return 0; @@ -543,10 +588,10 @@ class tt_device /** * Get size for a specific DRAM channel on a device. - * + * * @param device_id Device to target. * @param channel DRAM channel to target. 
- */ + */ virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_dram_channel_size is not implemented\n"); return 0; @@ -556,7 +601,7 @@ class tt_device * Query number of Host channels (hugepages) allocated for a specific device. * * @param device_id Logical device id to target. - */ + */ virtual std::uint32_t get_num_host_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_host_channels is not implemented\n"); return 0; @@ -567,20 +612,21 @@ class tt_device * * @param device_id Logical device id to target. * @param channel Logical host channel to target. - */ + */ virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_host_channel_size is not implemented\n"); return 0; } /** - * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific device. - * + * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific + * device. + * * @param offset Offset wrt the start of the channel's address space. - * @param src_device_id Device to target. + * @param src_device_id Device to target. * @param channel Host memory channel. 
*/ - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { throw std::runtime_error("---- tt_device::host_dma_address is not implemented\n"); return nullptr; } @@ -589,25 +635,25 @@ class tt_device throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n"); return 0; } + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - protected: +protected: std::unordered_map soc_descriptor_per_chip = {}; }; #include "device/architecture_implementation.h" /** -* Silicon Driver Class, derived from the tt_device class + * Silicon Driver Class, derived from the tt_device class * Implements APIs to communicate with a physical Tenstorrent Device. -*/ -class tt_SiliconDevice: public tt_device -{ - public: + */ +class tt_SiliconDevice : public tt_device { +public: // Constructor /** * Silicon Driver constructor. @@ -620,22 +666,35 @@ class tt_SiliconDevice: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
* @param simulated_harvesting_masks - */ - tt_SiliconDevice(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); - - //Setup/Teardown Functions + */ + tt_SiliconDevice( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); - virtual void start_device(const 
tt_device_params &device_params); + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); virtual void deassert_risc_reset_at_core(tt_cxy_pair core); @@ -643,20 +702,34 @@ class tt_SiliconDevice: public tt_device virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); + virtual 
void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // These functions are used by Debuda, so make them public - void bar_write32 (int logical_device_id, uint32_t addr, uint32_t data); - uint32_t bar_read32 (int logical_device_id, uint32_t addr); + void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); + uint32_t bar_read32(int logical_device_id, uint32_t addr); /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size @@ -674,16 +747,24 @@ class tt_SiliconDevice: public tt_device * - the mapping is unchanged during the lifetime of the returned object. * - the tt_SiliconDevice instance outlives the returned object. * - use of the returned object is congruent with the target's TLB setup. - * + * * @param target The target chip and core to write to. */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); // Misc. 
Functions to Query/Set Device State - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); @@ -691,13 +772,16 @@ class tt_SiliconDevice: public tt_device static std::vector detect_available_device_ids(); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; - static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); - static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); + static std::vector extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows); + static void remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& 
row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); - static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); + static std::unordered_map create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -706,70 +790,144 @@ class tt_SiliconDevice: public tt_device virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; // TODO: This should be accessible through public API, probably to be moved to tt_device. - PCIDevice *get_pci_device(int device_id) const; + PCIDevice* get_pci_device(int device_id) const; // Destructor - virtual ~tt_SiliconDevice (); + virtual ~tt_SiliconDevice(); - private: +private: // Helper functions // Startup + teardown - void create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources); + void create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources); void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &cores); - void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); - void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); - void 
send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); + void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores); + void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets); + void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); + void send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); // No more p2p support. + void init_pcie_iatus(); // No more p2p support. bool init_hugepage(chip_id_t device_id); void check_pcie_device_initialized(int device_id); void set_pcie_power_state(tt_DevicePowerState state); - int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); + int set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); uint32_t get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); void deassert_resets_and_set_power_state(); - int open_hugepage_file(const std::string &dir, chip_id_t device_id, uint16_t channel); - int iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); - uint32_t get_harvested_noc_rows (uint32_t harvesting_mask); - uint32_t get_harvested_rows (int logical_device_id); + int open_hugepage_file(const std::string& dir, chip_id_t device_id, uint16_t channel); + int iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); + uint32_t 
get_harvested_noc_rows(uint32_t harvesting_mask); + uint32_t get_harvested_rows(int logical_device_id); int get_clock(int logical_device_id); // Communication Functions - void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); - void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); - void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); - void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); - void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); + void read_buffer( + void* mem_ptr, + std::uint32_t address, + std::uint16_t channel, + std::uint32_t size_in_bytes, + chip_id_t src_device_id); + void write_buffer( + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb); + void write_to_non_mmio_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast = false, + std::vector broadcast_header = {}); + void read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); - void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair 
core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb); - void ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords); - void set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb); - void insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb); + void read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb); + void ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords); + void set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb); + void insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb); void init_membars(); uint64_t get_sys_addr(uint32_t chip_x, uint32_t 
chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset); uint16_t get_sys_rack(uint32_t rack_x, uint32_t rack_y); bool is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr); - int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); + int pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + int remote_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + bool address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); - virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips - void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); - std::unordered_map>>& get_ethernet_broadcast_headers(const std::set& chips_to_exclude); + virtual uint32_t get_harvested_noc_rows_for_chip( + int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips + void generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& 
cols_to_exclude); + std::unordered_map>>& get_ethernet_broadcast_headers( + const std::set& chips_to_exclude); // Test functions void verify_eth_fw(); - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int test_setup_interface (); + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions); + int test_setup_interface(); // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush. void wait_for_connected_non_mmio_flush(chip_id_t chip_id); @@ -783,22 +941,24 @@ class tt_SiliconDevice: public tt_device std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; tt::ARCH arch_name; - std::unordered_map> m_pci_device_map; // Map of enabled pci devices - int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) + std::unordered_map> m_pci_device_map; // Map of enabled pci devices + int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // remote eth transfer setup static constexpr std::uint32_t NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 6; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 4; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_START_ID = 0; - static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); - static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = + 
NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = + NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - std::vector< std::vector > remote_transfer_ethernet_cores; + std::vector> remote_transfer_ethernet_cores; std::unordered_map flush_non_mmio_per_chip = {}; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; @@ -810,7 +970,7 @@ class tt_SiliconDevice: public tt_device std::unordered_set eth_cores = {}; std::unordered_set dram_cores = {}; uint32_t m_num_host_mem_channels = 0; - std::unordered_map> hugepage_mapping; + std::unordered_map> hugepage_mapping; std::unordered_map> hugepage_mapping_size; std::unordered_map> hugepage_physical_address; std::map> tlb_config_map = {}; @@ -828,7 +988,7 @@ class tt_SiliconDevice: public tt_device bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; bool use_virtual_coords_for_eth_broadcast = true; - tt_version eth_fw_version; // Ethernet FW the driver is interfacing with + tt_version eth_fw_version; // Ethernet FW the driver is interfacing with // Named Mutexes static constexpr char NON_MMIO_MUTEX_NAME[] = "NON_MMIO"; static constexpr char ARC_MSG_MUTEX_NAME[] = "ARC_MSG"; @@ -839,11 +999,11 @@ class tt_SiliconDevice: public tt_device uint32_t get_num_hugepages(); -constexpr inline bool operator==(const tt_version &a, const tt_version &b) { +constexpr inline bool operator==(const tt_version& a, const tt_version& b) { return a.major == b.major && a.minor == b.minor && a.patch == b.patch; } -constexpr inline bool operator>=(const tt_version &a, const tt_version &b) { +constexpr inline bool operator>=(const tt_version& a, const tt_version& b) { bool fw_major_greater = a.major > b.major; bool 
fw_minor_greater = (a.major == b.major) && (a.minor > b.minor); bool patch_greater_or_equal = (a.major == b.major) && (a.minor == b.minor) && (a.patch >= b.patch); diff --git a/device/tt_io.hpp b/device/tt_io.hpp index 5daa7262..304f2ee8 100644 --- a/device/tt_io.hpp +++ b/device/tt_io.hpp @@ -20,20 +20,18 @@ namespace tt { * * It is the caller's responsibility to manage the lifetime of Writer objects. */ -class Writer -{ +class Writer { friend class ::tt_SiliconDevice; public: /** * @brief Write to a SoC core. - * + * * @param address must be aligned to the size of T - * @param value + * @param value */ template - void write(uint32_t address, T value) - { + void write(uint32_t address, T value) { auto dst = reinterpret_cast(base) + address; if (address >= tlb_size) { @@ -44,27 +42,23 @@ class Writer throw std::runtime_error("Unaligned write"); } - *reinterpret_cast(dst) = value; + *reinterpret_cast(dst) = value; } private: /** * @brief tt_SiliconDriver interface to construct a new Writer object. - * + * * @param base pointer to the base address of a mapped TLB. * @param tlb_size size of the mapped TLB. 
*/ - Writer(void *base, size_t tlb_size) - : base(base) - , tlb_size(tlb_size) - { + Writer(void *base, size_t tlb_size) : base(base), tlb_size(tlb_size) { assert(base); assert(tlb_size > 0); } - void *base{ nullptr }; - size_t tlb_size{ 0 }; + void *base{nullptr}; + size_t tlb_size{0}; }; - -} // namespace tt +} // namespace tt diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 6164f6fc..0bc899fb 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -2,58 +2,56 @@ // // SPDX-License-Identifier: Apache-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include -#include #include - +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include #include #include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include +#include -#include "yaml-cpp/yaml.h" #include "common/logger.hpp" - +#include "device/architecture_implementation.h" #include "device/cpuset_lib.hpp" #include "device/driver_atomics.h" -#include "device/architecture_implementation.h" #include "device/tlb.h" #include "device/tt_arch_types.h" -#include "tt_device.h" #include "ioctl.h" +#include "tt_device.h" +#include "yaml-cpp/yaml.h" using namespace boost::interprocess; using namespace tt; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; -const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB +const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1; static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; @@ -99,7 +97,7 @@ tt::ARCH detect_arch() { } template -void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { 
std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -108,14 +106,13 @@ void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes } // Get number of 1GB host hugepages installed. They are used for host queues. -uint32_t get_num_hugepages(){ - +uint32_t get_num_hugepages() { log_assert(HUGEPAGE_REGION_SIZE == 1 << 30, "Hugepages must be 1GB in size"); std::string nr_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"; std::ifstream hugepages_file(nr_hugepages_path); uint32_t num_hugepages = 0; - if(hugepages_file.is_open()) { + if (hugepages_file.is_open()) { std::string value; std::getline(hugepages_file, value); num_hugepages = std::stoi(value); @@ -125,56 +122,72 @@ uint32_t get_num_hugepages(){ } return num_hugepages; - } // Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch. -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { - +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id. - uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - uint32_t total_hugepages = get_num_hugepages(); + uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); + uint32_t num_tt_mmio_devices_for_arch = + tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + uint32_t total_hugepages = get_num_hugepages(); // This shouldn't happen on silicon machines. 
if (num_tt_mmio_devices_for_arch == 0) { - log_warning(LogSiliconDriver, + log_warning( + LogSiliconDriver, "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0", - device_id, revision_id); + device_id, + revision_id); return 0; } - // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups that were incomplete - // ie fewer hugepages than devices, which would partially work previously for some devices. - uint32_t num_channels_per_device_available = std::min(num_channels_per_device_target, std::max((uint32_t) 1, total_hugepages / num_tt_mmio_devices_for_arch)); + // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups + // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices. + uint32_t num_channels_per_device_available = + std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch)); - // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later on. + // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later + // on. if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient Hugepages/HostMemChannels per device."); + log_warning( + LogSiliconDriver, + "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient " + "Hugepages/HostMemChannels per device."); } if (total_hugepages < num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. 
NumHostMemChannels would be 0, bumping to 1.", - total_hugepages, num_tt_mmio_devices_for_arch, device_id, revision_id); + log_warning( + LogSiliconDriver, + "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. " + "NumHostMemChannels would be 0, bumping to 1.", + total_hugepages, + num_tt_mmio_devices_for_arch, + device_id, + revision_id); } if (num_channels_per_device_available < num_channels_per_device_target) { - log_warning(LogSiliconDriver, - "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds NumHostMemChannels. Increase Number of Hugepages.", - num_channels_per_device_available, device_id, num_channels_per_device_target); + log_warning( + LogSiliconDriver, + "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds " + "NumHostMemChannels. Increase Number of Hugepages.", + num_channels_per_device_available, + device_id, + num_channels_per_device_target); } - log_assert(num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, + log_assert( + num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.", - num_channels_per_device_available, g_MAX_HOST_MEM_CHANNELS); + num_channels_per_device_available, + g_MAX_HOST_MEM_CHANNELS); return num_channels_per_device_available; - } -bool is_char_dev(const dirent *ent, const char *parent_dir) { +bool is_char_dev(const dirent* ent, const char* parent_dir) { if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) { char name[2 * NAME_MAX + 2]; strcpy(name, parent_dir); @@ -192,18 +205,16 @@ bool is_char_dev(const dirent *ent, const char *parent_dir) { } } - - // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- // 
-------------------------------------------------------------------------------------------------------------- -#include "tt_silicon_driver_common.hpp" -#include "tt_xy_pair.h" -#include #include #include +#include +#include "tt_silicon_driver_common.hpp" +#include "tt_xy_pair.h" struct routing_cmd_t { uint64_t sys_addr; @@ -212,47 +223,51 @@ struct routing_cmd_t { uint16_t rack; uint16_t src_resp_buf_index; uint32_t local_buf_index; - uint8_t src_resp_q_id; - uint8_t host_mem_txn_id; + uint8_t src_resp_q_id; + uint8_t host_mem_txn_id; uint16_t padding; - uint32_t src_addr_tag; //upper 32-bits of request source address. + uint32_t src_addr_tag; // upper 32-bits of request source address. }; -struct remote_update_ptr_t{ - uint32_t ptr; - uint32_t pad[3]; +struct remote_update_ptr_t { + uint32_t ptr; + uint32_t pad[3]; }; namespace { - struct tt_4_byte_aligned_buffer { - // Stores a 4 byte aligned buffer - // If the input buffer is already 4 byte aligned, this is a nop - std::uint32_t* local_storage = nullptr; - std::uint32_t input_size = 0; - std::uint32_t block_size = 0; - - tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { - input_size = size_in_bytes; - local_storage = (uint32_t*)mem_ptr; - uint32_t alignment_mask = sizeof(uint32_t) - 1; - uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; +struct tt_4_byte_aligned_buffer { + // Stores a 4 byte aligned buffer + // If the input buffer is already 4 byte aligned, this is a nop + std::uint32_t* local_storage = nullptr; + std::uint32_t input_size = 0; + std::uint32_t block_size = 0; - if(size_in_bytes < aligned_size) { - local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; - } - block_size = aligned_size; + tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { + input_size = size_in_bytes; + local_storage = (uint32_t*)mem_ptr; + uint32_t alignment_mask = sizeof(uint32_t) - 1; + uint32_t aligned_size = (size_in_bytes + alignment_mask) & 
~alignment_mask; + + if (size_in_bytes < aligned_size) { + local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; } + block_size = aligned_size; + } - ~tt_4_byte_aligned_buffer() { - if(block_size > input_size) { - delete [] local_storage; - } + ~tt_4_byte_aligned_buffer() { + if (block_size > input_size) { + delete[] local_storage; } - }; -} + } +}; +} // namespace -bool tt_SiliconDevice::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - return ((tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && address >= tlb_config_map.at(chip).at(tlb_index) && (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); +bool tt_SiliconDevice::address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { + return ( + (tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && + address >= tlb_config_map.at(chip).at(tlb_index) && + (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); } std::unordered_map& tt_SiliconDevice::get_virtual_soc_descriptors() { @@ -260,10 +275,10 @@ std::unordered_map& tt_SiliconDevice::get_virtual_s } void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) { - // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here (during device init) - // since its unsafe to modify shared state during multithreaded runtime. - // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible for initializing the driver with this - // field set to cleanup after an aborted process. + // These mutexes are intended to be based on physical devices/pci-intf not logical. 
Set these up ahead of time here + // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm + // is tied to clean_system_resources from the constructor. The main process is responsible for initializing the + // driver with this field set to cleanup after an aborted process. // Store old mask and clear processes umask auto old_umask = umask(0); @@ -272,70 +287,108 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for(auto &tlb : dynamic_tlb_config) { + for (auto& tlb : dynamic_tlb_config) { mutex_name = tlb.first + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); } // Initialize ARC core mutex mutex_name = fmt::format("ARC_MSG{}", pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); if (arch_name == tt::ARCH::WORMHOLE_B0) { mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id); - // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, 
mutex_name.c_str(), unrestricted_permissions); + // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for + // ethernet broadcast + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); } // Initialize interprocess mutexes to make host -> device memory barriers atomic mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); - + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Restore old mask umask(old_umask); } -void tt_SiliconDevice::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { +void tt_SiliconDevice::create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources) { log_debug(LogSiliconDriver, "tt_SiliconDevice::tt_SiliconDevice"); // Don't buffer stdout. setbuf(stdout, NULL); - // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to use available devices. + // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to + // use available devices. 
auto logical_to_physical_device_id_map = ndesc->get_chips_with_mmio(); - log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now."); + log_assert( + target_mmio_device_ids.size() > 0, + "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now."); - for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); + for (const chip_id_t& logical_device_id : target_mmio_device_ids) { + log_assert( + logical_to_physical_device_id_map.count(logical_device_id) != 0, + "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", + logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); if (!m_pci_device_map.count(logical_device_id)) { - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + log_debug( + LogSiliconDriver, + "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", + pci_interface_id, + logical_device_id); + m_pci_device_map.insert( + {logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); } auto dev = m_pci_device_map.at(logical_device_id).get(); uint16_t pcie_device_id = dev->get_pci_device_id(); uint32_t pcie_revision = dev->get_pci_revision(); // TODO: get rid of this, it doesn't make any sense. 
- m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); + m_num_host_mem_channels = + get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); if (dev->get_arch() == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. - log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); + log_warning( + LogSiliconDriver, + "Forcing a single channel for Blackhole device. Multiple host channels not supported."); m_num_host_mem_channels = 1; } - log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})", - m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id); + log_debug( + LogSiliconDriver, + "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} " + "device_id: 0x{:x} revision: {})", + m_num_host_mem_channels, + logical_device_id, + pci_interface_id, + pci_device->get_device_num(), + pci_device->revision_id); // Initialize these. Used to be in header file. - for (int ch = 0; ch < g_MAX_HOST_MEM_CHANNELS; ch ++) { - hugepage_mapping[logical_device_id][ch]= nullptr; + for (int ch = 0; ch < g_MAX_HOST_MEM_CHANNELS; ch++) { + hugepage_mapping[logical_device_id][ch] = nullptr; hugepage_mapping_size[logical_device_id][ch] = 0; hugepage_physical_address[logical_device_id][ch] = 0; } @@ -344,49 +397,63 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target // MT: Initial BH - hugepages will fail init // For using silicon driver without workload to query mission mode params, no need for hugepage. 
- if (!skip_driver_allocs){ + if (!skip_driver_allocs) { bool hugepages_initialized = init_hugepage(logical_device_id); // Large writes to remote chips require hugepages to be initialized. - // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused if using remote only for small transactions) - if(target_remote_chips.size()) { - log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); + // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused + // if using remote only for small transactions) + if (target_remote_chips.size()) { + log_assert( + hugepages_initialized, + "Hugepages must be successfully initialized if workload contains remote chips!"); } if (not hugepage_mapping.at(logical_device_id).at(0)) { log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id); } } - harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map + harvested_coord_translation.insert( + {logical_device_id, + create_harvested_coord_translation( + arch_name, true)}); // translation layer for harvested coords. 
Default is identity map } - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { // Initialize identity mapping for Non-MMIO chips as well - if(!ndesc -> is_chip_mmio_capable(chip)) { + if (!ndesc->is_chip_mmio_capable(chip)) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, true)}); flush_non_mmio_per_chip[chip] = false; } } } -bool tt_SiliconDevice::using_harvested_soc_descriptors() { - return perform_harvesting_on_sdesc && performed_harvesting; -} +bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } -std::unordered_map tt_SiliconDevice::get_harvested_coord_translation_map(chip_id_t logical_device_id) { +std::unordered_map tt_SiliconDevice::get_harvested_coord_translation_map( + chip_id_t logical_device_id) { return harvested_coord_translation.at(logical_device_id); } std::unordered_map tt_SiliconDevice::get_harvesting_masks_for_soc_descriptors() { - if(using_harvested_soc_descriptors()) { + if (using_harvested_soc_descriptors()) { return harvested_rows_per_target; } std::unordered_map default_harvesting_masks = {}; - for(const auto chip : target_devices_in_cluster) default_harvesting_masks.insert({chip, 0}); + for (const auto chip : target_devices_in_cluster) { + default_harvesting_masks.insert({chip, 0}); + } return default_harvesting_masks; } -tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device(sdesc_path) { +tt_SiliconDevice::tt_SiliconDevice( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool 
skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device(sdesc_path) { std::unordered_set target_mmio_device_ids; target_devices_in_cluster = target_devices; arch_name = tt_SocDescriptor(sdesc_path).arch; @@ -396,117 +463,143 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str m_num_pci_devices = available_device_ids.size(); if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); } if (ndesc_path == "") { ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, available_device_ids); - } - else { + } else { ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); } - for (auto &d: target_devices){ - if (ndesc->is_chip_mmio_capable(d)){ + for (auto& d : target_devices) { + if (ndesc->is_chip_mmio_capable(d)) { target_mmio_device_ids.insert(d); - } - else { + } else { target_remote_chips.insert(d); } } - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes. + // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and + // writes. 
auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); + dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - for(const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH + for (const auto& tlb : dynamic_tlb_config) { + dynamic_tlb_ordering_modes.insert( + {tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH } create_device(target_mmio_device_ids, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware - if(arch_name == tt::ARCH::BLACKHOLE) { + if (arch_name == tt::ARCH::BLACKHOLE) { use_ethernet_ordered_writes = false; use_ethernet_broadcast = false; use_virtual_coords_for_eth_broadcast = false; } - if(arch_name == tt::ARCH::WORMHOLE_B0) { - const auto& harvesting_masks = ndesc -> get_harvesting_info(); - const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en(); + if (arch_name == tt::ARCH::WORMHOLE_B0) { + const auto& harvesting_masks = ndesc->get_harvesting_info(); + const auto& noc_translation_enabled = ndesc->get_noc_translation_table_en(); translation_tables_en = false; - for(auto& masks : harvesting_masks) { - if(target_devices.find(masks.first) != target_devices.end()) { + for (auto& masks : harvesting_masks) { + if (target_devices.find(masks.first) != target_devices.end()) { harvested_rows_per_target[masks.first] = 
get_harvested_noc_rows(masks.second); noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first); num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()}); - if(harvested_rows_per_target[masks.first]) { + if (harvested_rows_per_target[masks.first]) { performed_harvesting = true; } } } - if(noc_translation_enabled_for_chip.size() > 0) { - auto const consistent_translation_table_state = [&] (std::pair const& i) { - return noc_translation_enabled_for_chip.begin() -> second == i.second; + if (noc_translation_enabled_for_chip.size() > 0) { + auto const consistent_translation_table_state = [&](std::pair const& i) { + return noc_translation_enabled_for_chip.begin()->second == i.second; }; - bool translation_tables_match_on_all_chips = std::all_of(noc_translation_enabled_for_chip.begin(), noc_translation_enabled_for_chip.end(), consistent_translation_table_state); - log_assert(translation_tables_match_on_all_chips, "Cluster uses NOC translation tables inconsistently across chips."); - translation_tables_en = noc_translation_enabled_for_chip.begin() -> second; + bool translation_tables_match_on_all_chips = std::all_of( + noc_translation_enabled_for_chip.begin(), + noc_translation_enabled_for_chip.end(), + consistent_translation_table_state); + log_assert( + translation_tables_match_on_all_chips, + "Cluster uses NOC translation tables inconsistently across chips."); + translation_tables_en = noc_translation_enabled_for_chip.begin()->second; } - if(translation_tables_en) { + if (translation_tables_en) { harvested_coord_translation.clear(); - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, false)}); } } - log_assert(performed_harvesting ? 
translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); - } - else if(arch_name == tt::ARCH::BLACKHOLE) { + log_assert( + performed_harvesting ? translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Default harvesting info for Blackhole, describing no harvesting - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = 0; //get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = 0; // get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } - } - else if(arch_name == tt::ARCH::GRAYSKULL) { + } else if (arch_name == tt::ARCH::GRAYSKULL) { // Multichip harvesting is supported for GS. - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. 
We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } } - if(simulated_harvesting_masks.size()) { + if (simulated_harvesting_masks.size()) { performed_harvesting = true; for (auto device_id = target_devices.begin(); device_id != target_devices.end(); device_id++) { - log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id); - if(arch_name == tt::ARCH::GRAYSKULL) { - if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) { - log_warning(LogSiliconDriver, - "Simulated harvesting config for device {} does not include the actual harvesting config. Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - *device_id, harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); + log_assert( + simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), + "Could not find harvesting mask for device_id {}", + *device_id); + if (arch_name == tt::ARCH::GRAYSKULL) { + if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != + harvested_rows_per_target[*device_id]) { + log_warning( + LogSiliconDriver, + "Simulated harvesting config for device {} does not include the actual harvesting config. " + "Simulated harvesting mask will be added to the real harvesting mask. 
Actual Harvested Rows : " + "{} Simulated Harvested Rows : {}", + *device_id, + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); } simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id]; - } - else if(arch_name == tt::ARCH::WORMHOLE_B0) { - log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(), - "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); - num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); - log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { + log_assert( + std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= + std::bitset<32>(harvested_rows_per_target[*device_id]).count(), + "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. " + "Actual Harvested Rows : {} Simulated Harvested Rows : {}", + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); + num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); + log_assert( + performed_harvesting ? 
translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); } harvested_rows_per_target[*device_id] = simulated_harvesting_masks.at(*device_id); } @@ -516,38 +609,43 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str populate_cores(); // MT: Initial BH - skip this for BH - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size()); - for (const auto &logical_mmio_chip_id : target_mmio_device_ids) { + for (const auto& logical_mmio_chip_id : target_mmio_device_ids) { const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id); // 4-5 is for send_epoch_commands, 0-3 are for everything else for (std::uint32_t i = 0; i < NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS; i++) { - if(remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { + if (remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { remote_transfer_ethernet_cores.resize(logical_mmio_chip_id + 1); } - remote_transfer_ethernet_cores.at(logical_mmio_chip_id).push_back( - tt_cxy_pair(logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y) - ); + remote_transfer_ethernet_cores.at(logical_mmio_chip_id) + .push_back(tt_cxy_pair( + logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y)); } } } // Default initialize host_address_params based on detected arch host_address_params = architecture_implementation->get_host_address_params(); - } -void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { +void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { // Makes UMD aware of which ethernet cores have active links. 
// Based on this information, UMD determines which ethernet cores can be used for host->cluster non-MMIO transfers. - // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be called for all MMIO devices, if default behaviour - // is not desired. - log_assert(get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, "{} can only be called for Wormhole arch", __FUNCTION__); + // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be + // called for all MMIO devices, if default behaviour is not desired. + log_assert( + get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, + "{} can only be called for Wormhole arch", + __FUNCTION__); auto& eth_cores = get_soc_descriptor(mmio_chip).ethernet_cores; // Cores 0, 1, 6, 7 are only available if in the active set - static std::unordered_set eth_cores_available_if_active = {eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; + static std::unordered_set eth_cores_available_if_active = { + eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; // Eth cores 8 and 9 are always available - std::vector non_mmio_access_cores_for_chip = {tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; + std::vector non_mmio_access_cores_for_chip = { + tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; for (const auto& active_eth_core : active_eth_cores_per_chip) { if (eth_cores_available_if_active.find(active_eth_core) != eth_cores_available_if_active.end()) { non_mmio_access_cores_for_chip.push_back(tt_cxy_pair(mmio_chip, active_eth_core)); @@ -561,27 +659,33 @@ void tt_SiliconDevice::configure_active_ethernet_cores_for_mmio_device(chip_id_t void tt_SiliconDevice::populate_cores() { std::uint32_t count = 0; - for(const auto chip : soc_descriptor_per_chip) { - workers_per_chip.insert({chip.first, 
std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); - if(count == 0) { - eth_cores = std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); - for(std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { - dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)) ; + for (const auto chip : soc_descriptor_per_chip) { + workers_per_chip.insert( + {chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); + if (count == 0) { + eth_cores = + std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); + for (std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { + dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)); } } count++; } } -std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { +std::vector tt_SiliconDevice::extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH - log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); + log_assert( + !((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), + "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); std::vector row_coordinates_to_remove; int row_coordinate = 0; int tmp = harvested_rows; while (tmp) { - if (tmp & 1) + if (tmp & 1) { row_coordinates_to_remove.push_back(row_coordinate); + } tmp = tmp >> 1; row_coordinate++; @@ -595,13 +699,14 @@ std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, return row_coordinates_to_remove; } -void tt_SiliconDevice::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& 
row_coordinates_to_remove) { +void tt_SiliconDevice::remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { std::vector workers_to_keep; - for(auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++){ - if(find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == row_coordinates_to_remove.end()){ + for (auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++) { + if (find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == + row_coordinates_to_remove.end()) { workers_to_keep.push_back(*worker); - } - else{ + } else { (full_soc_descriptor.harvested_workers).push_back(*worker); full_soc_descriptor.cores.at(*worker).type = CoreType::HARVESTED; } @@ -613,28 +718,32 @@ void tt_SiliconDevice::remove_worker_row_from_descriptor(tt_SocDescriptor& full_ std::set modified_y_coords = {}; - for(const auto& core : full_soc_descriptor.workers) { + for (const auto& core : full_soc_descriptor.workers) { modified_y_coords.insert(core.y); } int logical_y_coord = 0; - for(const auto& y_coord : modified_y_coords) { + for (const auto& y_coord : modified_y_coords) { full_soc_descriptor.routing_y_to_worker_y.insert({y_coord, logical_y_coord}); - full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); + full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); logical_y_coord++; } } void tt_SiliconDevice::harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows) { - std::uint32_t max_row_to_remove = (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [] (const auto& a, const auto& b) { return a.y < b.y; })).y; + std::uint32_t max_row_to_remove = + (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [](const auto& a, const auto& b) { + return 
a.y < b.y; + })).y; std::vector row_coordinates_to_remove = extract_rows_to_remove(arch, max_row_to_remove, harvested_rows); remove_worker_row_from_descriptor(sdesc, row_coordinates_to_remove); } -void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting) { +void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors( + const std::string& sdesc_path, const bool perform_harvesting) { const auto default_sdesc = tt_SocDescriptor(sdesc_path); - for(const auto& chip : harvested_rows_per_target) { + for (const auto& chip : harvested_rows_per_target) { auto temp_sdesc = default_sdesc; - if(perform_harvesting) { + if (perform_harvesting) { harvest_rows_in_soc_descriptor(arch_name, temp_sdesc, chip.second); } soc_descriptor_per_chip.insert({chip.first, temp_sdesc}); @@ -642,25 +751,24 @@ void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std } void tt_SiliconDevice::check_pcie_device_initialized(int device_id) { - - PCIDevice *pci_device = get_pci_device(device_id); + PCIDevice* pci_device = get_pci_device(device_id); tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { if (device_arch != tt::ARCH::GRAYSKULL) { - throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { if (device_arch != tt::ARCH::WORMHOLE_B0) { - throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == 
tt::ARCH::BLACKHOLE) { if (device_arch != tt::ARCH::BLACKHOLE) { - throw std::runtime_error(fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } auto architecture_implementation = pci_device->get_architecture_implementation(); @@ -668,29 +776,36 @@ void tt_SiliconDevice::check_pcie_device_initialized(int device_id) { // MT Initial BH - Add check for blackhole once access to ARC registers is setup through TLBs if (arch_name != tt::ARCH::BLACKHOLE) { log_debug(LogSiliconDriver, "== Check if device_id: {} is initialized", device_id); - uint32_t bar_read_initial = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + uint32_t bar_read_initial = + bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); uint32_t arg = bar_read_initial == 500 ? 
325 : 500; uint32_t bar_read_again; - uint32_t arc_msg_return = arc_msg(device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); + uint32_t arc_msg_return = arc_msg( + device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); if (arc_msg_return != 0 || bar_read_again != arg + 1) { auto postcode = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset()); - throw std::runtime_error(fmt::format("Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} bar_read_again: {}", - postcode, - arc_msg_return, - arg, - bar_read_initial, - bar_read_again)); + throw std::runtime_error(fmt::format( + "Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} " + "bar_read_again: {}", + postcode, + arc_msg_return, + arg, + bar_read_initial, + bar_read_again)); } } - if (test_setup_interface()) { - throw std::runtime_error("Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); + throw std::runtime_error( + "Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC " + "Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); } } -std::unordered_map tt_SiliconDevice::create_harvested_coord_translation(const tt::ARCH arch, bool identity_map) { - log_assert(identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); +std::unordered_map tt_SiliconDevice::create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map) { + log_assert( + identity_map ? 
true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); std::unordered_map translation_table = {}; tt_xy_pair grid_size; @@ -698,29 +813,42 @@ std::unordered_map tt_SiliconDevice::create_harvested_co std::vector T6_y = {}; std::vector ethernet = {}; // Store device specific data for GS and WH depending on arch - if(arch == tt::ARCH::GRAYSKULL) { + if (arch == tt::ARCH::GRAYSKULL) { grid_size = tt_xy_pair(13, 12); T6_x = {12, 1, 11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; T6_y = {11, 1, 10, 2, 9, 3, 8, 4, 7, 5}; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { grid_size = tt_xy_pair(17, 12); T6_x = {16, 1, 15, 2, 14, 3, 13, 4, 12, 5, 11, 6, 10, 7}; T6_y = {11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; - } - else { + } else { grid_size = tt_xy_pair(10, 12); T6_x = {1, 2, 3, 4, 6, 7, 8, 9}; T6_y = {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}; - ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; - } - - - if(identity_map) { + ethernet = { + {1, 0}, + {2, 0}, + {3, 0}, + {4, 0}, + {6, 0}, + {7, 0}, + {8, 0}, + {9, 0}, + {1, 6}, + {2, 6}, + {3, 6}, + {4, 6}, + {6, 6}, + {7, 6}, + {8, 6}, + {9, 6}}; + } + + if (identity_map) { // When device is initialized, assume no harvesting and create an identity map for cores // This flow is always used for GS, since there is no hardware harvesting - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); translation_table.insert({curr_core, curr_core}); } @@ -731,34 +859,50 @@ std::unordered_map tt_SiliconDevice::create_harvested_co // If this function is called with identity_map = false, we have perform NOC translation // This can only happen for WH devices // Setup coord translation for workers. 
Map all worker cores - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); - if(std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && - std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { + if (std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && + std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { // This is a worker core. Apply translation for WH. tt_xy_pair harvested_worker; - if(x >= 1 && x <= 4) harvested_worker.x = x + 17; - else if(x <= 9 && x > 5) harvested_worker.x = x + 16; - else log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_worker.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_worker.x = x + 16; + } else { + log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + } - if(y >= 1 && y <= 5) harvested_worker.y = y + 17; - else if(y <= 11 && y > 6) harvested_worker.y = y + 16; - else log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + if (y >= 1 && y <= 5) { + harvested_worker.y = y + 17; + } else if (y <= 11 && y > 6) { + harvested_worker.y = y + 16; + } else { + log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_worker}); } - else if(std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()){ + else if (std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()) { // This is an eth core. Apply translation for WH. 
tt_xy_pair harvested_eth_core; - if(x >= 1 && x <= 4) harvested_eth_core.x = x + 17; - else if(x <= 9 && x > 5) harvested_eth_core.x = x + 16; - else log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_eth_core.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_eth_core.x = x + 16; + } else { + log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + } - if(y == 0) harvested_eth_core.y = y + 16; - else if(y == 6) harvested_eth_core.y = y + 11; - else log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + if (y == 0) { + harvested_eth_core.y = y + 16; + } else if (y == 6) { + harvested_eth_core.y = y + 11; + } else { + log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_eth_core}); } @@ -771,7 +915,7 @@ std::unordered_map tt_SiliconDevice::create_harvested_co return translation_table; } -void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { +void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { auto translated_coords = harvested_coord_translation[device_id].at(tt_xy_pair(c, r)); c = translated_coords.x; r = translated_coords.y; @@ -780,18 +924,20 @@ void tt_SiliconDevice::translate_to_noc_table_coords(chip_id_t device_id, std::s void tt_SiliconDevice::initialize_pcie_devices() { log_debug(LogSiliconDriver, "tt_SiliconDevice::start"); - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { check_pcie_device_initialized(device_it.first); } // TODO: Implement support for multiple host channels on BLACKHOLE. 
- log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); + log_assert( + !(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), + "More channels are not yet supported for Blackhole"); init_pcie_iatus(); init_membars(); } -void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &soft_resets) { +void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& soft_resets) { log_debug(LogSiliconDriver, "tt_SiliconDevice::broadcast_tensix_risc_reset"); PCIDevice* device = get_pci_device(chip_id); @@ -799,7 +945,10 @@ void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; auto logical_id = device->get_logical_id(); - log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str()); + log_debug( + LogSiliconDriver, + "== For all tensix set soft-reset for {} risc cores.", + TensixSoftResetOptionsToString(valid).c_str()); auto architecture_implementation = device->get_architecture_implementation(); @@ -818,77 +967,87 @@ void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const } std::set tt_SiliconDevice::get_target_mmio_device_ids() { - if(!all_target_mmio_devices.size()) { - for (const auto &it: m_pci_device_map) { + if (!all_target_mmio_devices.size()) { + for (const auto& it : m_pci_device_map) { all_target_mmio_devices.insert(it.first); } } return all_target_mmio_devices; } -void tt_SiliconDevice::assert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); -} +void tt_SiliconDevice::assert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } -void tt_SiliconDevice::deassert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); -} +void 
tt_SiliconDevice::deassert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); } void tt_SiliconDevice::deassert_risc_reset_at_core(tt_cxy_pair core) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot deassert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); + std::uint32_t target_device = + core.chip; // Get Target Device to query soc descriptor and determine location in cluster + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot deassert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, TENSIX_DEASSERT_SOFT_RESET); - } - else { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Can't issue access to remote core 
in BH"); send_remote_tensix_risc_reset_to_core(core, TENSIX_DEASSERT_SOFT_RESET); } } void tt_SiliconDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot assert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); + std::uint32_t target_device = + core.chip; // Get Target Device to query soc descriptor and determine location in cluster + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot assert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); - } - else { + } else { send_remote_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); } } // Free memory during teardown, and 
remove (clean/unlock) from any leftover mutexes. void tt_SiliconDevice::cleanup_shared_host_state() { - for(auto &mutex : hardware_resource_mutex_map) { + for (auto& mutex : hardware_resource_mutex_map) { mutex.second.reset(); mutex.second = nullptr; named_mutex::remove(mutex.first.c_str()); } } -std::unordered_set tt_SiliconDevice::get_all_chips_in_cluster() { - return ndesc -> get_all_chips(); -} +std::unordered_set tt_SiliconDevice::get_all_chips_in_cluster() { return ndesc->get_all_chips(); } + int tt_SiliconDevice::get_number_of_chips_in_cluster() { // Returns the number of chips seen in the network descriptor - return ndesc -> get_all_chips().size(); + return ndesc->get_all_chips().size(); } -tt_ClusterDescriptor* tt_SiliconDevice::get_cluster_description() {return ndesc.get();} +tt_ClusterDescriptor* tt_SiliconDevice::get_cluster_description() { return ndesc.get(); } + // Can be used before instantiating a silicon device int tt_SiliconDevice::detect_number_of_chips() { - auto available_device_ids = detect_available_device_ids(); return available_device_ids.size(); - } // Can be used before instantiating a silicon device @@ -902,7 +1061,8 @@ std::vector tt_SiliconDevice::detect_available_device_ids() { return PCIDevice::enumerate_devices(); } -std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable( + int device_id) { PCIDevice* dev = get_pci_device(device_id); const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { @@ -921,7 +1081,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *dev = get_pci_device(target.chip); + auto* dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -935,26 +1095,39 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair 
target) { } auto [tlb_offset, tlb_size] = tlb_data.value(); - auto *base = reinterpret_cast(dev->bar0_wc); + auto* base = reinterpret_cast(dev->bar0_wc); return tt::Writer(base + tlb_offset, tlb_size); } -void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - PCIDevice *dev = get_pci_device(target.chip); +void tt_SiliconDevice::write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb) { + PCIDevice* dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); - log_debug(LogSiliconDriver, "tt_SiliconDevice::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", - target.chip, target.x, target.y, address, size_in_bytes, small_access); + log_debug( + LogSiliconDriver, + "tt_SiliconDevice::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes, + small_access); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to write to DRAM (BAR4 space), we add offset @@ -967,9 +1140,9 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -981,22 +1154,36 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in } } -void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb) { - // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. - log_debug(LogSiliconDriver, "tt_SiliconDevice::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", target.chip, target.x, target.y, address, size_in_bytes); - PCIDevice *dev = get_pci_device(target.chip); +void tt_SiliconDevice::read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb) { + // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this + // function will cause a segfault. 
+ log_debug( + LogSiliconDriver, + "tt_SiliconDevice::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes); + PCIDevice* dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to read from DRAM (BAR4 space), we add offset @@ -1010,9 +1197,9 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -1025,55 +1212,61 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std } void tt_SiliconDevice::read_buffer( - void* mem_ptr, - std::uint32_t address, - std::uint16_t channel, - std::uint32_t size_in_bytes, - chip_id_t src_device_id) { - + void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id) { log_assert(src_device_id != -1, "Must provide src_device_id for host_resident read/write"); - log_assert(channel >= 0 && channel <= g_MAX_HOST_MEM_CHANNELS, "{} - Invalid channel {} for host_resident read/write.", __FUNCTION__, channel); - void * user_scratchspace = nullptr; - - if(hugepage_mapping.at(src_device_id).at(channel)) { - user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); + log_assert( + channel >= 0 && channel <= g_MAX_HOST_MEM_CHANNELS, + "{} - Invalid channel {} for host_resident read/write.", + __FUNCTION__, + channel); + void* user_scratchspace = nullptr; + + if (hugepage_mapping.at(src_device_id).at(channel)) { + user_scratchspace = + 
static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); } else { - throw std::runtime_error(fmt::format("write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." - " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel)); + throw std::runtime_error(fmt::format( + "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel)); } - log_debug(LogSiliconDriver, "tt_SiliconDevice::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", src_device_id, channel, user_scratchspace); - + log_debug( + LogSiliconDriver, + "tt_SiliconDevice::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", + src_device_id, + channel, + user_scratchspace); + memcpy(mem_ptr, user_scratchspace, size_in_bytes); } void tt_SiliconDevice::write_buffer( - const void *mem_ptr, - std::uint32_t size, - std::uint32_t address, - std::uint16_t channel, - chip_id_t src_device_id) { - - void * user_scratchspace = nullptr; - if(hugepage_mapping.at(src_device_id).at(channel)) { - log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); - log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", - hugepage_mapping.at(src_device_id).at(channel), - (address & HUGEPAGE_MAP_MASK), - channel, - size); - user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id) { + void* user_scratchspace = nullptr; + if (hugepage_mapping.at(src_device_id).at(channel)) { + log_assert( + size <= HUGEPAGE_REGION_SIZE, + "write_buffer data has larger size {} than destination buffer {}", + size, + 
HUGEPAGE_REGION_SIZE); + log_debug( + LogSiliconDriver, + "Using hugepage mapping at address {} offset {} chan {} size {}", + hugepage_mapping.at(src_device_id).at(channel), + (address & HUGEPAGE_MAP_MASK), + channel, + size); + user_scratchspace = + static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); } else { - throw std::runtime_error(fmt::format("write_buffer: Hugepage are not allocated for src_device_id: {} ch: {}", - src_device_id, - channel)); + throw std::runtime_error(fmt::format( + "write_buffer: Hugepage are not allocated for src_device_id: {} ch: {}", src_device_id, channel)); } memcpy(user_scratchspace, mem_ptr, size); } - uint32_t tt_SiliconDevice::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state) { PCIDevice* pci_device = get_pci_device(chip_id); uint32_t msg = 0xaa00; @@ -1090,34 +1283,37 @@ uint32_t tt_SiliconDevice::get_power_state_arc_msg(chip_id_t chip_id, tt_DeviceP msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } - default: throw std::runtime_error("Unrecognized power state."); + default: + throw std::runtime_error("Unrecognized power state."); } return msg; } void tt_SiliconDevice::set_pcie_power_state(tt_DevicePowerState state) { - - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { int chip_id = device_it.first; uint32_t msg = get_power_state_arc_msg(chip_id, state); std::stringstream ss; ss << state; auto exit_code = arc_msg(chip_id, 0xaa00 | msg, true, 0, 0); if (exit_code != 0) { - throw std::runtime_error(fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); + throw std::runtime_error( + fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); } } } int tt_SiliconDevice::get_clock(int logical_device_id) { - // TODO: remove this once ARC messages work. // This is currently used only for testing and bringing up Blackhole on Buda. 
if (arch_name == tt::ARCH::BLACKHOLE) { char* clk_env_var = getenv("TT_SILICON_DRIVER_AICLK"); if (clk_env_var != nullptr) { - log_warning(LogSiliconDriver, "ARC messages are not enabled on Blackhole. " - "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}" , clk_env_var); + log_warning( + LogSiliconDriver, + "ARC messages are not enabled on Blackhole. " + "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}", + clk_env_var); return std::stoi(clk_env_var); } } @@ -1125,7 +1321,14 @@ int tt_SiliconDevice::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + auto exit_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), + true, + 0xFFFF, + 0xFFFF, + 1, + &clock); if (exit_code != 0) { throw std::runtime_error(fmt::format("Failed to get aiclk value with exit code {}", exit_code)); } @@ -1133,31 +1336,29 @@ int tt_SiliconDevice::get_clock(int logical_device_id) { } std::map tt_SiliconDevice::get_clocks() { - std::map clock_freq_map; - for (auto &device_it : m_pci_device_map){ + std::map clock_freq_map; + for (auto& device_it : m_pci_device_map) { int d = device_it.first; clock_freq_map.insert({d, get_clock(d)}); } return clock_freq_map; } -tt_SiliconDevice::~tt_SiliconDevice () { - +tt_SiliconDevice::~tt_SiliconDevice() { log_debug(LogSiliconDriver, "tt_SiliconDevice::~tt_SiliconDevice"); cleanup_shared_host_state(); - for (auto &device_it : m_pci_device_map){ - + for (auto& device_it : m_pci_device_map) { chip_id_t device_id = device_it.first; // PCIDevice *dev = device_it.second.get(); - for (int ch = 0; ch < m_num_host_mem_channels; ch ++) 
{ + for (int ch = 0; ch < m_num_host_mem_channels; ch++) { if (hugepage_mapping.at(device_id).at(ch)) { munmap(hugepage_mapping.at(device_id).at(ch), hugepage_mapping_size.at(device_id).at(ch)); } } - + device_it.second.reset(); } m_pci_device_map.clear(); @@ -1176,23 +1377,34 @@ std::optional> tt_SiliconDevice::get_tlb_data_fro tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); tlb_data = architecture_implementation->describe_tlb(tlb_index); - } + } return tlb_data; } -void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); - PCIDevice *pci_device = get_pci_device(logical_device_id); +void tt_SiliconDevice::configure_tlb( + chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); + PCIDevice* pci_device = get_pci_device(logical_device_id); pci_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation, ordering); auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); - if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); + if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { + tlb_config_map.insert({logical_device_id, {}}); + } tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& 
fallback_tlb, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb."); - log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in tt_SiliconDevice::configure_tlb."); + log_assert( + dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), + "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode."); + log_assert( + fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", + "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } @@ -1201,9 +1413,12 @@ void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallbac void tt_SiliconDevice::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); log_debug(LogSiliconDriver, "tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices); - log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.", g_MAX_HOST_MEM_CHANNELS); + log_assert( + m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, + "Maximum of {} 1GB Host memory channels supported.", + g_MAX_HOST_MEM_CHANNELS); - for (auto &src_device_it : m_pci_device_map){ + for (auto& src_device_it : m_pci_device_map) { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); @@ -1211,57 +1426,67 @@ void tt_SiliconDevice::init_pcie_iatus() { for 
(int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) { if (hugepage_mapping.at(logical_id).at(channel_id)) { std::uint32_t region_size = HUGEPAGE_REGION_SIZE; - if (channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) + if (channel_id == 3) { + region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) + } // This log message doesn't look right. - log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); - iatu_configure_peer_region(logical_id, channel_id, hugepage_physical_address.at(logical_id).at(channel_id), region_size); + log_debug( + LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); + iatu_configure_peer_region( + logical_id, channel_id, hugepage_physical_address.at(logical_id).at(channel_id), region_size); if (host_channel_size.find(logical_id) == host_channel_size.end()) { - host_channel_size.insert({logical_id, {}}); + host_channel_size.insert({logical_id, {}}); } host_channel_size.at(logical_id).push_back(region_size); } else { - throw std::runtime_error(fmt::format("init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel_id)); + throw std::runtime_error(fmt::format( + "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", + logical_id, + channel_id)); } } } } // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) -std::string find_hugepage_dir(std::size_t pagesize) -{ - - static const std::regex hugetlbfs_mount_re(fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); +std::string find_hugepage_dir(std::size_t pagesize) { + static const std::regex hugetlbfs_mount_re( + fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); static const std::regex pagesize_re("(?:^|,)pagesize=([0-9]+)([KMGT])(?:,|$)"); std::ifstream 
proc_mounts("/proc/mounts"); - for (std::string line; std::getline(proc_mounts, line); ) - { - if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) - { + for (std::string line; std::getline(proc_mounts, line);) { + if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) { std::string options = mount_match[3]; - if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) - { + if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) { std::size_t mount_page_size = std::stoull(pagesize_match[1]); - switch (pagesize_match[2].str()[0]) - { - case 'T': mount_page_size <<= 10; - case 'G': mount_page_size <<= 10; - case 'M': mount_page_size <<= 10; - case 'K': mount_page_size <<= 10; + switch (pagesize_match[2].str()[0]) { + case 'T': + mount_page_size <<= 10; + case 'G': + mount_page_size <<= 10; + case 'M': + mount_page_size <<= 10; + case 'K': + mount_page_size <<= 10; } - if (mount_page_size == pagesize) - { + if (mount_page_size == pagesize) { return mount_match[2]; } } } } - log_warning(LogSiliconDriver, "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: {}.", hugepage_dir, pagesize); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: " + "{}.", + hugepage_dir, + pagesize); return std::string(); } @@ -1269,16 +1494,18 @@ std::string find_hugepage_dir(std::size_t pagesize) // All processes operating on the same pipeline must agree on the file name. // Today we assume there's only one pipeline running within the system. // One hugepage per device such that each device gets unique memory. 
-int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel) { +int tt_SiliconDevice::open_hugepage_file(const std::string& dir, chip_id_t physical_device_id, uint16_t channel) { std::vector filename; static const char pipeline_name[] = "tenstorrent"; filename.insert(filename.end(), dir.begin(), dir.end()); - if (filename.back() != '/') filename.push_back('/'); + if (filename.back() != '/') { + filename.push_back('/'); + } // In order to limit number of hugepages while transition from shared hugepage (1 per system) to unique // hugepage per device, will share original/shared hugepage filename with physical device 0. - if (physical_device_id != 0 || channel != 0){ + if (physical_device_id != 0 || channel != 0) { std::string device_id_str = fmt::format("device_{}_", physical_device_id); filename.insert(filename.end(), device_id_str.begin(), device_id_str.end()); } @@ -1288,20 +1515,32 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi filename.insert(filename.end(), channel_id_str.begin(), channel_id_str.end()); } - filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator + filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator std::string filename_str(filename.begin(), filename.end()); - filename_str.erase(std::find(filename_str.begin(), filename_str.end(), '\0'), filename_str.end()); // Erase NULL terminator for printing. - log_debug(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", filename_str.c_str(), physical_device_id, channel); + filename_str.erase( + std::find(filename_str.begin(), filename_str.end(), '\0'), + filename_str.end()); // Erase NULL terminator for printing. 
+ log_debug( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", + filename_str.c_str(), + physical_device_id, + channel); // Save original and set umask to unrestricted. auto old_umask = umask(0); - int fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + int fd = + open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); if (fd == -1 && errno == EACCES) { - log_warning(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", filename_str); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", + filename_str); unlink(filename.data()); - fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + fd = open( + filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); } // Verify opened file size. @@ -1310,7 +1549,11 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi log_warning(LogSiliconDriver, "Error reading file size after opening: {}", filename_str); } else { if (st.st_size == 0) { - log_warning(LogSiliconDriver, "Opened hugepage file has zero size, mapping it might fail: {}. Verify that enough hugepages are provided.", filename_str); + log_warning( + LogSiliconDriver, + "Opened hugepage file has zero size, mapping it might fail: {}. Verify that enough hugepages are " + "provided.", + filename_str); } } @@ -1326,10 +1569,10 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi } // For debug purposes when various stages fails. 
-void print_file_contents(std::string filename, std::string hint = ""){ - if (std::filesystem::exists(filename)){ +void print_file_contents(std::string filename, std::string hint = "") { + if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); - if (meminfo.is_open()){ + if (meminfo.is_open()) { std::cout << std::endl << "File " << filename << " " << hint << " is: " << std::endl; std::cout << meminfo.rdbuf(); } @@ -1346,7 +1589,10 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", + hugepage_size); return false; } @@ -1354,32 +1600,49 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { // Support for more than 1GB host memory accessible per device, via channels. for (int ch = 0; ch < m_num_host_mem_channels; ch++) { - int hugepage_fd = open_hugepage_file(hugepage_dir, physical_device_id, ch); if (hugepage_fd == -1) { // Probably a permissions problem. - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", physical_device_id, ch); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", + physical_device_id, + ch); success = false; continue; } - std::byte *mapping = static_cast(mmap(nullptr, hugepage_size, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); + std::byte* mapping = static_cast( + mmap(nullptr, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); close(hugepage_fd); if (mapping == MAP_FAILED) { - log_warning(LogSiliconDriver, "UMD: Mapping a hugepage failed. 
(device: {}, {}/{} errno: {}).", physical_device_id, ch, m_num_host_mem_channels, strerror(errno)); - print_file_contents("/proc/cmdline");\ - print_file_contents("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. + log_warning( + LogSiliconDriver, + "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).", + physical_device_id, + ch, + m_num_host_mem_channels, + strerror(errno)); + print_file_contents("/proc/cmdline"); + print_file_contents( + "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. success = false; continue; } - // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same numanode as TT device. - if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)){ - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: {}). " - "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).", - physical_device_id, ch); + // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same + // numanode as TT device. + if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) { + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: " + "{}). " + "Hugepage allocation is not on NumaNode matching TT Device. 
Side-Effect is decreased Device->Host perf " + "(Issue #893).", + physical_device_id, + ch); } tenstorrent_pin_pages pin_pages; @@ -1392,7 +1655,13 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { auto fd = dev->get_fd(); if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno)); + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed " + "(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", + physical_device_id, + ch, + strerror(errno)); munmap(mapping, hugepage_size); print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)"); print_file_contents("/proc/meminfo"); @@ -1405,64 +1674,80 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { hugepage_mapping_size.at(device_id).at(ch) = hugepage_size; hugepage_physical_address.at(device_id).at(ch) = pin_pages.out.physical_address; - log_debug(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", physical_device_id, ch, hugepage_size, (unsigned long long)hugepage_physical_address.at(device_id).at(ch)); + log_debug( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", + physical_device_id, + ch, + hugepage_size, + (unsigned long long)hugepage_physical_address.at(device_id).at(ch)); } return success; } -int tt_SiliconDevice::test_setup_interface () { +int tt_SiliconDevice::test_setup_interface() { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = 
m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(0, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, &regval); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(1, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, &regval); ret_val = (regval != 0xffffffff && (regval == 33)) ? 
0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // MT Inital BH - Try to enable this, but double check "regval == 33" // int ret_val = 0; // PCIDevice *dev = m_pci_device_map.begin()->second->hdev; - // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, + // dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, + // harvested_coord_translation).bar_offset; // uint32_t regval = 0; // read_regs(dev, mapped_reg, 1, ®val); // ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; // return ret_val; return 0; - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } } -void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - PCIDevice *dev = get_pci_device(logical_device_id); +void tt_SiliconDevice::bar_write32(int logical_device_id, uint32_t addr, uint32_t data) { + PCIDevice* dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? + dev->write_block( + addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? 
} else { dev->write_regs(addr, 1, &data); } } -uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { +uint32_t tt_SiliconDevice::bar_read32(int logical_device_id, uint32_t addr) { PCIDevice* dev = get_pci_device(logical_device_id); uint32_t data; @@ -1475,32 +1760,39 @@ uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { } // Returns 0 if everything was OK -int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { - - +int tt_SiliconDevice::pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - PCIDevice *pci_device = get_pci_device(logical_device_id); + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. 
std::string msg_type = "ARC_MSG"; const scoped_lock lock(*get_mutex(msg_type, pci_device->get_device_num())); - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); - uint32_t misc = bar_read32 (logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); + uint32_t misc = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); if (misc & (1 << 16)) { log_error("trigger_fw_int failed on device {}", logical_device_id); return 1; } else { - bar_write32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); + bar_write32( + logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); } if (wait_for_done) { @@ -1509,24 +1801,31 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo auto start = std::chrono::system_clock::now(); while (true) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); } status = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4); if ((status & 0xffff) == (msg_code & 0xff)) { if (return_3 != nullptr) { - *return_3 = 
bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + *return_3 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); } if (return_4 != nullptr) { - *return_4 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); + *return_4 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); } exit_code = (status & 0xffff0000) >> 16; break; } else if (status == MSG_ERROR_REPLY) { - log_warning(LogSiliconDriver, "On device {}, message code 0x{:x} not recognized by FW", logical_device_id, msg_code); + log_warning( + LogSiliconDriver, + "On device {}, message code 0x{:x} not recognized by FW", + logical_device_id, + msg_code); exit_code = MSG_ERROR_REPLY; break; } @@ -1537,12 +1836,16 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo return exit_code; } -int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { +int tt_SiliconDevice::iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff; uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; - if(peer_region_id == 3) region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - PCIDevice *pci_device = get_pci_device(logical_device_id); + if (peer_region_id == 3) { + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address + // space with the correct start offset + } + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC 
doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. @@ -1552,8 +1855,8 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint64_t base_size = (region_id_to_use + 1) * region_size; uint64_t limit_address = base_addr + base_size - 1; - uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 - uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 + uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 + uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 uint32_t region_ctrl_3 = 0; uint32_t base_addr_lo = base_addr & 0xffffffff; uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff; @@ -1563,43 +1866,83 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); - } - else { - bar_write32(logical_device_id, 
architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), + ®ion_ctrl_1, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), + ®ion_ctrl_2, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), + &base_addr_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), + &base_addr_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), + &limit_address_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), + &dest_bar_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), + &dest_bar_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), + ®ion_ctrl_3, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), + &limit_address_hi, + 1); + } else { + bar_write32( + logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); - arc_msg(logical_device_id, 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), true, 0, 0); + arc_msg( + logical_device_id, + 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), + true, + 0, + 0); } // Print what just happened - uint32_t peer_region_start = 
region_id_to_use*region_size; - uint32_t peer_region_end = (region_id_to_use+1)*region_size - 1; - log_debug(LogSiliconDriver, " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", peer_region_id, peer_region_start, peer_region_end, bar_addr_64); + uint32_t peer_region_start = region_id_to_use * region_size; + uint32_t peer_region_end = (region_id_to_use + 1) * region_size - 1; + log_debug( + LogSiliconDriver, + " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", + peer_region_id, + peer_region_start, + peer_region_end, + bar_addr_64); return 0; } // Returns broken rows as bits set to 1 in 'memory' and 'logic' uint32_t tt_SiliconDevice::get_harvested_noc_rows(uint32_t harvesting_mask) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - const std::vector &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); + const std::vector& harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); uint32_t harv_noc_rows = 0; std::string harv_noc_rows_str = ""; - for (int pos=0; pos> 1; @@ -1610,36 +1953,45 @@ uint32_t tt_SiliconDevice::get_harvested_noc_rows(uint32_t harvesting_mask) { return harv_noc_rows; } -uint32_t tt_SiliconDevice::get_harvested_rows (int logical_device_id) { +uint32_t tt_SiliconDevice::get_harvested_rows(int logical_device_id) { const char* harv_override = std::getenv("T6PY_HARVESTING_OVERRIDE"); uint32_t harv = 0xffffffff; if (harv_override) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - PCIDevice *pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); - log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from 
device {}", logical_device_id); + PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), + true, + 0, + 0, + 1, + &harv); + log_assert( + harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); - log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv==0) ? "DISABLED":"ENABLED", harv); - + log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv == 0) ? "DISABLED" : "ENABLED", harv); + uint32_t memory = harv & 0x3ff; uint32_t logic = (harv >> 10) & 0x3ff; - return (memory|logic); + return (memory | logic); } -uint32_t tt_SiliconDevice::get_harvested_noc_rows_for_chip (int logical_device_id) { +uint32_t tt_SiliconDevice::get_harvested_noc_rows_for_chip(int logical_device_id) { return get_harvested_noc_rows(get_harvested_rows(logical_device_id)); } -void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, int timeout) { +void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); } if (arc_msg(device_id, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success) == MSG_ERROR_REPLY) { @@ -1648,7 +2000,7 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i } } -void *tt_SiliconDevice::host_dma_address(std::uint64_t 
offset, chip_id_t src_device_id, uint16_t channel) const { +void* tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) { return static_cast(hugepage_mapping.at(src_device_id).at(channel)) + offset; } else { @@ -1658,18 +2010,20 @@ void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_dev // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. inline PCIDevice* tt_SiliconDevice::get_pci_device(int device_id) const { - if (!m_pci_device_map.count(device_id)){ + if (!m_pci_device_map.count(device_id)) { throw std::runtime_error(fmt::format("device_id: {} attempted to be accessed, but is not enabled.", device_id)); } return m_pci_device_map.at(device_id).get(); } -std::shared_ptr tt_SiliconDevice::get_mutex(const std::string& tlb_name, int pci_interface_id) { +std::shared_ptr tt_SiliconDevice::get_mutex( + const std::string& tlb_name, int pci_interface_id) { std::string mutex_name = tlb_name + std::to_string(pci_interface_id); return hardware_resource_mutex_map.at(mutex_name); } -uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) { +uint64_t tt_SiliconDevice::get_sys_addr( + uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) { uint64_t result = chip_y; uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1; result <<= eth_interface_params.noc_addr_node_id_bits; @@ -1692,7 +2046,8 @@ uint16_t tt_SiliconDevice::get_sys_rack(uint32_t rack_x, uint32_t rack_y) { } bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { - return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == (curr_rptr & eth_interface_params.cmd_buf_size_mask)); + return (curr_wptr != curr_rptr) && ((curr_wptr & 
eth_interface_params.cmd_buf_size_mask) == + (curr_rptr & eth_interface_params.cmd_buf_size_mask)); } /* @@ -1741,35 +2096,37 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_ * Other schemes may be more performant. */ - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the * ethernet core (host) command queue DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the * mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above */ - void tt_SiliconDevice::write_to_non_mmio_device( - const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, - bool broadcast, std::vector broadcast_header) { - + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast, + std::vector broadcast_header) { chip_id_t mmio_capable_chip_logical; - - if(broadcast) { + + if (broadcast) { mmio_capable_chip_logical = core.chip; - } - else { + } else { mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); } flush_non_mmio_per_chip[ndesc->get_closest_mmio_capable_chip(core.chip)] = true; if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); + log_assert( + active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), + "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); } using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words + constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words const auto target_chip = ndesc->get_chip_locations().at(core.chip); 
std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1778,14 +2135,15 @@ void tt_SiliconDevice::write_to_non_mmio_device( translate_to_noc_table_coords(core.chip, core.y, core.x); std::vector erisc_command; std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); std::vector data_block; - routing_cmd_t *new_cmd; + routing_cmd_t* new_cmd; uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; + uint32_t timestamp = 0; // CMD_TIMESTAMP; bool use_dram; uint32_t max_block_size; @@ -1797,14 +2155,22 @@ void tt_SiliconDevice::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); - - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + + int& active_core_for_txn = + non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; + tt_cxy_pair remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); uint32_t full_count = 0; uint32_t offset = 0; uint32_t block_size; @@ -1814,40 +2180,55 @@ void tt_SiliconDevice::write_to_non_mmio_device( erisc_q_rptr[0] = erisc_q_ptrs[4]; while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); full_count++; } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); + // full = true; + // set full only if this command will make the q full. + // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
+ // As long as current command push does not fill up the queue completely, we do not want + // to poll rd pointer in every iteration. + // full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned } else { // For broadcast we prepend a 32byte header. Decrease block size (size of payload) by this amount. - block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset : max_block_size - 32 * broadcast; + block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset + : max_block_size - 32 * broadcast; // Explictly align block_size to 4 bytes, in case the input buffer is not uint32_t aligned uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size in the last block - uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size + // in the last block + uint64_t transfer_size = + std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast - uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; + uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) + : eth_interface_params.cmd_wr_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) + : eth_interface_params.cmd_wr_ack; timestamp = 0; - - if(broadcast) { + + if (broadcast) { req_flags |= eth_interface_params.cmd_broadcast; } - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint32_t host_dram_block_addr = + host_address_params.eth_routing_buffers_start + + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (req_flags & eth_interface_params.cmd_data_block) { // Copy data to sysmem or device DRAM for Block mode @@ -1856,46 +2237,61 @@ void tt_SiliconDevice::write_to_non_mmio_device( resp_flags |= eth_interface_params.cmd_data_block_dram; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - if(broadcast) { + if (broadcast) { // Write broadcast header to sysmem - write_to_sysmem(broadcast_header.data(), broadcast_header.size() * sizeof(uint32_t), host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + broadcast_header.data(), + broadcast_header.size() * sizeof(uint32_t), + host_dram_block_addr, + host_dram_channel, + mmio_capable_chip_logical); } // Write payload to sysmem - write_to_sysmem(data_block.data(), data_block.size() * DATA_WORD_SIZE, host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, + host_dram_channel, + mmio_capable_chip_logical); } else { uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); + write_device_memory( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + buf_address, + write_tlb); } tt_driver_atomics::sfence(); } // Send the read request - log_assert(broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. 
- - if(broadcast) { + log_assert( + broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), + "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. + + if (broadcast) { // Only specify endpoint local address for broadcast new_cmd->sys_addr = address + offset; - } - else { - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); + } else { + new_cmd->sys_addr = + get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); } - - if(req_flags & eth_interface_params.cmd_data_block) { + + if (req_flags & eth_interface_params.cmd_data_block) { // Block mode new_cmd->data = block_size + BROADCAST_HEADER_SIZE * broadcast; - } - else { - if(size_in_bytes - offset < sizeof(uint32_t)) { + } else { + if (size_in_bytes - offset < sizeof(uint32_t)) { // Handle misalignment at the end of the buffer: // Assemble a padded uint32_t from single bytes, in case we have less than 4 bytes remaining memcpy(&new_cmd->data, static_cast(mem_ptr) + offset, size_in_bytes - offset); - } - else { - new_cmd->data = *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE); + } else { + new_cmd->data = *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE); } } @@ -1903,14 +2299,24 @@ void tt_SiliconDevice::write_to_non_mmio_device( if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); 
tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); offset += transfer_size; @@ -1923,10 +2329,19 @@ void tt_SiliconDevice::write_to_non_mmio_device( if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { active_core_for_txn++; uint32_t update_mask_for_chip = remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1; - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); + active_core_for_txn = + non_mmio_transfer_cores_customized + ? 
(active_core_for_txn & update_mask_for_chip) + : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); // active_core = (active_core & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1934,11 +2349,12 @@ void tt_SiliconDevice::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core + * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring + * the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ -void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { - +void tt_SiliconDevice::read_from_non_mmio_device( + void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1946,33 +2362,50 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core std::string empty_tlb = ""; translate_to_noc_table_coords(core.chip, core.y, core.x); - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); + const auto& mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); std::vector erisc_command; std::vector erisc_q_rptr; - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / DATA_WORD_SIZE); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / DATA_WORD_SIZE); std::vector erisc_resp_q_wptr = std::vector(1); std::vector erisc_resp_q_rptr = std::vector(1); - std::vector data_block; - routing_cmd_t *new_cmd; + routing_cmd_t* new_cmd; - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; // // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); const tt_cxy_pair remote_transfer_ethernet_core = 
remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); - read_device_memory(erisc_resp_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); + read_device_memory( + erisc_resp_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr.resize(1); @@ -1990,25 +2423,34 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = 
is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); } uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned block + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned block } else { block_size = offset + max_block_size > size_in_bytes ? size_in_bytes - offset : max_block_size; // Align up to 4 bytes. uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) : eth_interface_params.cmd_rd_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) : eth_interface_params.cmd_rd_data; + uint32_t req_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) + : eth_interface_params.cmd_rd_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) + : eth_interface_params.cmd_rd_data; uint32_t resp_rd_ptr = erisc_resp_q_rptr[0] & eth_interface_params.cmd_buf_size_mask; uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + resp_rd_ptr * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
+ uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. if (use_dram && block_size > DATA_WORD_SIZE) { req_flags |= eth_interface_params.cmd_data_block_dram; @@ -2016,22 +2458,36 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core } // Send the read request - log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); + log_assert( + (req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), + "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. + new_cmd->sys_addr = + get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); new_cmd->data = block_size; new_cmd->flags = req_flags; if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);; + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); + ; tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + 
eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); // If there is more data to read and this command will make the q full, set full to 1. // otherwise full stays false so that we do not poll the rd pointer in next iteration. @@ -2039,7 +2495,12 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // to poll rd pointer in every iteration. if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]), erisc_q_rptr[0])) { - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -2055,13 +2516,23 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // So we have to wait for wrptr to advance, then wait for flags to be nonzero, then read data. 
do { - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_q_rptr[0] == erisc_resp_q_wptr[0]); tt_driver_atomics::lfence(); uint32_t flags_offset = 12 + sizeof(routing_cmd_t) * resp_rd_ptr; std::vector erisc_resp_flags = std::vector(1); do { - read_device_memory(erisc_resp_flags.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + flags_offset, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_flags.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + flags_offset, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_flags[0] == 0); if (erisc_resp_flags[0] == resp_flags) { @@ -2069,27 +2540,40 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core uint32_t data_offset = 8 + sizeof(routing_cmd_t) * resp_rd_ptr; if (block_size == DATA_WORD_SIZE) { std::vector erisc_resp_data = std::vector(1); - read_device_memory(erisc_resp_data.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + data_offset, DATA_WORD_SIZE, read_tlb); - if(size_in_bytes - offset < 4) { + read_device_memory( + erisc_resp_data.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + data_offset, + DATA_WORD_SIZE, + read_tlb); + if (size_in_bytes - offset < 4) { // Handle misaligned (4 bytes) data at the end of the block. 
// Only read remaining bytes into the host buffer, instead of reading the full uint32_t std::memcpy((uint8_t*)mem_ptr + offset, erisc_resp_data.data(), size_in_bytes - offset); - } - else { - *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE) = erisc_resp_data[0]; + } else { + *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE) = erisc_resp_data[0]; } } else { // Read 4 byte aligned block from device/sysmem if (use_dram) { size_buffer_to_capacity(data_block, block_size); - read_from_sysmem(data_block.data(), host_dram_block_addr, host_dram_channel, block_size, mmio_capable_chip_logical); + read_from_sysmem( + data_block.data(), + host_dram_block_addr, + host_dram_channel, + block_size, + mmio_capable_chip_logical); } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; + uint32_t buf_address = + eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); - read_device_memory(data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); + read_device_memory( + data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); } // assert(mem_ptr.size() - (offset/DATA_WORD_SIZE) >= (block_size * DATA_WORD_SIZE)); - log_assert((data_block.size() * DATA_WORD_SIZE) >= block_size, "Incorrect data size read back from sysmem/device"); + log_assert( + (data_block.size() * DATA_WORD_SIZE) >= block_size, + "Incorrect data size read back from sysmem/device"); // Account for misalignment by skipping any padding bytes in the copied data_block memcpy((uint8_t*)mem_ptr + offset, data_block.data(), std::min(block_size, size_in_bytes - offset)); } @@ -2097,40 +2581,53 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // Finally increment the rdptr for the response command q erisc_resp_q_rptr[0] = (erisc_resp_q_rptr[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - 
write_device_memory(erisc_resp_q_rptr.data(), erisc_resp_q_rptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_resp_q_rptr.data(), + erisc_resp_q_rptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); log_assert(erisc_resp_flags[0] == resp_flags, "Unexpected ERISC Response Flags."); offset += block_size; } - } void tt_SiliconDevice::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { - if(flush_non_mmio_per_chip[chip_id]) { + if (flush_non_mmio_per_chip[chip_id]) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; auto chips_with_mmio = this->get_target_mmio_device_ids(); if (chips_with_mmio.find(chip_id) == chips_with_mmio.end()) { - log_debug(LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); + log_debug( + LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); return; } if (arch_name == tt::ARCH::WORMHOLE_B0) { std::vector erisc_txn_counters = std::vector(2); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); - //wait for all queues to be empty. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all queues to be empty. 
+ for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_q_ptrs.data(), cxy, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + cxy, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); } while (erisc_q_ptrs[0] != erisc_q_ptrs[4]); } - //wait for all write responses to come back. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all write responses to come back. + for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); + read_device_memory( + erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); } while (erisc_txn_counters[0] != erisc_txn_counters[1]); } } @@ -2138,7 +2635,6 @@ void tt_SiliconDevice::wait_for_connected_non_mmio_flush(const chip_id_t chip_id } } - void tt_SiliconDevice::wait_for_non_mmio_flush(const chip_id_t chip_id) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; @@ -2159,39 +2655,48 @@ void tt_SiliconDevice::wait_for_non_mmio_flush() { } // Broadcast Functions -void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull(std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude) { +void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude) { // If row 0 is not explicitly excluded, exclude it here since its non-tensix rows_to_exclude.insert(0); // If row 11 is excluded, we can close the SOC grid. 
If not, exclude row 12 to close grid. - if(rows_to_exclude.find(11) == rows_to_exclude.end()) { + if (rows_to_exclude.find(11) == rows_to_exclude.end()) { rows_to_exclude.insert(12); } // If col 0 is not explicitly excluded, exclude it here since its non-tensix cols_to_exclude.insert(0); // If col 12 is excluded, we can close the SOC grid. If not, exclude col 13 to close grid. - if(cols_to_exclude.find(12) == cols_to_exclude.end()) { + if (cols_to_exclude.find(12) == cols_to_exclude.end()) { cols_to_exclude.insert(13); } std::vector> bb_x_coords = {}; std::vector> bb_y_coords = {}; // Generate starting and ending x coordinates of each bounding box/grid - for(auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { - if(x_it == std::prev(cols_to_exclude.end(), 1)) continue; - if(cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { + for (auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { + if (x_it == std::prev(cols_to_exclude.end(), 1)) { + continue; + } + if (cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and + cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { bb_x_coords.push_back({*(x_it) + 1, *(std::next(x_it, 1)) - 1}); } } - for(auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { - if(y_it == std::prev(rows_to_exclude.end(), 1)) continue; - if(rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { + for (auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { + if (y_it == std::prev(rows_to_exclude.end(), 1)) { + continue; + } + if (rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and + rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { bb_y_coords.push_back({*(y_it) + 1, *(std::next(y_it, 1)) - 1}); } } // Assemble x and y 
coordinates into bounding box vertices - for(const auto& x_pair : bb_x_coords) { - for(const auto& y_pair : bb_y_coords) { + for (const auto& x_pair : bb_x_coords) { + for (const auto& y_pair : bb_y_coords) { tt_xy_pair top_left = tt_xy_pair(x_pair.first, y_pair.first); tt_xy_pair bot_right = tt_xy_pair(x_pair.second, y_pair.second); broadcast_grids.insert({top_left, bot_right}); @@ -2199,81 +2704,94 @@ void tt_SiliconDevice::generate_tensix_broadcast_grids_for_grayskull(std::set>>& tt_SiliconDevice::get_ethernet_broadcast_headers(const std::set& chips_to_exclude) { +std::unordered_map>>& tt_SiliconDevice::get_ethernet_broadcast_headers( + const std::set& chips_to_exclude) { // Generate headers for Ethernet Broadcast (WH) only. Each header corresponds to a unique broadcast "grid". - if(bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { + if (bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { bcast_header_cache[chips_to_exclude] = {}; - std::unordered_map>> broadcast_mask_for_target_chips_per_group = {}; + std::unordered_map>> + broadcast_mask_for_target_chips_per_group = {}; std::map, std::tuple>> broadcast_header_union_per_group = {}; chip_id_t first_mmio_chip = *(get_target_mmio_device_ids().begin()); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) == chips_to_exclude.end()) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) == chips_to_exclude.end()) { // Get shelf local physical chip id included in broadcast - chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip); - eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip); + chip_id_t physical_chip_id = ndesc->get_shelf_local_physical_chip_coords(chip); + eth_coord_t eth_coords = ndesc->get_chip_locations().at(chip); // Rack word to be set in header uint32_t rack_word = std::get<2>(eth_coords) >> 2; // Rack byte to be set in header uint32_t rack_byte = 
std::get<2>(eth_coords) % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip - // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip - // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each + // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific + // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are + // connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; if (std::get<2>(eth_coords) == 0 && std::get<3>(eth_coords) == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart. - closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip); - } - else { - // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are connected. + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its + // own MMIO counterpart. + closest_mmio_chip = ndesc->get_closest_mmio_capable_chip(chip); + } else { + // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are + // connected. 
closest_mmio_chip = first_mmio_chip; } - if(broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == broadcast_mask_for_target_chips_per_group.end()) { + if (broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == + broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves that contain this physical id. - if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { + // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves + // that contain this physical id. + if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. std::vector broadcast_mask(8, 0); broadcast_mask.at(rack_word) |= (1 << std::get<3>(eth_coords)) << rack_byte; broadcast_mask.at(3) |= 1 << physical_chip_id; - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask}); + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .insert({physical_chip_id, broadcast_mask}); - } - else { + } else { // Target was seen before -> include curr rack and shelf in header - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .at(physical_chip_id) + .at(rack_word) |= static_cast(1 << std::get<3>(eth_coords)) << rack_byte; } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. 
The number of groups after this step represent the final set of broadcast grids. - for(auto& mmio_group : broadcast_mask_for_target_chips_per_group) { - for(auto& chip : mmio_group.second) { + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The + // number of groups after this step represent the final set of broadcast grids. + for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { + for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group - std::vector header_hash = {mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; - if(broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { - broadcast_header_union_per_group.insert({header_hash, std::make_tuple(mmio_group.first, chip.second)}); - } - else { + std::vector header_hash = { + mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; + if (broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { + broadcast_header_union_per_group.insert( + {header_hash, std::make_tuple(mmio_group.first, chip.second)}); + } else { // If group found, update chip header entry std::get<1>(broadcast_header_union_per_group.at(header_hash)).at(3) |= chip.second.at(3); } } } // Get all broadcast headers per MMIO group - for(const auto& header : broadcast_header_union_per_group) { + for (const auto& header : broadcast_header_union_per_group) { chip_id_t mmio_chip = std::get<0>(header.second); - if(bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { + if (bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { bcast_header_cache[chips_to_exclude].insert({mmio_chip, {}}); } bcast_header_cache[chips_to_exclude].at(mmio_chip).push_back(std::get<1>(header.second)); } // Invert headers (FW convention) - for(auto& 
bcast_group : bcast_header_cache[chips_to_exclude]) { - for(auto& header : bcast_group.second) { + for (auto& bcast_group : bcast_header_cache[chips_to_exclude]) { + for (auto& header : bcast_group.second) { int header_idx = 0; - for(auto& header_entry : header) { - if(header_idx == 4) break; + for (auto& header_entry : header) { + if (header_idx == 4) { + break; + } header_entry = ~header_entry; header_idx++; } @@ -2283,14 +2801,23 @@ std::unordered_map>>& tt_SiliconDevice:: return bcast_header_cache[chips_to_exclude]; } -void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. - PCIDevice *pci_device = get_pci_device(chip); +void tt_SiliconDevice::pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb) { + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet + // Broadcast for WH. 
+ PCIDevice* pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); - while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast( + tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); pci_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -2300,155 +2827,235 @@ void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, } } -inline bool tensix_or_eth_in_broadcast(const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool tensix_or_eth_in_broadcast( + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool found_tensix_or_eth = false; - for(const auto& col : architecture_implementation->get_t6_x_locations()) { + for (const auto& col : architecture_implementation->get_t6_x_locations()) { found_tensix_or_eth |= (cols_to_exclude.find(col) == cols_to_exclude.end()); } return found_tensix_or_eth; } -inline bool valid_tensix_broadcast_grid(const std::set& rows_to_exclude, const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool valid_tensix_broadcast_grid( + const std::set& rows_to_exclude, + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool t6_bcast_rows_complete = true; bool t6_bcast_rows_empty = true; - - for(const auto& row : 
architecture_implementation->get_t6_y_locations()) { + + for (const auto& row : architecture_implementation->get_t6_y_locations()) { t6_bcast_rows_complete &= (rows_to_exclude.find(row) == rows_to_exclude.end()); t6_bcast_rows_empty &= (rows_to_exclude.find(row) != rows_to_exclude.end()); } return t6_bcast_rows_complete || t6_bcast_rows_empty; } - -void tt_SiliconDevice::ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords) { - if(use_ethernet_broadcast) { +void tt_SiliconDevice::ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords) { + if (use_ethernet_broadcast) { // Broadcast through ERISC core supported - std::unordered_map>>& broadcast_headers = get_ethernet_broadcast_headers(chips_to_exclude); - // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level broadcast headers on future/ + std::unordered_map>>& broadcast_headers = + get_ethernet_broadcast_headers(chips_to_exclude); + // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level + // broadcast headers on future/ std::uint32_t row_exclusion_mask = 0; std::uint32_t col_exclusion_mask = 0; - for(const auto& row : rows_to_exclude) { + for (const auto& row : rows_to_exclude) { row_exclusion_mask |= 1 << row; } - for(const auto& col : cols_to_exclude) { + for (const auto& col : cols_to_exclude) { col_exclusion_mask |= 1 << (16 + col); } // Write broadcast block to device. 
- for(auto& mmio_group : broadcast_headers) { - for(auto& header : mmio_group.second) { - header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks + for (auto& mmio_group : broadcast_headers) { + for (auto& header : mmio_group.second) { + header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks header.at(4) |= row_exclusion_mask; header.at(4) |= col_exclusion_mask; // Write Target: x-y endpoint is a don't care. Initialize to tt_xy_pair(1, 1) - write_to_non_mmio_device(mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); + write_to_non_mmio_device( + mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); } } - } - else { + } else { // Broadcast not supported. Implement this at the software level as a for loop std::vector cores_to_write = {}; - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& core : get_soc_descriptor(chip).cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& core : get_soc_descriptor(chip).cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and + core.second.type != CoreType::HARVESTED) { + write_to_device( + mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); } } } } } -void tt_SiliconDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, 
std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { +void tt_SiliconDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { if (arch_name == tt::ARCH::GRAYSKULL) { // Device FW disables broadcasts to all non tensix cores. std::vector dram_cores_to_write = {}; std::vector dram_rows = {0, 6}; std::vector dram_cols = {1, 4, 7, 10}; - for(const auto& row : dram_rows) { - for(const auto& col : dram_cols) { - if(rows_to_exclude.find(row) == rows_to_exclude.end() and cols_to_exclude.find(col) == cols_to_exclude.end()) { + for (const auto& row : dram_rows) { + for (const auto& col : dram_cols) { + if (rows_to_exclude.find(row) == rows_to_exclude.end() and + cols_to_exclude.find(col) == cols_to_exclude.end()) { dram_cores_to_write.push_back(tt_xy_pair(col, row)); } } } - + std::set> broadcast_grids = {}; generate_tensix_broadcast_grids_for_grayskull(broadcast_grids, rows_to_exclude, cols_to_exclude); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& dram : dram_cores_to_write) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& dram : dram_cores_to_write) { write_device_memory(mem_ptr, size_in_bytes, tt_cxy_pair(chip, dram), address, fallback_tlb); } - for(const auto& grid : broadcast_grids) { + for (const auto& grid : broadcast_grids) { pcie_broadcast_write(chip, mem_ptr, size_in_bytes, address, grid.first, grid.second, fallback_tlb); } - } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } + } else if (arch_name == tt::ARCH::BLACKHOLE) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == 
cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { // When broadcast includes column zero do not exclude anything std::set unsafe_rows = {}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(9); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(9) == cols_to_exclude.end()) { + if (cols_to_exclude.find(9) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_9_bcast = cols_to_exclude; cols_to_exclude_for_col_9_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_9_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_9_bcast, + fallback_tlb, + false); } + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, 
cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } - else { + } else { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { - // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, since writing to these is unsafe - // ERISC FW does not exclude these. + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, + // since writing to these is unsafe ERISC FW does not exclude these. 
std::set unsafe_rows = {2, 3, 4, 8, 9, 10}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(5); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(5) == cols_to_exclude.end()) { + if (cols_to_exclude.find(5) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_5_bcast = cols_to_exclude; cols_to_exclude_for_col_5_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_5_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_5_bcast, + fallback_tlb, + false); } - } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } -} - -int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + 
ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); + } + } +} + +int tt_SiliconDevice::remote_arc_msg( + int chip, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { constexpr uint64_t ARC_RESET_SCRATCH_ADDR = 0x880030060; constexpr uint64_t ARC_RESET_MISC_CNTL_ADDR = 0x880030100; @@ -2457,18 +3064,14 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert (arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - { - write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); - } + { write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); } - { - write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); - } + { write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); } wait_for_non_mmio_flush(); uint32_t misc = 0; @@ -2490,7 +3093,11 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ if (std::chrono::system_clock::now() - start > timeout_seconds) { std::stringstream ss; ss << std::hex << msg_code; - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", timeout, chip, ss.str())); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", + 
timeout, + chip, + ss.str())); } uint32_t status = 0; @@ -2516,66 +3123,96 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ return exit_code; } -void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { +void tt_SiliconDevice::write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { write_buffer(mem_ptr, size, addr, channel, src_device_id); } -void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { +void tt_SiliconDevice::read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { read_buffer(mem_ptr, addr, channel, size, src_device_id); } -void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { - tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered +void tt_SiliconDevice::set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { + tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered std::unordered_set cores_synced = {}; std::vector barrier_val_vec = {barrier_value}; for (const auto& core : cores) { - write_to_device(barrier_val_vec.data(), barrier_val_vec.size() * sizeof(uint32_t), tt_cxy_pair(chip, core), barrier_addr, fallback_tlb); - } - tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed + write_to_device( + barrier_val_vec.data(), + barrier_val_vec.size() * sizeof(uint32_t), + tt_cxy_pair(chip, core), + barrier_addr, + fallback_tlb); + } + tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are 
flushed while (cores_synced.size() != cores.size()) { - for(const auto& core : cores) { + for (const auto& core : cores) { if (cores_synced.find(core) == cores_synced.end()) { uint32_t readback_val; - read_from_device(&readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); + read_from_device( + &readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); if (readback_val == barrier_value) { cores_synced.insert(core); - } - else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + } else { + log_trace( + LogSiliconDriver, + "Waiting for core {} to recieve mem bar flag {} in function", + core.str(), + barrier_value); } } } } // Ensure that reads or writes after this do not get reordered. // Reordering can cause races where data gets transferred before the barrier has returned - tt_driver_atomics::mfence(); + tt_driver_atomics::mfence(); } -void tt_SiliconDevice::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { +void tt_SiliconDevice::insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); + const scoped_lock lock( + *get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } void tt_SiliconDevice::init_membars() { - for(const auto& chip : target_devices_in_cluster) { - if (ndesc -> is_chip_mmio_capable(chip)) { - set_membar_flag(chip, workers_per_chip.at(chip), 
tt_MemBarFlag::RESET, l1_address_params.tensix_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); + for (const auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { + set_membar_flag( + chip, + workers_per_chip.at(chip), + tt_MemBarFlag::RESET, + l1_address_params.tensix_l1_barrier_base, + "LARGE_WRITE_TLB"); + set_membar_flag( + chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); + set_membar_flag( + chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); } } } -void tt_SiliconDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { + +void tt_SiliconDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { const auto& all_workers = workers_per_chip.at(chip); const auto& all_eth = eth_cores; if (cores.size()) { // Insert barrier on specific cores with L1 std::unordered_set workers_to_sync = {}; std::unordered_set eth_to_sync = {}; - + for (const auto& core : cores) { if (all_workers.find(core) != all_workers.end()) { workers_to_sync.insert(core); @@ -2585,59 +3222,60 @@ void tt_SiliconDevice::l1_membar(const chip_id_t chip, const std::string& fallba log_fatal("Can only insert an L1 Memory barrier on Tensix or Ethernet cores."); } } - insert_host_to_device_barrier(chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); + insert_host_to_device_barrier( + chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, eth_to_sync, l1_address_params.eth_l1_barrier_base, 
fallback_tlb); } else { // Insert barrier on all cores with L1 insert_host_to_device_barrier(chip, all_workers, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, all_eth, l1_address_params.eth_l1_barrier_base, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void tt_SiliconDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { if (cores.size()) { - for(const auto& core : cores) { - log_assert(dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); + for (const auto& core : cores) { + log_assert( + dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); } insert_host_to_device_barrier(chip, cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void tt_SiliconDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + if (ndesc->is_chip_mmio_capable(chip)) { if (channels.size()) { std::unordered_set dram_cores_to_sync = {}; - for(const auto& chan : channels) { + for (const auto& chan : channels) { dram_cores_to_sync.insert(get_soc_descriptor(chip).get_core_for_dram_channel(chan, 0)); } - insert_host_to_device_barrier(chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - 
} - else { + insert_host_to_device_barrier( + chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { +void tt_SiliconDevice::write_to_device( + const void* mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); + if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { write_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { @@ -2645,100 +3283,119 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx } } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); } } -void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void tt_SiliconDevice::read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = 
dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. std::memcpy(mem_ptr, aligned_buf.local_storage, size); } } - -void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void tt_SiliconDevice::write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. 
auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } -void tt_SiliconDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); +void tt_SiliconDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { read_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { read_device_memory(mem_ptr, core, addr, size, fallback_tlb); } - } - else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet reads from a single chip cluster!"); + } else { + log_assert( + arch_name != tt::ARCH::BLACKHOLE, + "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet reads from a single chip cluster!"); read_from_non_mmio_device(mem_ptr, core, addr, size); } } -int tt_SiliconDevice::arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { +int tt_SiliconDevice::arc_msg( + int logical_device_id, + 
uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "ARC messages not supported in Blackhole"); - if(ndesc -> is_chip_mmio_capable(logical_device_id)) { + if (ndesc->is_chip_mmio_capable(logical_device_id)) { return pcie_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); - } - else { + } else { return remote_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); } } -void tt_SiliconDevice::send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void tt_SiliconDevice::send_tensix_risc_reset_to_core( + const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0, "REG_TLB"); tt_driver_atomics::sfence(); } -void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core( + const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_non_mmio_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0); tt_driver_atomics::sfence(); } -int tt_SiliconDevice::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { +int tt_SiliconDevice::set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - return remote_arc_msg(chip, 
get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); + return remote_arc_msg( + chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); } - void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); } int msg_rt = remote_arc_msg(chip, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success, NULL); if (msg_rt == MSG_ERROR_REPLY) { @@ -2747,16 +3404,14 @@ void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int t } } - -void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { - if(arch_name == tt::ARCH::GRAYSKULL) { - for (auto &device_it : m_pci_device_map) { +void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets) { + if (arch_name == tt::ARCH::GRAYSKULL) { + for (auto& device_it : m_pci_device_map) { broadcast_pcie_tensix_risc_reset(device_it.first, soft_resets); } - } - else { + } else { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; std::set chips_to_exclude = {}; std::set rows_to_exclude; std::set columns_to_exclude; @@ -2768,7 +3423,14 @@ void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftRe columns_to_exclude = {0, 5}; } std::string fallback_tlb = "LARGE_WRITE_TLB"; - broadcast_write_to_cluster(&valid_val, sizeof(uint32_t), 0xFFB121B0, 
chips_to_exclude, rows_to_exclude, columns_to_exclude, fallback_tlb); + broadcast_write_to_cluster( + &valid_val, + sizeof(uint32_t), + 0xFFB121B0, + chips_to_exclude, + rows_to_exclude, + columns_to_exclude, + fallback_tlb); // Ensure that reset signal is globally visible wait_for_non_mmio_flush(); } @@ -2777,22 +3439,23 @@ void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftRe void tt_SiliconDevice::set_power_state(tt_DevicePowerState device_state) { // MT Initial BH - ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { - for(auto& chip : target_devices_in_cluster) { - if(ndesc -> is_chip_mmio_capable(chip)) { + for (auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { set_pcie_power_state(device_state); } else { int exit_code = set_remote_power_state(chip, device_state); - log_assert(exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); + log_assert( + exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); } } } } void tt_SiliconDevice::enable_ethernet_queue(int timeout) { - for (const chip_id_t &chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { auto arch = get_soc_descriptor(chip).arch; - switch (arch) { + switch (arch) { case tt::ARCH::WORMHOLE_B0: { if (ndesc->is_chip_mmio_capable(chip)) { enable_local_ethernet_queue(chip, timeout); @@ -2801,20 +3464,17 @@ void tt_SiliconDevice::enable_ethernet_queue(int timeout) { } break; - case tt::ARCH::BLACKHOLE: - log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); + case tt::ARCH::BLACKHOLE: + log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); } default: { break; } } - } } -std::set tt_SiliconDevice::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set tt_SiliconDevice::get_target_remote_device_ids() { return target_remote_chips; } 
void tt_SiliconDevice::deassert_resets_and_set_power_state() { // Assert tensix resets on all chips in cluster @@ -2823,15 +3483,29 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() { // MT Initial BH - ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets - for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); - } - if(ndesc != nullptr) { - for(const chip_id_t& chip : target_devices_in_cluster) { - if(!ndesc -> is_chip_mmio_capable(chip)) { + for (auto& device_it : m_pci_device_map) { + arc_msg( + device_it.first, + 0xaa00 | + device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0, + 0); + } + if (ndesc != nullptr) { + for (const chip_id_t& chip : target_devices_in_cluster) { + if (!ndesc->is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); auto pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + remote_arc_msg( + chip, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0x0, + 0x0, + 1, + NULL, + NULL); } } enable_ethernet_queue(30); @@ -2842,11 +3516,16 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() { } void tt_SiliconDevice::verify_eth_fw() { - for(const auto& chip : target_devices_in_cluster) { + for (const auto& chip : target_devices_in_cluster) { uint32_t fw_version; std::vector fw_versions; - for (const tt_xy_pair ð_core : get_soc_descriptor(chip).ethernet_cores) { - read_from_device(&fw_version, tt_cxy_pair(chip, eth_core), l1_address_params.fw_version_addr, sizeof(uint32_t), "LARGE_READ_TLB"); + for 
(const tt_xy_pair& eth_core : get_soc_descriptor(chip).ethernet_cores) { + read_from_device( + &fw_version, + tt_cxy_pair(chip, eth_core), + l1_address_params.fw_version_addr, + sizeof(uint32_t), + "LARGE_READ_TLB"); fw_versions.push_back(fw_version); } verify_sw_fw_versions(chip, SW_VERSION, fw_versions); @@ -2854,7 +3533,8 @@ void tt_SiliconDevice::verify_eth_fw() { } } -void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) { +void tt_SiliconDevice::verify_sw_fw_versions( + int device_id, std::uint32_t sw_version, std::vector& fw_versions) { tt_version sw(sw_version), fw_first_eth_core(fw_versions.at(0)); log_info( LogSiliconDriver, @@ -2862,7 +3542,7 @@ void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_ver sw.str(), fw_first_eth_core.str(), device_id); - for (std::uint32_t &fw_version : fw_versions) { + for (std::uint32_t& fw_version : fw_versions) { tt_version fw(fw_version); log_assert(fw == fw_first_eth_core, "FW versions are not the same across different ethernet cores"); log_assert(sw.major == fw.major, "SW/FW major version number out of sync"); @@ -2875,14 +3555,16 @@ void tt_SiliconDevice::verify_sw_fw_versions(int device_id, std::uint32_t sw_ver use_ethernet_broadcast &= fw_first_eth_core >= tt_version(6, 5, 0); // Virtual coordinates can be used for broadcast headers if ERISC FW >= 6.8.0 and NOC translation is enabled // Temporarily enable this feature for 6.7.241 as well for testing. 
- use_virtual_coords_for_eth_broadcast &= (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && translation_tables_en; + use_virtual_coords_for_eth_broadcast &= + (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && + translation_tables_en; } -void tt_SiliconDevice::start_device(const tt_device_params &device_params) { - if(device_params.init_device) { +void tt_SiliconDevice::start_device(const tt_device_params& device_params) { + if (device_params.init_device) { initialize_pcie_devices(); // MT Initial BH - Ethernet firmware not present in Blackhole - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { verify_eth_fw(); } deassert_resets_and_set_power_state(); @@ -2894,7 +3576,6 @@ void tt_SiliconDevice::close_device() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } - void tt_SiliconDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; } @@ -2911,25 +3592,30 @@ void tt_SiliconDevice::set_driver_eth_interface_params(const tt_driver_eth_inter eth_interface_params = eth_interface_params_; } -void tt_SiliconDevice::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { +void tt_SiliconDevice::setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { map_core_to_tlb_per_chip[logical_device_id] = mapping_function; tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t tt_SiliconDevice::get_num_dram_channels(std::uint32_t device_id) { - log_assert(target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), "Querying DRAM parameters for a device that does not exist."); + log_assert( + target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), + "Querying DRAM parameters for a device that does not exist."); return 
get_soc_descriptor(device_id).get_num_dram_channels(); } std::uint64_t tt_SiliconDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { log_assert(channel < get_num_dram_channels(device_id), "Querying size for a device channel that does not exist."); - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } std::uint32_t tt_SiliconDevice::get_num_host_channels(std::uint32_t device_id) { auto devices = get_target_mmio_device_ids(); - log_assert(devices.find(device_id) != devices.end(), "Querying Host Address parameters for a non-mmio device or a device does not exist."); - return m_num_host_mem_channels; // Same number of host channels per device for now + log_assert( + devices.find(device_id) != devices.end(), + "Querying Host Address parameters for a non-mmio device or a device does not exist."); + return m_num_host_mem_channels; // Same number of host channels per device for now } std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { @@ -2945,20 +3631,20 @@ std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t devi std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { // TODO: Should probably be lowered to TTDevice. tt::ARCH arch = get_soc_descriptor(chip_id).arch; - if(arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } tt_version tt_SiliconDevice::get_ethernet_fw_version() const { log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); - log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version."); + log_assert( + eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, + "Device must be started before querying Ethernet FW version."); return eth_fw_version; } diff --git a/device/tt_silicon_driver_common.cpp b/device/tt_silicon_driver_common.cpp index a7429b8e..43905200 100644 --- a/device/tt_silicon_driver_common.cpp +++ b/device/tt_silicon_driver_common.cpp @@ -3,37 +3,37 @@ // SPDX-License-Identifier: Apache-2.0 #include "device/tt_silicon_driver_common.hpp" -#include "tt_xy_pair.h" + #include "tt_device.h" +#include "tt_xy_pair.h" std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value) { std::string output; - if((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { output += "BRISC | "; } - if((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { output += "TRISC0 | "; } - if((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { output += "TRISC1 | "; } - if((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { output += "TRISC2 | "; } - if((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) 
{ output += "NCRISC | "; } - if((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { output += "STAGGERED_START | "; } - if(output.empty()) { - output = "UNKNOWN"; - } else { - output.erase(output.end() - 3, output.end()); - } + if (output.empty()) { + output = "UNKNOWN"; + } else { + output.erase(output.end() - 3, output.end()); + } - return output; + return output; } - diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp index 9f275668..6dc6d7f4 100644 --- a/device/tt_silicon_driver_common.hpp +++ b/device/tt_silicon_driver_common.hpp @@ -9,53 +9,42 @@ #include #include -enum class TensixSoftResetOptions: std::uint32_t { +enum class TensixSoftResetOptions : std::uint32_t { NONE = 0, - BRISC = ((std::uint32_t) 1 << 11), - TRISC0 = ((std::uint32_t) 1 << 12), - TRISC1 = ((std::uint32_t) 1 << 13), - TRISC2 = ((std::uint32_t) 1 << 14), - NCRISC = ((std::uint32_t) 1 << 18), - STAGGERED_START = ((std::uint32_t) 1 << 31) + BRISC = ((std::uint32_t)1 << 11), + TRISC0 = ((std::uint32_t)1 << 12), + TRISC1 = ((std::uint32_t)1 << 13), + TRISC2 = ((std::uint32_t)1 << 14), + NCRISC = ((std::uint32_t)1 << 18), + STAGGERED_START = ((std::uint32_t)1 << 31) }; std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value); + constexpr TensixSoftResetOptions operator|(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) | - static_cast(rhs) - ); + return static_cast(static_cast(lhs) | static_cast(rhs)); } constexpr TensixSoftResetOptions operator&(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) & - static_cast(rhs) - ); + return static_cast(static_cast(lhs) & static_cast(rhs)); } constexpr bool operator!=(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return - static_cast(lhs) != - static_cast(rhs); + return 
static_cast(lhs) != static_cast(rhs); } -static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = TensixSoftResetOptions::TRISC0 | - TensixSoftResetOptions::TRISC1 | - TensixSoftResetOptions::TRISC2; +static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = + TensixSoftResetOptions::TRISC0 | TensixSoftResetOptions::TRISC1 | TensixSoftResetOptions::TRISC2; -static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - TensixSoftResetOptions::STAGGERED_START | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | TensixSoftResetOptions::STAGGERED_START | + ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET | - TensixSoftResetOptions::STAGGERED_START; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET | TensixSoftResetOptions::STAGGERED_START; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 9a572420..ccf6dc02 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "yaml-cpp/yaml.h" #include "tt_soc_descriptor.h" 
#include + #include #include #include @@ -13,53 +13,53 @@ #include #include "fmt/core.h" +#include "yaml-cpp/yaml.h" // #include "l1_address_map.h" std::string format_node(tt_xy_pair xy) { return fmt::format("{}-{}", xy.x, xy.y); } tt_xy_pair format_node(std::string str) { - int x_coord; - int y_coord; - std::regex expr("([0-9]+)[-,xX]([0-9]+)"); - std::smatch x_y_pair; - - if (std::regex_search(str, x_y_pair, expr)) { - x_coord = std::stoi(x_y_pair[1]); - y_coord = std::stoi(x_y_pair[2]); - } else { - throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); - } + int x_coord; + int y_coord; + std::regex expr("([0-9]+)[-,xX]([0-9]+)"); + std::smatch x_y_pair; + + if (std::regex_search(str, x_y_pair, expr)) { + x_coord = std::stoi(x_y_pair[1]); + y_coord = std::stoi(x_y_pair[2]); + } else { + throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); + } - tt_xy_pair xy(x_coord, y_coord); + tt_xy_pair xy(x_coord, y_coord); - return xy; + return xy; } -const char* ws = " \t\n\r\f\v"; + +const char *ws = " \t\n\r\f\v"; // trim from end of string (right) -inline std::string& rtrim(std::string& s, const char* t = ws) -{ +inline std::string &rtrim(std::string &s, const char *t = ws) { s.erase(s.find_last_not_of(t) + 1); return s; } // trim from beginning of string (left) -inline std::string& ltrim(std::string& s, const char* t = ws) -{ +inline std::string <rim(std::string &s, const char *t = ws) { s.erase(0, s.find_first_not_of(t)); return s; } // trim from both ends of string (right then left) -inline std::string& trim(std::string& s, const char* t = ws) -{ - return ltrim(rtrim(s, t), t); -} +inline std::string &trim(std::string &s, const char *t = ws) { return ltrim(rtrim(s, t), t); } void tt_SocDescriptor::load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml) { overlay_version = device_descriptor_yaml["features"]["overlay"]["version"].as(); - noc_translation_id_enabled = 
device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() : false; + noc_translation_id_enabled = + device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] + ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() + : false; packer_version = device_descriptor_yaml["features"]["packer"]["version"].as(); unpacker_version = device_descriptor_yaml["features"]["unpacker"]["version"].as(); dst_size_alignment = device_descriptor_yaml["features"]["math"]["dst_size_alignment"].as(); @@ -90,7 +90,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & } int current_dram_channel = 0; - for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); ++channel_it) { + for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); + ++channel_it) { dram_cores.push_back({}); auto &soc_dram_cores = dram_cores.at(dram_cores.size() - 1); const auto &dram_cores = (*channel_it).as>(); @@ -121,8 +122,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & std::vector worker_cores = device_descriptor_yaml["functional_workers"].as>(); std::set worker_routing_coords_x; std::set worker_routing_coords_y; - std::unordered_map routing_coord_worker_x; - std::unordered_map routing_coord_worker_y; + std::unordered_map routing_coord_worker_x; + std::unordered_map routing_coord_worker_y; for (const auto &core_string : worker_cores) { CoreDescriptor core_descriptor; core_descriptor.coord = format_node(core_string); @@ -137,12 +138,12 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & int func_x_start = 0; int func_y_start = 0; std::set::iterator it; - for (it=worker_routing_coords_x.begin(); 
it!=worker_routing_coords_x.end(); ++it) { + for (it = worker_routing_coords_x.begin(); it != worker_routing_coords_x.end(); ++it) { worker_log_to_routing_x[func_x_start] = *it; routing_x_to_worker_x[*it] = func_x_start; func_x_start++; } - for (it=worker_routing_coords_y.begin(); it!=worker_routing_coords_y.end(); ++it) { + for (it = worker_routing_coords_y.begin(); it != worker_routing_coords_y.end(); ++it) { worker_log_to_routing_y[func_y_start] = *it; routing_y_to_worker_y[*it] = func_y_start; func_y_start++; @@ -225,7 +226,8 @@ tt_virtual_coords tt_SocDescriptor::to_virtual_coords(tt_translated_coords trans tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask) { std::ifstream fdesc(device_descriptor_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); + throw std::runtime_error( + fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); } fdesc.close(); @@ -233,10 +235,12 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size auto grid_size_x = device_descriptor_yaml["grid"]["x_size"].as(); auto grid_size_y = device_descriptor_yaml["grid"]["y_size"].as(); - int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] ? - device_descriptor_yaml["physical"]["x_size"].as() : grid_size_x; - int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] ? - device_descriptor_yaml["physical"]["y_size"].as() : grid_size_y; + int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] + ? device_descriptor_yaml["physical"]["x_size"].as() + : grid_size_x; + int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] + ? 
device_descriptor_yaml["physical"]["y_size"].as() + : grid_size_y; load_core_descriptors_from_device_descriptor(device_descriptor_yaml); grid_size = tt_xy_pair(grid_size_x, grid_size_y); physical_grid_size = tt_xy_pair(physical_grid_size_x, physical_grid_size_y); @@ -251,7 +255,7 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size int tt_SocDescriptor::get_num_dram_channels() const { int num_channels = 0; - for (auto& dram_core : dram_cores) { + for (auto &dram_core : dram_cores) { if (dram_core.size() > 0) { num_channels++; } @@ -281,7 +285,7 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { } else if (arch_name == tt::ARCH::WORMHOLE_B0) { out << "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { - out << "blackhole"; //Just how many ARCH-to-string functions do we plan to have, anyway? + out << "blackhole"; // Just how many ARCH-to-string functions do we plan to have, anyway? } else { out << "ArchNameSerializationNotImplemented"; } diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 372d0a29..de1511bf 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -7,29 +7,25 @@ #pragma once #include -#include +#include +#include #include +#include #include #include -#include -#include -#include - -#include "tt_xy_pair.h" -#include "device/tt_arch_types.h" - #include "device/coordinate_manager.h" - +#include "device/tt_arch_types.h" #include "fmt/core.h" +#include "tt_xy_pair.h" namespace YAML { - class Node; +class Node; } std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); -static inline std::string get_arch_str(const tt::ARCH arch_name){ +static inline std::string get_arch_str(const tt::ARCH arch_name) { std::string arch_name_str; if (arch_name == tt::ARCH::GRAYSKULL) { @@ -45,16 +41,18 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){ return arch_name_str; } -static inline tt::ARCH get_arch_name(const std::string 
&arch_str){ +static inline tt::ARCH get_arch_name(const std::string &arch_str) { tt::ARCH arch; if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { arch = tt::ARCH::GRAYSKULL; - } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ + } else if ( + (arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || + (arch_str == "WORMHOLE_B0")) { arch = tt::ARCH::WORMHOLE_B0; - } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")){ + } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) { arch = tt::ARCH::BLACKHOLE; - }else { + } else { throw std::runtime_error( fmt::format("At LoadSocDescriptorFromYaml: \"{}\" is not recognized as tt::ARCH.", arch_str)); } @@ -69,13 +67,13 @@ tt_xy_pair format_node(std::string str); //! SocCore type enumerations /*! Superset for all chip generations */ enum class CoreType { - ARC, - DRAM, - ETH, - PCIE, - WORKER, - HARVESTED, - ROUTER_ONLY, + ARC, + DRAM, + ETH, + PCIE, + WORKER, + HARVESTED, + ROUTER_ONLY, }; @@ -84,10 +82,10 @@ enum class CoreType { Should only contain relevant configuration for SOC */ struct CoreDescriptor { - tt_xy_pair coord = tt_xy_pair(0, 0); - CoreType type; + tt_xy_pair coord = tt_xy_pair(0, 0); + CoreType type; - std::size_t l1_size = 0; + std::size_t l1_size = 0; }; //! tt_SocDescriptor contains information regarding the SOC configuration targetted. 
@@ -95,7 +93,6 @@ struct CoreDescriptor { Should only contain relevant configuration for SOC */ class tt_SocDescriptor { - public: tt::ARCH arch; tt_xy_pair grid_size; @@ -110,13 +107,15 @@ class tt_SocDescriptor { std::unordered_map worker_log_to_routing_y; std::unordered_map routing_x_to_worker_x; std::unordered_map routing_y_to_worker_y; - std::vector> dram_cores; // per channel list of dram cores + std::vector> dram_cores; // per channel list of dram cores std::unordered_map> dram_core_channel_map; // map dram core to chan/subchan - std::vector ethernet_cores; // ethernet cores (index == channel id) - std::unordered_map ethernet_core_channel_map; + std::vector ethernet_cores; // ethernet cores (index == channel id) + std::unordered_map ethernet_core_channel_map; std::vector trisc_sizes; // Most of software stack assumes same trisc size for whole chip.. std::string device_descriptor_file_path = std::string(""); + bool has(tt_xy_pair input) { return cores.find(input) != cores.end(); } + int overlay_version; int unpacker_version; int dst_size_alignment; @@ -129,15 +128,15 @@ class tt_SocDescriptor { int get_num_dram_channels() const; bool is_worker_core(const tt_xy_pair &core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - bool is_ethernet_core(const tt_xy_pair& core) const; + bool is_ethernet_core(const tt_xy_pair &core) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; - // Constructor used to build object from device descriptor file. + // Constructor used to build object from device descriptor file. 
tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask = 0); // Copy constructor - tt_SocDescriptor(const tt_SocDescriptor& other) : + tt_SocDescriptor(const tt_SocDescriptor &other) : arch(other.arch), grid_size(other.grid_size), physical_grid_size(other.physical_grid_size), @@ -167,7 +166,7 @@ class tt_SocDescriptor { dram_bank_size(other.dram_bank_size) { coordinate_manager.reset(new CoordinateManager(*other.coordinate_manager)); } - + // Coordinate conversions. // Conversions from logical coordinates should be used just for worker cores. diff --git a/device/tt_xy_pair.h b/device/tt_xy_pair.h index 052b6130..74b54a23 100644 --- a/device/tt_xy_pair.h +++ b/device/tt_xy_pair.h @@ -15,44 +15,56 @@ using tt_cxy_pair = tt::umd::cxy_pair; struct tt_physical_coords : public tt_xy_pair { tt_physical_coords() : tt_xy_pair() {} + tt_physical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_physical_coords : public tt_cxy_pair { tt_chip_physical_coords() : tt_cxy_pair() {} + tt_chip_physical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_physical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_logical_coords : public tt_xy_pair { tt_logical_coords() : tt_xy_pair() {} + tt_logical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_logical_coords : public tt_cxy_pair { tt_chip_logical_coords() : tt_cxy_pair() {} + tt_chip_logical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_logical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_virtual_coords : public tt_xy_pair { tt_virtual_coords() : tt_xy_pair() {} + tt_virtual_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_virtual_coords : public tt_cxy_pair { tt_chip_virtual_coords() : tt_cxy_pair() {} + tt_chip_virtual_coords(std::size_t ichip, xy_pair pair) : 
tt_cxy_pair(ichip, pair) {} + tt_chip_virtual_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_translated_coords : public tt_xy_pair { tt_translated_coords() : tt_xy_pair() {} + tt_translated_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_translated_coords : public tt_cxy_pair { tt_chip_translated_coords() : tt_cxy_pair() {} + tt_chip_translated_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_translated_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; \ No newline at end of file diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp index eccc0a70..a7b60368 100644 --- a/device/wormhole/wormhole_coordinate_manager.cpp +++ b/device/wormhole/wormhole_coordinate_manager.cpp @@ -14,9 +14,11 @@ std::set WormholeCoordinateManager::get_y_coordinates_to_harvest(st } tt_translated_coords WormholeCoordinateManager::to_translated_coords(tt_logical_coords logical_coords) { - return tt_translated_coords(logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); + return tt_translated_coords( + logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); } tt_logical_coords WormholeCoordinateManager::to_logical_coords(tt_translated_coords translated_coords) { - return tt_logical_coords(translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); + return tt_logical_coords( + translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); } \ No newline at end of file diff --git a/device/wormhole/wormhole_coordinate_manager.h b/device/wormhole/wormhole_coordinate_manager.h index 9eca9fd1..e3e35886 100644 --- a/device/wormhole/wormhole_coordinate_manager.h +++ b/device/wormhole/wormhole_coordinate_manager.h @@ 
-9,16 +9,16 @@ #include "device/coordinate_manager.h" class WormholeCoordinateManager : public CoordinateManager { - public: - WormholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + WormholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask) override; private: diff --git a/device/wormhole/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp index 0dc9a205..25a304a3 100644 --- a/device/wormhole/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -4,9 +4,8 @@ #include "wormhole_implementation.h" -#include "src/firmware/riscv/wormhole/host_mem_address_map.h" - #include "device/tt_device.h" +#include "src/firmware/riscv/wormhole/host_mem_address_map.h" namespace tt::umd { @@ -94,7 +93,9 @@ std::pair wormhole_implementation::get_tlb_data( } tt_driver_host_address_params wormhole_implementation::get_host_address_params() const { - return {::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } } // namespace tt::umd diff --git a/device/wormhole/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h index f6214d06..e16205f0 100644 --- a/device/wormhole/wormhole_implementation.h +++ b/device/wormhole/wormhole_implementation.h @@ -167,7 +167,8 @@ static constexpr uint32_t TLB_BASE_INDEX_16M 
= TLB_BASE_INDEX_2M + TLB_COUNT_2M; static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -205,59 +206,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace wormhole class wormhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(wormhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(wormhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(wormhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(wormhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(wormhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(wormhole::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return wormhole::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const 
override { return wormhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return wormhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return wormhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return wormhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return wormhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return wormhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return wormhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return wormhole::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return wormhole::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return wormhole::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return wormhole::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return wormhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return wormhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return wormhole::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return wormhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return wormhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return wormhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return wormhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return 
wormhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return wormhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return wormhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -266,7 +301,6 @@ class wormhole_implementation : public architecture_implementation { std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; tt_driver_host_address_params get_host_address_params() const override; - }; } // namespace tt::umd diff --git a/device/xy_pair.h b/device/xy_pair.h index 92d7c639..84884590 100644 --- a/device/xy_pair.h +++ b/device/xy_pair.h @@ -14,6 +14,7 @@ namespace tt::umd { struct xy_pair { constexpr xy_pair() : x{}, y{} {} + constexpr xy_pair(std::size_t x, std::size_t y) : x(x), y(y) {} std::size_t x; @@ -32,14 +33,14 @@ constexpr inline bool operator<(const xy_pair &left, const xy_pair &right) { struct cxy_pair : public xy_pair { cxy_pair() : xy_pair{}, chip{} {} + cxy_pair(std::size_t ichip, xy_pair pair) : xy_pair(pair.x, pair.y), chip(ichip) {} + cxy_pair(std::size_t ichip, std::size_t x, std::size_t y) : xy_pair(x, y), chip(ichip) {} std::size_t chip; - std::string str() const { - return fmt::format("(chip={},x={},y={})", chip, x, y); - } + std::string str() const { return fmt::format("(chip={},x={},y={})", chip, x, y); } }; constexpr inline bool operator==(const cxy_pair &a, const cxy_pair &b) {