diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index cecc88db..2373b162 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -1,20 +1,19 @@ set(UMD_DEVICE_SRCS architecture_implementation.cpp - blackhole_implementation.cpp cpuset_lib.cpp - grayskull_implementation.cpp tlb.cpp tt_cluster_descriptor.cpp tt_device.cpp - tt_emulation_stub.cpp tt_silicon_driver.cpp tt_silicon_driver_common.cpp tt_soc_descriptor.cpp - tt_versim_stub.cpp - wormhole_implementation.cpp simulation/tt_simulation_device.cpp simulation/tt_simulation_host.cpp + blackhole/blackhole_implementation.cpp + grayskull/grayskull_implementation.cpp + wormhole/wormhole_implementation.cpp + pci_device.cpp ) add_library(umd_device SHARED ${UMD_DEVICE_SRCS}) target_link_libraries(umd_device diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 96117d96..d55d3e29 100644 --- a/device/architecture_implementation.cpp +++ b/device/architecture_implementation.cpp @@ -4,9 +4,9 @@ #include "device/architecture_implementation.h" -#include "device/blackhole_implementation.h" -#include "device/grayskull_implementation.h" -#include "device/wormhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" +#include "device/grayskull/grayskull_implementation.h" +#include "device/wormhole/wormhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp similarity index 98% rename from device/blackhole_implementation.cpp rename to device/blackhole/blackhole_implementation.cpp index 4c36838c..eda2f140 100644 --- a/device/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/blackhole_implementation.h" +#include "blackhole_implementation.h" namespace tt::umd { diff --git a/device/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h similarity 
index 100% rename from device/blackhole_implementation.h rename to device/blackhole/blackhole_implementation.h diff --git a/device/blackhole/impl_device.hpp b/device/blackhole/impl_device.hpp deleted file mode 100644 index afb4091c..00000000 --- a/device/blackhole/impl_device.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 803ee8eb..123b5fd0 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -7,39 +7,13 @@ #include "cpuset_lib.hpp" #include "common/logger.hpp" #include -#include "device/device_api.h" +#include "device/tt_device.h" #include namespace tt { namespace fs = std::filesystem; namespace cpuset { -// Unrelated to hwloc binding of threads, instead to query cpu affinity to find reasonable number of threads to parallelize over. 
-int get_allowed_num_threads(){ - unsigned int num_pus_in_system = sysconf(_SC_NPROCESSORS_ONLN); - unsigned int num_threads = num_pus_in_system; - - cpu_set_t mask; - if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - log_warning(LogSiliconDriver, "Could not detect current process cpu id affinity for calculating num_threads, will use default num_threads: {}.", num_threads); - } else{ - unsigned int visible_pu_count = CPU_COUNT(&mask); - if (visible_pu_count < num_pus_in_system){ - num_threads = visible_pu_count; - } - log_trace(LogSiliconDriver, "Detected (allowed) visible_pu_count: {}, setting num_threads: {}", visible_pu_count, num_threads); - } - - char const* override_thread_count = std::getenv("TT_BACKEND_COMPILE_THREADS"); - if (override_thread_count != nullptr && std::atoi(override_thread_count) > 0){ - num_threads = std::atoi(override_thread_count); - log_trace(LogSiliconDriver, "Overriding via env-var to num_threads: {}", num_threads); - } - - return num_threads; -} - - ///////////////////////////////////////////////////////////////////////// // Initialization Functions ///////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// @@ -49,7 +23,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_pid = getpid(); m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; - m_skip_singlify = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SKIP_SINGLIFY") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? 
true : false; @@ -72,7 +45,6 @@ tt_cpuset_allocator::tt_cpuset_allocator() { if (is_cpu_supported){ m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - m_enable_cpuset_allocator &= init_populate_physical_mmio_device_id_map(); }else{ m_enable_cpuset_allocator = false; } @@ -351,206 +323,10 @@ bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ } -// Step 6 - Populate map of logical to physical mmio device map. -bool tt_cpuset_allocator::init_populate_physical_mmio_device_id_map(){ - - if (!m_enable_cpuset_allocator){ - return false; - } - - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::populate_physical_mmio_device_id_map()"); - - // Get map of logical to physical device ids - FIXME: This is not accurate for some WHB0 clusters. - std::vector available_device_ids = tt_SiliconDevice::detect_available_device_ids(); - m_logical_to_physical_mmio_device_id_map = tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(available_device_ids); - - for (auto &d: m_logical_to_physical_mmio_device_id_map){ - auto logical_device_id = d.first; - auto physical_device_id = d.second; - log_debug(LogSiliconDriver, "populate_physical_mmio_device_id_map() -- available_devices: {} logical_device_id: {} => physical_device_id: {}", available_device_ids.size(), (int) logical_device_id, (int) physical_device_id); - m_num_threads_pinned_per_tt_device.insert({physical_device_id, 0}); - } - - return true; // Success -} - - ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Idea - Something to compare cpuset from Slurm to cpuset picked by this function. -hwloc_cpuset_t tt_cpuset_allocator::allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify){ - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. 
- const std::lock_guard lock(allocate_cpu_id_mutex); - - int num_alloc_slots_for_tt_device = m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); - int tt_device_alloc_idx = m_num_threads_pinned_per_tt_device.at(physical_device_id) % num_alloc_slots_for_tt_device; - - // Check if 2CCX-PER-CCD Optimization can be enabled. For AMD EPYC models : There is 1 L3Cache per CCX and 2 CCX per CCD. - // Better perf to first allocate to unique CCD's if we have enough per device. Expand to other CPU types? - bool enable_special_case = true; - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - auto num_l3_per_ccx = m_package_id_to_num_l3_per_ccx_map.at(package_id); - auto num_ccx_per_ccd = m_package_id_to_num_ccx_per_ccd_map.at(package_id); - - if (enable_special_case && num_l3_per_ccx == 1 && num_ccx_per_ccd == 2 && num_alloc_slots_for_tt_device > num_ccx_per_ccd && m_object_per_alloc_slot == HWLOC_OBJ_L3CACHE){ - int alloc_idx_for_device = m_num_threads_pinned_per_tt_device.at(physical_device_id); - int ccx_in_ccd = (alloc_idx_for_device % num_alloc_slots_for_tt_device) < num_alloc_slots_for_tt_device/num_ccx_per_ccd ? 0 : 1; - tt_device_alloc_idx = (ccx_in_ccd + (alloc_idx_for_device * num_ccx_per_ccd)) % num_alloc_slots_for_tt_device; - log_debug(LogSiliconDriver,"Special L3Cache case physical_device_id: {} alloc_idx_for_device: {} ccx_in_ccd: {} tt_device_alloc_idx: {}", physical_device_id, alloc_idx_for_device, ccx_in_ccd, tt_device_alloc_idx); - } - - - // Get the desired cpuset and prevent migration between different PU's in set by singlifying to single PU. 
- hwloc_cpuset_t cpuset = hwloc_bitmap_dup(m_physical_device_id_to_cpusets_map.at(physical_device_id).at(tt_device_alloc_idx)); - if (!m_skip_singlify && !skip_singlify){ - hwloc_bitmap_singlify(cpuset); - } - - // Debug - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Allocating for physical_device_id: {} num_alloc_slots: {} num_threads_pinned: {} alloc_idx: {} skip_singlify: {} (pid: {} tid: {}) => {} PU's {}", - physical_device_id, num_alloc_slots_for_tt_device, m_num_threads_pinned_per_tt_device.at(physical_device_id), tt_device_alloc_idx, skip_singlify, - m_pid, tid, hwloc_bitmap_weight(cpuset), get_hwloc_bitmap_vector(cpuset)); - - // Increment counter to keep track of number of pinned thread per device, to get unique cpuset per thread. - m_num_threads_pinned_per_tt_device.at(physical_device_id)++; - - return cpuset; -} - -void tt_cpuset_allocator::store_thread_original_cpuset(){ - - auto tid = std::this_thread::get_id(); - hwloc_cpuset_t orig_cpuset = hwloc_bitmap_alloc(); - - if (hwloc_get_cpubind(m_topology, orig_cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"store_thread_original_cpuset() calling hwloc_get_cpubind() failed with errno: {} (pid: {} tid:{})", strerror(errno), m_pid, tid); - }else{ - auto orig_cpuset_vector = get_hwloc_bitmap_vector(orig_cpuset); - log_debug(LogSiliconDriver, "store_thread_original_cpuset() success - got orig cpuset: {} PU's: {} (pid: {} tid: {})", orig_cpuset_vector.size(), orig_cpuset_vector, m_pid, tid); - m_global_thread_id_to_original_cpuset_map.insert({tid, hwloc_bitmap_dup(orig_cpuset)}); - } - hwloc_bitmap_free(orig_cpuset); -} - - - -// Given a logical device_id, determine the right cpu_ids associated with it and pin this thread to them. -void tt_cpuset_allocator::bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t logical_device_id, bool skip_singlify){ - - auto tid = std::this_thread::get_id(); - - // This needed to be protected by not-empty otherwise arithmetic error. 
- if ((!m_global_thread_ids_pinned.empty() && m_global_thread_ids_pinned.count(tid)) || (!m_enable_cpuset_allocator)){ - return; - }else{ - - if (!ndesc->is_chip_mmio_capable(logical_device_id)){ - logical_device_id = ndesc->get_closest_mmio_capable_chip(logical_device_id); - } - - log_debug(LogSiliconDriver,"bind_thread_cpuset_cpuset() for logical_device_id: {} m_logical_to_physical_mmio_device_id_map.size(): {}", logical_device_id, m_logical_to_physical_mmio_device_id_map.size()); - - // If a main thread ID was captured, make sure it is not attempted to be pinned. Only IO API sub threads are expected to be pinned today. - if (m_stored_main_thread_id && tid == m_main_thread_id){ - log_warning(LogSiliconDriver, "bind_thread_cpuset() - Skipping cpubind for runtime main thread_id: {} to prevent undesired inheritence. Consider moving device IO (ie. push/pop/get) to sub-threads for binding to be supported.", m_main_thread_id); - return; - } - - if (m_logical_to_physical_mmio_device_id_map.count(logical_device_id) > 0){ - - auto physical_device_id = m_logical_to_physical_mmio_device_id_map.at(logical_device_id); - auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - - store_thread_original_cpuset(); // Store original cpuset for later unbinding if necessary. - - // Get the cpuset, and attempt to bind thread to it. 
- hwloc_cpuset_t cpuset = allocate_cpu_set_for_thread(physical_device_id, skip_singlify); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT )){; // HWLOC_CPUBIND_NOMEMBIND - log_warning(LogSiliconDriver,"bind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"bind_thread_cpuset() binding success skip: {} for physical_device_id: {} on package_id: {} to {} PU's: {} (pid: {} tid: {})", - skip_singlify, physical_device_id, package_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - // Record that this thread is pinned, no need to repeat on subsequent IO API calls. - m_global_thread_ids_pinned.insert(tid); - m_global_thread_id_to_physical_device_id_map.insert({tid, physical_device_id}); - } - - }else{ - log_warning(LogSiliconDriver,"Could not find logical_device_id: {} in m_logical_to_physical_mmio_device_id_map. This shouldn't happen.", logical_device_id); - } - } -} - - -// Restore thread's original cpubind. Perhaps could be simplified to not require physical_device_id or previous binding, and just always bind to MACHINE cpuset. -void tt_cpuset_allocator::unbind_thread_cpuset(){ - - if (m_enable_cpuset_allocator){ - auto tid = std::this_thread::get_id(); - - // Make sure this thread was successfully and previously binded to a cpuset. - if (!m_global_thread_id_to_original_cpuset_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no original cpuset for this thread found. Previous cpu binding skipped or failed?", tid); - return; - } - - if (!m_global_thread_id_to_physical_device_id_map.count(tid)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() called for tid: {} but no physical_device_id this thread found. 
Previous cpu binding skipped or failed?", tid); - return; - } - - // Handle the case where something goes wrong during original binding above, don't want to error out. - auto cpuset = m_global_thread_id_to_original_cpuset_map.at(tid); - auto physical_device_id = m_global_thread_id_to_physical_device_id_map.at(tid); - auto cpuset_vector = get_hwloc_bitmap_vector(cpuset); // Can tighten this up and remove, it's purely for debug anyways. - - if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD)){ - log_warning(LogSiliconDriver,"unbind_thread_cpuset() binding failed (errno: {}) for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - strerror(errno), physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - }else{ - log_debug(LogSiliconDriver,"unbind_thread_cpuset() binding success for physical_device_id: {} to original {} PU's: {} (pid: {} tid: {})", - physical_device_id, cpuset_vector.size(), cpuset_vector, m_pid, tid); - - // To prevent races on read/modify/write to m_num_threads_pinned_per_tt_device across threads to same device. - const std::lock_guard lock(allocate_cpu_id_mutex); - - // Update book-keeping by removing entry, so this thread can be re-pinned in the future. - m_num_threads_pinned_per_tt_device.at(physical_device_id)--; - m_global_thread_ids_pinned.erase(tid); - m_global_thread_id_to_physical_device_id_map.erase(tid); - } - } -} - -// Teardown/Cleanup for end of process. Don't do anything if feature disabled. Probably don't even need this if process is going to be ended. 
-void tt_cpuset_allocator::clear_state(){ - if (m_enable_cpuset_allocator){ - - auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"Clearing state and unbinding entire process' cpuset (pid: {} tid: {}).", m_pid, tid); - - // Reset state variables so that next time the thread can be freshly pinned - m_global_thread_ids_pinned.clear(); - for (auto &device: m_num_threads_pinned_per_tt_device){ - device.second = 0; - } - - // Undo previous pinning, by binding to full machine cpuset. Alternatively could have saved and restored orig cpuset per thread. - auto machine_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_MACHINE, 0); - if (hwloc_set_cpubind(m_topology, machine_obj->cpuset, HWLOC_CPUBIND_PROCESS)){ - log_warning(LogSiliconDriver,"clear_state() binding failed (errno: {}) to Machine cpuset (pid: {} tid: {})", strerror(errno), m_pid, tid); - } - } -} - - // Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ @@ -580,14 +356,6 @@ bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, return true; // Success } - -// For checking purposes, to make sure main thread is not cpubinded accidentally. -void tt_cpuset_allocator::_set_main_thread_id(){ - m_main_thread_id = std::this_thread::get_id(); - m_stored_main_thread_id = true; - log_debug(LogSiliconDriver,"Captured main_thread_id: {}", m_main_thread_id); -} - int tt_cpuset_allocator::_get_num_tt_pci_devices() { for (auto &d : m_physical_device_id_to_package_id_map) { diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index 65e31eaa..a14a4f33 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -24,8 +24,6 @@ namespace tt { //! 
Utility functions for various backend paramsf namespace cpuset { -int get_allowed_num_threads(); - // CPU ID allocator for pinning threads to cpu_ids // It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { @@ -34,39 +32,12 @@ struct tt_cpuset_allocator { tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; void operator=(tt_cpuset_allocator const&) = delete; - static void bind_thread_to_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify=false){ - auto& instance = tt_cpuset_allocator::get(); - instance.bind_thread_cpuset(ndesc, device_id, skip_singlify); - } - - static void unbind_thread_from_cpuset(){ - auto& instance = tt_cpuset_allocator::get(); - instance.unbind_thread_cpuset(); - } - - static void clear_state_and_cpuset_pins(){ - auto& instance = tt_cpuset_allocator::get(); - instance.clear_state(); - } - // Bind an already allocated memory region to particular numa nodes static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ auto& instance = tt_cpuset_allocator::get(); return instance.bind_area_memory_nodeset(physical_device_id, addr, len); } - // Store process' main thread_id (not required, mainly for checking purposes to ensure no cpubinds on it occur). - static void set_main_thread_id(){ - auto& instance = tt_cpuset_allocator::get(); - instance._set_main_thread_id(); - } - - static int get_num_cpu_cores_allocated_to_device(chip_id_t physical_device_id){ - auto& instance = tt_cpuset_allocator::get(); - auto num_cores = instance.m_enable_cpuset_allocator ? 
instance.m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) : get_allowed_num_threads(); - return num_cores; - } - static int get_num_tt_pci_devices(){ auto& instance = tt_cpuset_allocator::get(); return instance._get_num_tt_pci_devices(); @@ -88,17 +59,10 @@ struct tt_cpuset_allocator { int TENSTORRENT_VENDOR_ID = 0x1e52; - void bind_thread_cpuset(tt_cluster_description *ndesc, chip_id_t device_id, bool skip_singlify); - void unbind_thread_cpuset(); - void store_thread_original_cpuset(); bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - void _set_main_thread_id(); int _get_num_tt_pci_devices(); int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - void clear_state(); - hwloc_cpuset_t allocate_cpu_set_for_thread(chip_id_t physical_device_id, bool skip_singlify); - // Series of init functions, must be called in this order. Seperated out to support // early exit in case of errors. bool init_topology_init_and_load(); @@ -106,7 +70,6 @@ struct tt_cpuset_allocator { bool init_get_number_of_packages(); bool init_is_cpu_model_supported(); bool init_determine_cpuset_allocations(); - bool init_populate_physical_mmio_device_id_map(); // Helper Functions std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); @@ -122,11 +85,8 @@ struct tt_cpuset_allocator { std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); hwloc_topology_t m_topology; bool m_debug; - bool m_skip_singlify; pid_t m_pid; - std::unordered_map m_logical_to_physical_mmio_device_id_map; - // Items calculated by parsing system info, used by allocation algorithm: std::map> m_package_id_to_devices_map; std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info @@ -135,30 +95,16 @@ struct tt_cpuset_allocator { std::map> m_physical_device_id_to_cpusets_map; std::map m_physical_device_id_to_package_id_map; - std::mutex allocate_cpu_id_mutex; - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. 
int m_num_packages = 0; std::vector m_all_tt_devices = {}; hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default - // For 2CCX-PER-CCD Optimization detection. std::map m_package_id_to_num_l3_per_ccx_map; std::map m_package_id_to_num_ccx_per_ccd_map; - std::map m_num_threads_pinned_per_tt_device; - std::unordered_set m_global_thread_ids_pinned = {}; - std::thread::id m_main_thread_id; - bool m_stored_main_thread_id = false; - - // For quicker unbinding of threads, record the physical_device_id during binding. - std::map m_global_thread_id_to_physical_device_id_map = {}; - - // For storing original cpuset during binding, to restore during unbinding. - std::map m_global_thread_id_to_original_cpuset_map = {}; - // Memory Binding std::map m_physical_device_id_to_numa_nodeset_map; diff --git a/device/device_api.h b/device/device_api.h deleted file mode 100644 index a2728e7a..00000000 --- a/device/device_api.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once -#include "device/tt_device.h" -#include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/device_api_metal.h b/device/device_api_metal.h index a2728e7a..0fc7820c 100644 --- a/device/device_api_metal.h +++ b/device/device_api_metal.h @@ -7,4 +7,3 @@ #pragma once #include "device/tt_device.h" #include "device/driver_atomics.h" -#include "device/tt_emulation_device.h" diff --git a/device/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp similarity index 98% rename from device/grayskull_implementation.cpp rename to device/grayskull/grayskull_implementation.cpp index 9d773166..6ed7aaaf 100644 --- a/device/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/grayskull_implementation.h" +#include "grayskull_implementation.h" namespace tt::umd { diff --git a/device/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h similarity index 99% rename from device/grayskull_implementation.h rename to device/grayskull/grayskull_implementation.h index 79bdfdee..c014350a 100644 --- a/device/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -99,7 +99,6 @@ enum class arc_message_type { ARC_GO_LONG_IDLE = 0x54, ARC_GET_HARVESTING = 0x57, TEST = 0x90, - NOC_DMA_TRANSFER = 0x9A, SETUP_IATU_FOR_PEER_TO_PEER = 0x97, DEASSERT_RISCV_RESET = 0xba }; diff --git a/device/grayskull/impl_device.hpp b/device/grayskull/impl_device.hpp deleted file mode 100644 index 21a18125..00000000 --- a/device/grayskull/impl_device.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/grayskull/pci/tlb.yaml -// 1M -// local_offset: [ 0, 11, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -// 2M -// local_offset: [ 0, 10, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 16, 11, "" ] -// y_end : [ 0, 22, 17, "" ] -// x_start : [ 0, 28, 23, "" ] -// y_start : [ 0, 34, 29, "" ] -// noc_sel: [ 0, 35, 35, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 36, 36, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 38, 37, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 39, 39, "linked"] - -// 16M -// local_offset: [ 0, 7 , 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. 
The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 13, 8, "" ] -// y_end : [ 0, 19, 14, "" ] -// x_start : [ 0, 25, 20, "" ] -// y_start : [ 0, 31, 26, "" ] -// noc_sel: [ 0, 32, 32, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 33, 33, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 35, 34, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 36, 36, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 11, - .y_end = 17, - .x_start = 23, - .y_start = 29, - .noc_sel = 35, - .mcast = 36, - .ordering = 37, - .linked = 39, - .static_vc = 40, - .static_vc_end = 41 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 8, - .y_end = 14, - .x_start = 20, - .y_start = 26, - .noc_sel = 32, - .mcast = 33, - .ordering = 34, - .linked = 36, - .static_vc = 37, - .static_vc_end = 38 -}; diff --git a/device/kmdif.h b/device/kmdif.h index 32596d55..c013202b 100644 --- a/device/kmdif.h +++ b/device/kmdif.h @@ -9,15 +9,6 @@ typedef std::uint32_t DWORD; -const uint32_t MAX_DMA_BYTES = 4*1024*1024; - -// DMA -struct DMAbuffer { - void *pBuf = NULL; - std::uint64_t pDma = 0; - std::uint64_t size; -}; - struct TTDevice; struct PCIdevice { diff --git a/device/pci_device.cpp b/device/pci_device.cpp new file mode 100644 index 00000000..fa256100 --- /dev/null +++ b/device/pci_device.cpp @@ -0,0 +1,431 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include // for ::open +#include // for ::close +#include // for ioctl +#include // for mmap, munmap +#include // for PCI_SLOT, PCI_FUNC + +#include "pci_device.hpp" +#include "architecture_implementation.h" +#include "ioctl.h" +#include "device/tt_arch_types.h" +#include "device/driver_atomics.h" + +#include "common/assert.hpp" +#include "common/logger.hpp" + +int find_device(const uint16_t device_id) { + // returns device id if found, otherwise -1 + const char device_name_pattern [] = "/dev/tenstorrent/%u"; + char device_name[sizeof(device_name_pattern) + std::numeric_limits::digits10]; + std::snprintf(device_name, sizeof(device_name), device_name_pattern, (unsigned int)device_id); + int device_fd = ::open(device_name, O_RDWR | O_CLOEXEC); + // LOG2 ("find_device() open call returns device_fd: %d for device_name: %s (device_id: %d)\n", device_fd, device_name, device_id); + return device_fd; +} + +tt::ARCH detect_arch(uint16_t pcie_device_id, int pcie_revision_id) { + if (pcie_device_id == 0xfaca){ + return tt::ARCH::GRAYSKULL; + } else if (pcie_device_id == 0x401e && pcie_revision_id == 0x01){ + return tt::ARCH::WORMHOLE_B0; + } else if (pcie_device_id == 0x401e){ + TT_THROW("Wormhole is not supported. 
Please use Wormhole B0 instead."); + return tt::ARCH::WORMHOLE; + } else if (pcie_device_id == 0xb140){ + return tt::ARCH::BLACKHOLE; + } else { + TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); + } +} + +// -------------------------------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------- + +TTDevice::TTDevice(int device_id, int logical_device_id){ + this->device_id = device_id; + this->logical_id = logical_device_id; + setup_device(); +} + +TTDevice::~TTDevice(){ + close_device(); +} + +void TTDevice::setup_device() { + device_fd = find_device(device_id); + get_pcie_info(); + if (device_fd == -1) { + throw std::runtime_error(std::string("Failed opening a handle for device ") + std::to_string(device_id)); + } + + arch = detect_arch(pcie_device_id, pcie_revision_id); + architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch)); + + // Get PCIe device info through IOTCL -> tt-kmd + tenstorrent_get_device_info device_info; + memset(&device_info, 0, sizeof(device_info)); + device_info.in.output_size_bytes = sizeof(device_info.out); + if (ioctl(device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { + throw std::runtime_error(std::string("Get device info failed on device ") + std::to_string(device_id) + "."); + } + + struct { + tenstorrent_query_mappings query_mappings; + tenstorrent_mapping mapping_array[8]; + } mappings; + + memset(&mappings, 0, sizeof(mappings)); + mappings.query_mappings.in.output_mapping_count = 8; + + if (ioctl(device_fd, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { + throw std::runtime_error(std::string("Query mappings failed on device ") + std::to_string(device_id) + "."); 
+ } + + // Mapping resource to BAR + // Resource 0 -> BAR0 + // Resource 1 -> BAR2 + // Resource 2 -> BAR4 + tenstorrent_mapping bar0_uc_mapping; + tenstorrent_mapping bar0_wc_mapping; + tenstorrent_mapping bar2_uc_mapping; + tenstorrent_mapping bar2_wc_mapping; + tenstorrent_mapping bar4_uc_mapping; + tenstorrent_mapping bar4_wc_mapping; + + memset(&bar0_uc_mapping, 0, sizeof(bar0_uc_mapping)); + memset(&bar0_wc_mapping, 0, sizeof(bar0_wc_mapping)); + memset(&bar2_uc_mapping, 0, sizeof(bar2_uc_mapping)); + memset(&bar2_wc_mapping, 0, sizeof(bar2_wc_mapping)); + memset(&bar4_uc_mapping, 0, sizeof(bar4_uc_mapping)); + memset(&bar4_wc_mapping, 0, sizeof(bar4_wc_mapping)); + + for (unsigned int i = 0; i < mappings.query_mappings.in.output_mapping_count; i++) { + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_UC) { + bar0_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { + bar0_wc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_UC) { + bar2_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_WC) { + bar2_wc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_UC) { + bar4_uc_mapping = mappings.mapping_array[i]; + } + + if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_WC) { + bar4_wc_mapping = mappings.mapping_array[i]; + } + + log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + mappings.mapping_array[i].mapping_id, + (void *)mappings.mapping_array[i].mapping_base, + mappings.mapping_array[i].mapping_size); + } + + if (bar0_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE0_UC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR0 UC mapping."); + } + + auto wc_mapping_size = 
arch == tt::ARCH::BLACKHOLE ? BH_BAR0_WC_MAPPING_SIZE : GS_BAR0_WC_MAPPING_SIZE; + + // Attempt WC mapping first so we can fall back to all-UC if it fails. + if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { + bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); + bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_wc_mapping.mapping_base); + if (bar0_wc == MAP_FAILED) { + bar0_wc_size = 0; + bar0_wc = nullptr; + } + } + + if (bar0_wc) { + // The bottom part of the BAR is mapped WC. Map the top UC. + bar0_uc_size = bar0_uc_mapping.mapping_size - wc_mapping_size; + bar0_uc_offset = wc_mapping_size; + } else { + // No WC mapping, map the entire BAR UC. + bar0_uc_size = bar0_uc_mapping.mapping_size; + bar0_uc_offset = 0; + } + + bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_uc_mapping.mapping_base + bar0_uc_offset); + + if (bar0_uc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR0 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + if (!bar0_wc) { + bar0_wc = bar0_uc; + } + + if (arch == tt::ARCH::WORMHOLE_B0) { + if (bar4_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_UC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR4 UC mapping."); + } + + system_reg_mapping_size = bar4_uc_mapping.mapping_size; + + system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_uc_mapping.mapping_base); + + if (system_reg_mapping == MAP_FAILED) { + throw std::runtime_error(std::string("BAR4 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + system_reg_start_offset = (512 - 16) * 1024*1024; + system_reg_offset_adjust = (512 - 32) * 1024*1024; + } else if(arch == tt::ARCH::BLACKHOLE) { + if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { + throw 
std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR2 UC mapping."); + } + + // Using UnCachable memory mode. This is used for accessing registers on Blackhole. + bar2_uc_size = bar2_uc_mapping.mapping_size; + bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar2_uc_mapping.mapping_base); + + if (bar2_uc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR2 UC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + + if (bar4_wc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_WC) { + throw std::runtime_error(std::string("Device ") + std::to_string(device_id) + " has no BAR4 WC mapping."); + } + + // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. + // WC doesn't guarantee write ordering but has better performance. + bar4_wc_size = bar4_wc_mapping.mapping_size; + bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_wc_mapping.mapping_base); + + if (bar4_wc == MAP_FAILED) { + throw std::runtime_error(std::string("BAR4 WC memory mapping failed for device ") + std::to_string(device_id) + "."); + } + } + + // GS+WH: ARC_SCRATCH[6], BH: NOC NODE_ID + read_checking_offset = arch == tt::ARCH::BLACKHOLE ? BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; +} + +void TTDevice::close_device() { + if (arch == tt::ARCH::BLACKHOLE && bar2_uc != nullptr && bar2_uc != MAP_FAILED) { + // Disable ATU index 0 + // TODO: Implement disabling for all indexes, once more host channels are enabled. 
+ uint64_t iatu_index = 0; + uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast<std::uint32_t *>(static_cast<uint8_t *>(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); + } + + if (device_fd != -1) { + ::close(device_fd); + } + + if (bar0_wc != nullptr && bar0_wc != MAP_FAILED && bar0_wc != bar0_uc) { + munmap(bar0_wc, bar0_wc_size); + } + + if (bar0_uc != nullptr && bar0_uc != MAP_FAILED) { + munmap(bar0_uc, bar0_uc_size); + } + + if (bar2_uc != nullptr && bar2_uc != MAP_FAILED) { + munmap(bar2_uc, bar2_uc_size); + } + + if (bar4_wc != nullptr && bar4_wc != MAP_FAILED) { + munmap(bar4_wc, bar4_wc_size); + } + + if (system_reg_mapping != nullptr && system_reg_mapping != MAP_FAILED) { + munmap(system_reg_mapping, system_reg_mapping_size); + } + + device_fd = -1; + bar0_uc = nullptr; + bar0_wc = nullptr; + bar2_uc = nullptr; + bar4_wc = nullptr; + system_reg_mapping = nullptr; +} + +void TTDevice::get_pcie_info() { + // Get PCIe device info through IOCTL -> tt-kmd and return pci_device_id and revision_id + std::uint16_t pcie_domain; + std::uint8_t pcie_bus; + std::uint8_t pcie_device; + std::uint8_t pcie_function; + + tenstorrent_get_device_info device_info; + memset(&device_info, 0, sizeof(device_info)); + device_info.in.output_size_bytes = sizeof(device_info.out); + if (ioctl(this->device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { + TT_THROW("Get PCIe device info failed on device: ", this->device_id); + } + pcie_domain = device_info.out.pci_domain; + pcie_bus = device_info.out.bus_dev_fn >> 8; + pcie_device = PCI_SLOT(device_info.out.bus_dev_fn); + pcie_function = PCI_FUNC(device_info.out.bus_dev_fn); + + // Get the PCIe revision ID from sysfs + static const char sys_pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/%s"; + char buf[sizeof(sys_pattern) + 10]; + + // revision pattern = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/revision" + std::snprintf(buf, sizeof(buf), sys_pattern, 
pcie_domain, pcie_bus, pcie_device, pcie_function, "revision"); + + std::ifstream revision_file(buf); + std::string revision_string; + if (std::getline(revision_file, revision_string)) { + this->pcie_device_id = device_info.out.device_id; + this->pcie_revision_id = std::stoi(revision_string, nullptr, 0); + } else { + TT_THROW("Revision ID /sys/ read failed for device: ", this->device_id); + } + + // Get NUMA node from sysfs + // numa node pattern = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node" + std::snprintf(buf, sizeof(buf), sys_pattern, pcie_domain, pcie_bus, pcie_device, pcie_function, "numa_node"); + + std::ifstream num_node_file(buf); + std::string numa_node_string; + if (std::getline(num_node_file, numa_node_string)) { + this->numa_node = std::stoi(numa_node_string, nullptr, 0); + } else { + TT_THROW("Numa node /sys/ read failed for device: ", this->device_id); + } +} + +// Open a unique device_id per host memory channel (workaround for ttkmd < 1.21 support for more than 1 pin per fd) +void TTDevice::open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels) { + for (int ch = 0; ch < num_host_mem_channels; ch++) { + log_debug(LogSiliconDriver, "Opening device_fd_per_host_ch device index: {} ch: {} (num_host_mem_channels: {})", device_id, ch, num_host_mem_channels); + int device_fd_for_host_mem = find_device(device_id); + if (device_fd_for_host_mem == -1) { + throw std::runtime_error(std::string("Failed opening a host memory device handle for device ") + std::to_string(device_id)); + } + device_fd_per_host_ch.push_back(device_fd_for_host_mem); + } +} + +tt::ARCH TTDevice::get_arch() const { + return arch; +} + +template +T* TTDevice::get_register_address(std::uint32_t register_offset) { + void *reg_mapping; + if (system_reg_mapping != nullptr && register_offset >= system_reg_start_offset) { + register_offset -= system_reg_offset_adjust; + reg_mapping = system_reg_mapping; + } else if (bar0_wc != bar0_uc && register_offset < bar0_wc_size) { + 
reg_mapping = bar0_wc; + } else { + register_offset -= bar0_uc_offset; + reg_mapping = bar0_uc; + } + return reinterpret_cast(static_cast(reg_mapping) + register_offset); +} + +void TTDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { + void *dest = nullptr; + if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { + byte_addr -= BAR0_BH_SIZE; + dest = reinterpret_cast(bar4_wc) + byte_addr; + }else { + dest = get_register_address(byte_addr); + } + + const void *src = reinterpret_cast(buffer_addr); + memcpy(dest, src, num_bytes); +// #ifndef DISABLE_ISSUE_3487_FIX +// // memcpy_to_device(dest, src, num_bytes); +// #else +// // ~4x faster than pci_read above, but works for all sizes and alignments +// memcpy(dest, src, num_bytes); +// #endif +} + +void TTDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { + void *src = nullptr; + if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { //arch == tt::ARCH::BLACKHOLE && + byte_addr -= BAR0_BH_SIZE; + src = reinterpret_cast(bar4_wc) + byte_addr; + } else { + src = get_register_address(byte_addr); + } + + void *dest = reinterpret_cast(buffer_addr); + memcpy(dest, src, num_bytes); +// #ifndef DISABLE_ISSUE_3487_FIX +// // memcpy_from_device(dest, src, num_bytes); +// #else +// // ~4x faster than pci_read above, but works for all sizes and alignments +// memcpy(dest, src, num_bytes); +// #endif +} + +// This is only needed for the BH workaround in iatu_configure_peer_region since no arc +void TTDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { + while (word_len-- != 0) { + *dest++ = *src++; + } +} + +void TTDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { + volatile uint32_t *dest = get_register_address(byte_addr); + const uint32_t *src = reinterpret_cast(data); + + write_regs(dest, src, word_len); +} + +void TTDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { + const 
volatile uint32_t *src = get_register_address(byte_addr); + uint32_t *dest = reinterpret_cast(data); + + while (word_len-- != 0) { + uint32_t temp = *src++; + memcpy(dest++, &temp, sizeof(temp)); + } +} + +void TTDevice::write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size){ + log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); + + volatile uint64_t *dest_qw = get_register_address(byte_addr); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); +#if defined(__ARM_ARCH) || defined(__riscv) + // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. + // Insert an explicit full memory barrier for ARM. + // Do the same for RISC-V. + tt_driver_atomics::mfence(); +#endif + *dest_qw = value_lower; + if (tlb_cfg_reg_size > 8) { + uint32_t* p_value_upper = reinterpret_cast(&value_upper); + *dest_extra_dw = p_value_upper[0]; + } + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + +// LOG2(" TLB "); +// print_buffer (&value_lower, sizeof(value_lower), true); +// if (tlb_cfg_reg_size > 8) { +// uint32_t* p_value_upper = reinterpret_cast(&value_upper); +// print_buffer (p_value_upper, sizeof(uint32_t), true); +// } +} diff --git a/device/pci_device.hpp b/device/pci_device.hpp new file mode 100644 index 00000000..fd918b77 --- /dev/null +++ b/device/pci_device.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include "device/tt_arch_types.h" +#include "architecture_implementation.h" + +static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC + +static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; +static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; + +// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h +static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; + +// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 +const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; + +typedef std::uint32_t DWORD; + +class TTDevice { +public: + TTDevice(int device_id, int logical_device_id); + ~TTDevice(); + TTDevice(const TTDevice&) = delete; // copy + void operator = (const TTDevice&) = delete; // copy assignment + + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); + void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); + void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); + void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + + void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels); + bool reset_board(); + tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); } + + int device_id; + int logical_id; + int device_fd = -1; + + // PCIe device info + std::uint32_t numa_node; + std::uint16_t pcie_device_id; + int pcie_revision_id; + + // BAR and regs mapping setup + 
std::vector device_fd_per_host_ch; + void *bar0_uc = nullptr; + std::size_t bar0_uc_size = 0; + std::size_t bar0_uc_offset = 0; + + void *bar0_wc = nullptr; + std::size_t bar0_wc_size = 0; + + void *bar2_uc = nullptr; + std::size_t bar2_uc_size; + + void *bar4_wc = nullptr; + std::uint64_t bar4_wc_size; + + void *system_reg_mapping = nullptr; + std::size_t system_reg_mapping_size; + + // These two are currently not used. + void *system_reg_wc_mapping = nullptr; + std::size_t system_reg_wc_mapping_size; + + std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + + // int sysfs_config_fd = -1; // not used + std::uint32_t read_checking_offset; + + tt::ARCH get_arch() const; + +private: + void get_pcie_info(); + void setup_device(); + void close_device(); + void drop(); + + bool reset_by_sysfs(); + bool reset_by_ioctl(); + + template + T* get_register_address(std::uint32_t register_offset); + + tt::ARCH arch; + std::unique_ptr architecture_implementation; +}; diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index c57bc1da..ec407f65 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -39,8 +39,6 @@ class tt_SimulationDevice: public tt_device { // void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - // virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, 
tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -57,16 +55,13 @@ class tt_SimulationDevice: public tt_device { // Misc. Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - // virtual bool noc_translation_en(); // virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); // virtual int get_number_of_chips_in_cluster(); // virtual std::unordered_set get_all_chips_in_cluster(); // virtual tt_ClusterDescriptor* get_cluster_description(); static std::vector detect_available_device_ids(); - // static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - // virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); // virtual void *channel_0_address(std::uint32_t offset, std::uint32_t device_id) const; virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; @@ -74,7 +69,6 @@ class tt_SimulationDevice: public tt_device { virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - // virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); // virtual tt_version get_ethernet_fw_version() const; diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 2ef5ec64..90f53855 100644 --- a/device/tt_cluster_descriptor.cpp +++ 
b/device/tt_cluster_descriptor.cpp @@ -52,19 +52,6 @@ std::vector> tt_ClusterDescri return directly_connected_channels; } -bool tt_ClusterDescriptor::channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const { - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { - return false; - } - - if (this->ethernet_connections.at(first).find(first_channel) == this->ethernet_connections.at(first).end()) { - return false; - } - - const auto &[connected_chip, connected_channel] = this->ethernet_connections.at(first).at(first_channel); - return connected_chip == second && connected_channel == second_channel; -} - // const eth_coord_t tt_ClusterDescriptor::get_chip_xy(const chip_id_t &chip_id) const { // // For now we only support a 1D cluster, so the mapping is trivial (where the chip ID is the x value of the xy // location) return eth_coord_t(chip_id, 0, 0, 0); @@ -367,14 +354,6 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -std::set get_sequential_chip_id_set(int num_chips) { - std::set chip_ids; - for (int i = 0; i < num_chips; ++i) { - chip_ids.insert(static_cast(i)); - } - return chip_ids; -} - void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { @@ -594,22 +573,10 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::specify_enabled_devices(const std::vector &chip_ids) { - this->enabled_active_chips.clear(); - for (auto chip_id : chip_ids) { - this->enabled_active_chips.insert(chip_id); - } -} - void tt_ClusterDescriptor::enable_all_devices() { 
this->enabled_active_chips = this->all_chips; } -bool tt_ClusterDescriptor::chips_have_ethernet_connectivity() const { - return ethernet_connections.size() > 0; -} - - std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { auto eth_connections = std::unordered_map > >(); diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 1a923a8b..a68e1d8c 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -76,7 +76,6 @@ class tt_ClusterDescriptor { */ std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - bool channels_are_directly_connected(const chip_id_t &first, const ethernet_channel_t &first_channel, const chip_id_t &second, const ethernet_channel_t &second_channel) const; bool is_chip_mmio_capable(const chip_id_t &chip_id) const; chip_id_t get_closest_mmio_capable_chip(const chip_id_t &chip); chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); @@ -87,7 +86,6 @@ class tt_ClusterDescriptor { // const eth_coord_t get_chip_xy(const chip_id_t &chip_id) const; // const chip_id_t get_chip_id_at_location(const eth_coord_t &chip_location) const; - bool chips_have_ethernet_connectivity() const; std::unordered_map get_harvesting_info() const; std::unordered_map get_noc_translation_table_en() const; std::unordered_map get_chip_locations() const; @@ -103,9 +101,6 @@ class tt_ClusterDescriptor { bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - void specify_enabled_devices(const std::vector &chip_ids); void enable_all_devices(); }; - -std::set get_sequential_chip_id_set(int num_chips); diff --git a/device/tt_device.h b/device/tt_device.h index f3064cd5..2fb3766d 100644 --- a/device/tt_device.h +++ b/device/tt_device.h 
@@ -20,7 +20,8 @@ #include "device/tlb.h" #include "device/tt_io.hpp" -using TLB_OFFSETS = tt::umd::tlb_offsets; +#include "pci_device.hpp" + using TLB_DATA = tt::umd::tlb_data; @@ -28,7 +29,6 @@ namespace boost::interprocess{ class named_mutex; } -class PCIDevice; class tt_ClusterDescriptor; enum tt_DevicePowerState { @@ -37,13 +37,6 @@ enum tt_DevicePowerState { LONG_IDLE }; -enum tt_MutexType { - LARGE_READ_TLB, - LARGE_WRITE_TLB, - SMALL_READ_WRITE_TLB, - ARC_MSG -}; - enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, @@ -215,10 +208,9 @@ struct tt_device_params { }; /** - * @brief Parent class for tt_SiliconDevice (Silicon Driver) and tt_VersimDevice (Versim Backend API). - * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for - * Silicon and Versim. - * Valid usage consists of declaring a tt_device object and initializing it to either a Silicon or Versim backend. + * @brief Parent class for tt_SiliconDevice (Silicon Driver). + * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. + * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. */ class tt_device @@ -294,7 +286,7 @@ class tt_device throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * @brief Start the Silicon on Versim Device + * @brief * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. 
* \param device_params tt_device_params object specifying initialization configuration */ @@ -353,10 +345,8 @@ class tt_device * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } @@ -364,43 +354,16 @@ class tt_device throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } /** - * @brief Write uint32_t vector to specified device, core and address (defined for Silicon and Versim). + * @brief Write uint32_t vector to specified device, core and address (defined for Silicon). 
* \param vec Vector to write * \param core chip-x-y struct specifying device and core * \param addr Address to write to * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - * \param send_epoch_cmd Specifies that this is an epoch_cmd write, forcing runtime to take a faster write path (Buda only) - * \param last_send_epoch_cmd Specifies that this is the last epoch command being written, which requires metadata to be updated (Buda only) */ - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false) { + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - /** - * @brief Unroll/replicate uint32_t data (as specified by ptr + len pair) and write it to specified device, core and address (defined for Silicon). 
- * \param mem_ptr src data address - * \param len src data size (specified for uint32_t) - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param fallback_tlb Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - // Only implement this for Silicon Backend - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** - * @brief Unroll/replicate a uint32_t vector and write it to specified device, core and address (defined for Silicon and Versim). - * \param vec Vector to write - * \param unroll_count Number of times vector should be unrolled - * \param core chip-x-y struct specifying device and core - * \param addr Address to write to - * \param tlb_to_use Specifies fallback/dynamic TLB to use for transaction, if this core does not have static TLBs mapped to this address (dynamic TLBs were initialized in driver constructor) - */ - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - throw std::runtime_error("---- tt_device::rolled_write_to_device is not implemented\n"); - } - /** * @brief Read uint32_t data from a specified device, core and address to host memory (defined for Silicon). * \param mem_ptr dest data address on host (expected to be preallocated, depending on transfer size) @@ -415,7 +378,7 @@ class tt_device } /** - * @brief Read a uint32_t vector from a specified device, core and address to host memory (defined for Silicon and Versim). 
+ * @brief Read a uint32_t vector from a specified device, core and address to host memory (defined for Silicon). * \param vec host side vector to populate with data read from device (does not need to be preallocated) * \param core chip-x-y struct specifying device and core * \param addr Address to read from @@ -491,13 +454,7 @@ class tt_device virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } - /** - * @brief Get Hardware Translation Table state - * \returns true if translation tables are enabled (WH only) - */ - virtual bool noc_translation_en() { - throw std::runtime_error("---- tt_device:noc_translation_en is not implemented\n"); - } + /** * @brief Issue message to device, meant to be picked up by ARC Firmare * \param logical_device_id Chip to target @@ -566,14 +523,6 @@ class tt_device return std::map(); } - /** - * @brief Get the PCIe speed for a specific device based on link width and link speed - * \returns Bandwidth in Gbps - */ - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id) { - return 8 * 16; // default to x8 at 16 GT/s - } - virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_numa_node_for_pcie_device is not implemented\n"); } @@ -585,30 +534,6 @@ class tt_device virtual tt_version get_ethernet_fw_version() const { throw std::runtime_error("---- tt_device::get_ethernet_fw_version is not implemented \n"); } - - /** - * @brief Get the total hugepage (host memory) size allocated for a device. - * This memory is not entirely accessible by device. 
To query the number of channels - * or memory per channel that is accessbile, see get_host_channel_size or get_num_host_channels - * \param src_device_id Device for which allocated host memory is being queried - * \returns Total memory allocated on host for a specific device - * - */ - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1) { - throw std::runtime_error("---- tt_device::dma_allocation_size is not implemented\n"); - return 0; - } - - /** - * Get the address for the MMIO mapped region on Channel (as seen from host memory) - * \param offset Address in DRAM - * \param target chip-x-y struct specifying device and core of target DRAM - * \returns Host interpretation of MMIO mapped channel 0 address - */ - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - throw std::runtime_error("---- tt_device::channel_address is not implemented\n"); - return nullptr; - } /** * @brief Query number of DRAM channels on a specific device * \param device_id Logical device id to query @@ -676,67 +601,6 @@ class tt_device std::unordered_map soc_descriptor_per_chip = {}; }; -class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; -} - -/** - * @brief Versim Backend Class, derived from the tt_device class - * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. 
-*/ -class tt_VersimDevice: public tt_device -{ - public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void 
read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_VersimDevice(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); - virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); - virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); - virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: - bool stop(); - tt_device_l1_address_params l1_address_params; - tt_device_dram_address_params dram_address_params; - versim::VersimSimulator* versim; - std::shared_ptr ndesc; - void* p_ca_soc_manager; -}; - #include "device/architecture_implementation.h" /** @@ -781,14 +645,10 @@ class tt_SiliconDevice: public tt_device virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = 
false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); + virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); @@ -809,7 +669,7 @@ class tt_SiliconDevice: public tt_device /** * @brief This API allows you to write directly to device memory that is 
addressable by a static TLB */ - std::function get_fast_pcie_static_tlb_write_callable(int device_id); + std::function get_fast_pcie_static_tlb_write_callable(int device_id); /** * @brief Provide fast write access to a statically-mapped TLB. @@ -824,40 +684,30 @@ class tt_SiliconDevice: public tt_device */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); - /** - * @brief Returns the DMA buf size - */ - uint32_t get_m_dma_buf_size() const; // Misc. Functions to Query/Set Device State virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual bool noc_translation_en(); virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); static int detect_number_of_chips(); static std::vector detect_available_device_ids(); - static std::unordered_map get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); virtual std::map get_clocks(); - virtual uint32_t dma_allocation_size(chip_id_t src_device_id = -1); - virtual void *channel_address(std::uint32_t offset, const tt_cxy_pair& target); virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device() const; static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); 
static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); - static std::unordered_map get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - virtual std::uint32_t get_pcie_speed(std::uint32_t device_id); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; @@ -871,24 +721,19 @@ class tt_SiliconDevice: public tt_device void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &cores); + void broadcast_pcie_tensix_risc_reset(TTDevice *device, const TensixSoftResetOptions &cores); void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); - void init_pcie_iatus_no_p2p(); + void init_pcie_iatus(); // No more p2p support. 
bool init_hugepage(chip_id_t device_id); - bool init_dmabuf(chip_id_t device_id); void check_pcie_device_initialized(int device_id); - bool init_dma_turbo_buf(struct PCIdevice* pci_device); - bool uninit_dma_turbo_buf(struct PCIdevice* pci_device); - static std::map get_physical_device_id_to_bus_id_map(std::vector physical_device_ids); void set_pcie_power_state(tt_DevicePowerState state); int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); - uint32_t get_power_state_arc_msg(struct PCIdevice* pci_device, tt_DevicePowerState state); + uint32_t get_power_state_arc_msg(TTDevice *pci_device, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); @@ -900,13 +745,11 @@ class tt_SiliconDevice: public tt_device int get_clock(int logical_device_id); // Communication Functions - void read_dma_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); - void write_dma_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id); + void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const 
std::string& fallback_tlb); - void write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write); - void rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t address, uint32_t unroll_count); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); @@ -922,7 +765,7 @@ class tt_SiliconDevice: public tt_device int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - struct PCIdevice* get_pci_device(int pci_intf_id) const; + TTDevice *get_pci_device(int pci_intf_id) const; std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); @@ -930,9 +773,7 @@ class tt_SiliconDevice: public tt_device // Test functions void verify_eth_fw(); void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int test_pcie_tlb_setup 
(struct PCIdevice* pci_device); int test_setup_interface (); - int test_broadcast (int logical_device_id); // State variables tt_device_dram_address_params dram_address_params; @@ -944,7 +785,7 @@ class tt_SiliconDevice: public tt_device std::set target_remote_chips = {}; tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id); tt::ARCH arch_name; - std::map m_pci_device_map; // Map of enabled pci devices + std::map> m_pci_device_map; // Map of enabled pci devices int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // Level of printouts. Controlled by env var TT_PCI_LOG_LEVEL @@ -962,17 +803,10 @@ class tt_SiliconDevice: public tt_device static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - int active_core_epoch = EPOCH_ETH_CORES_START_ID; - bool erisc_q_ptrs_initialized = false; - std::vector erisc_q_ptrs_epoch[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; - bool erisc_q_wrptr_updated[NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS]; std::vector< std::vector > remote_transfer_ethernet_cores; bool flush_non_mmio = false; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; - // Size of the PCIE DMA buffer - // The setting should not exceed MAX_DMA_BYTES - std::uint32_t m_dma_buf_size; std::unordered_map noc_translation_enabled_for_chip = {}; std::map> hardware_resource_mutex_map = {}; std::unordered_map> harvested_coord_translation = {}; @@ -991,9 +825,6 @@ class tt_SiliconDevice: public tt_device std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; - std::uint64_t buf_physical_addr = 0; - void * buf_mapping = nullptr; - int driver_id; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; diff --git 
a/device/tt_emulation_device.cpp b/device/tt_emulation_device.cpp deleted file mode 100644 index 3e64c15e..00000000 --- a/device/tt_emulation_device.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include - -#include "common/logger.hpp" -#include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" -#include "tt_emu_zemi3_wrapper.h" - - -tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - - log_info(tt::LogEmulationDriver, "Created Emulation Device "); -} - -tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); -} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); -} - -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - - return data; -} - - -void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); -} - -void tt_emulation_device::deassert_risc_reset() { - 
tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); -} - -void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); -} - -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); -} - -void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); -} - - - -void tt_emulation_device::close_device() { - log_info(tt::LogEmulationDriver, "Closing Emulation Device "); - tt_zebu_wrapper_inst->zebu_finish(); -} - -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ -) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); -} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. 
- // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } -} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - - write(core, addr, byte_data); -} - -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - // Placeholder - 
implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 -} - - - -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); -} - -void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; -} -tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } - -std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); - return {}; -} - -std::set tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; -} - -void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { - dram_address_params = dram_address_params_; -} -int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } 
-std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } -int tt_emulation_device::detect_number_of_chips() { return 1; } - -bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -std::map tt_emulation_device::get_clocks() { - return std::map(); -} - -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; -} - - - diff --git a/device/tt_emulation_device.h b/device/tt_emulation_device.h deleted file mode 100644 index 259841c4..00000000 --- a/device/tt_emulation_device.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include -#include -#include "tt_soc_descriptor.h" -#include "tt_xy_pair.h" -#include "tt_device.h" - -// use forward declaration here so we do not need to include tt_zebu_wrapper.h -class tt_zebu_wrapper; - -class tt_emulation_device : public tt_device { -public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool 
ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - 
tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; - - - - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - -}; - diff --git a/device/tt_emulation_stub.cpp b/device/tt_emulation_stub.cpp deleted file mode 100644 index 33fc3c90..00000000 --- a/device/tt_emulation_stub.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include - -#include "common/logger.hpp" -#include "tt_emulation_device.h" - -tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); -} - - -tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} - -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} - - -void tt_emulation_device::start_device(const tt_device_params& device_params) {} - -void tt_emulation_device::deassert_risc_reset() {} - -void tt_emulation_device::assert_risc_reset() {} - -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_emulation_device::close_device() {} - -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, 
const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - - -// ------------------------- -// Not sure how to implement these functions below, leaving them blank/default for now -void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; -} -tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } - -std::set tt_emulation_device::get_target_mmio_device_ids() {return {};} - -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} - -void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set 
tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } -int tt_emulation_device::detect_number_of_chips() { return 1; } - -bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -std::map tt_emulation_device::get_clocks() {return std::map();} - -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} - - - diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 0cfdf027..cb7b0d0b 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -27,39 +27,36 @@ #include #include #include +#include +#include +#include #include #include -#include -#include +// #include +// #include #include -#include -#include +// #include +// #include #include #include #include -#include +// #include + +#include "yaml-cpp/yaml.h" +#include "common/logger.hpp" +#include "device/cpuset_lib.hpp" +#include "device/driver_atomics.h" #include "device/architecture.h" #include "device/architecture_implementation.h" #include "device/tlb.h" #include "device/tt_arch_types.h" #include "tt_device.h" -#include "kmdif.h" #include "ioctl.h" //#include "epoch_q.h" -#include -#include "yaml-cpp/yaml.h" -#include -#include - -#include -#include "device/cpuset_lib.hpp" -#include "common/logger.hpp" -#include "device/driver_atomics.h" - #define WHT "\e[0;37m" #define BLK "\e[0;30m" #define RED "\e[0;31m" @@ -84,17 +81,6 @@ void clr_printf(const char *clr, const char *fmt, ...) 
{ int g_DEBUG_LEVEL; // /src/t6ifc/t6py/packages/tenstorrent/jlink/jtag_comm.cpp bool g_READ_CHECKING_ENABLED = true; -bool g_USE_MSI_FOR_DMA = false; // Whether to wait for MSI after DMA transfer, or poll a variable -uint32_t g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size -uint32_t g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = 0; // 0 - never use DMA. Otherwise use DMA for all blocks larger than this size - -// Address in CSM where the DMA request structure resides -uint32_t c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0; -// Address where the trigger for transfer resides -uint32_t c_DMA_TRIGGER_ADDRESS = 0; -// To trigger arc interrupt -uint32_t c_ARC_MISC_CNTL_ADDRESS = 0; - // Print all buffers smaller than this number of bytes uint32_t g_NUM_BYTES_TO_PRINT = 8; @@ -102,24 +88,7 @@ uint32_t g_NUM_BYTES_TO_PRINT = 8; const bool g_SINGLE_PIN_PAGE_PER_FD_WORKAROND = true; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; -volatile bool msi_interrupt_received = false; - -const char device_name_pattern[] = "/dev/tenstorrent/%u"; - -const std::string tlb_large_read_mutex_name_prefix = "mem_tlb_large_read_mutex_pci_interface_id_"; -const std::string tlb_large_write_mutex_name_prefix = "mem_tlb_large_write_mutex_pci_interface_id_"; -const std::string tlb_small_read_write_mutex_name_prefix = "mem_tlb_small_read_write_mutex_pci_interface_id_"; -const std::string arc_msg_mutex_name_prefix = "arc_msg_mutex_pci_interface_id_"; - -static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); -static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC - -static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; -static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; - -const uint32_t DMA_BUF_REGION_SIZE = 4 << 20; const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB -const uint32_t DMA_MAP_MASK = DMA_BUF_REGION_SIZE - 1; const uint32_t HUGEPAGE_MAP_MASK = HUGEPAGE_REGION_SIZE - 1; static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; @@ -128,209 +97,9 @@ static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; const char* hugepage_dir_env = std::getenv("TT_BACKEND_HUGEPAGE_DIR"); std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages-1G"; -// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 -const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; - // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; -// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h -const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; - -// Foward declarations -PCIdevice ttkmd_open(DWORD device_id, bool sharable /* = false */); -int ttkmd_close(struct PCIdevice &device); - -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size); -void pcie_init_dma_transfer_turbo (PCIdevice* dev); - -void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); - -// Stash all the fields of TTDevice in TTDeviceBase to make moving simpler. 
-struct TTDeviceBase -{ - unsigned int index; - - int device_fd = -1; - std::vector device_fd_per_host_ch; - void *bar0_uc = nullptr; - std::size_t bar0_uc_size = 0; - std::size_t bar0_uc_offset = 0; - - void *bar0_wc = nullptr; - std::size_t bar0_wc_size = 0; - - void *bar2_uc = nullptr; - std::size_t bar2_uc_size; - - void *bar4_wc = nullptr; - std::uint64_t bar4_wc_size; - - void *system_reg_mapping = nullptr; - std::size_t system_reg_mapping_size; - - void *system_reg_wc_mapping = nullptr; - std::size_t system_reg_wc_mapping_size; - - std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. - - int sysfs_config_fd = -1; - std::uint16_t pci_domain; - std::uint8_t pci_bus; - std::uint8_t pci_device; - std::uint8_t pci_function; - - unsigned int next_dma_buf = 0; - - DMAbuffer dma_completion_flag_buffer; // When DMA completes, it writes to this buffer - DMAbuffer dma_transfer_buffer; // Buffer for large DMA transfers - - std::uint32_t max_dma_buf_size_log2; - - tenstorrent_get_device_info_out device_info; - - std::vector dma_buffer_mappings; - - std::uint32_t read_checking_offset; -}; - -struct TTDevice : TTDeviceBase -{ - static TTDevice open(unsigned int device_id); - void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels); - ~TTDevice() { reset(); } - - TTDevice(const TTDevice&) = delete; - void operator = (const TTDevice&) = delete; - - TTDevice(TTDevice &&that) : TTDeviceBase(std::move(that)), arch(that.arch), architecture_implementation(std::move(that.architecture_implementation)) { that.drop(); } - TTDevice &operator = (TTDevice &&that) { - reset(); - - *static_cast(this) = std::move(that); - arch = that.arch; - architecture_implementation = std::move(that.architecture_implementation); - that.drop(); - - return *this; - } - - void suspend_before_device_reset() { - reset(); - } - - void 
resume_after_device_reset() { - do_open(); - } - - tt::ARCH get_arch() const { return arch; } - tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); } - -private: - TTDevice() = default; - - void reset() { - if (arch == tt::ARCH::BLACKHOLE && bar2_uc != nullptr && bar2_uc != MAP_FAILED) { - // Disable ATU index 0 - // TODO: Implement disabling for all indexes, once more host channels are enabled. - uint64_t iatu_index = 0; - uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - } - - if (device_fd != -1) { - close(device_fd); - } - - if (bar0_wc != nullptr && bar0_wc != MAP_FAILED && bar0_wc != bar0_uc) { - munmap(bar0_wc, bar0_wc_size); - } - - if (bar0_uc != nullptr && bar0_uc != MAP_FAILED) { - munmap(bar0_uc, bar0_uc_size); - } - - if (bar2_uc != nullptr && bar2_uc != MAP_FAILED) { - munmap(bar2_uc, bar2_uc_size); - } - - if (bar4_wc != nullptr && bar4_wc != MAP_FAILED) { - munmap(bar4_wc, bar4_wc_size); - } - - if (system_reg_mapping != nullptr && system_reg_mapping != MAP_FAILED) { - munmap(system_reg_mapping, system_reg_mapping_size); - } - - for (auto &&buf : dma_buffer_mappings) { - munmap(buf.pBuf, buf.size); - } - - if (sysfs_config_fd != -1) { - close(sysfs_config_fd); - } - - drop(); - } - - void drop() { - device_fd = -1; - bar0_uc = nullptr; - bar0_wc = nullptr; - bar2_uc = nullptr; - bar4_wc = nullptr; - system_reg_mapping = nullptr; - dma_buffer_mappings.clear(); - sysfs_config_fd = -1; - } - - void do_open(); - - tt::ARCH arch; - std::unique_ptr architecture_implementation; -}; - -TTDevice TTDevice::open(unsigned int device_id) { - TTDevice ttdev; - static int unique_id = 0; - ttdev.index = device_id; - ttdev.do_open(); - - return ttdev; -} - -bool is_grayskull(const uint16_t device_id) { - return device_id == 0xfaca; -} - 
-bool is_wormhole(const uint16_t device_id) { - return device_id == 0x401e; -} - -bool is_blackhole(const uint16_t device_id) { - return device_id == 0xb140; -} - -bool is_grayskull(const tenstorrent_get_device_info_out &device_info) { - return is_grayskull(device_info.device_id); -} - -bool is_wormhole(const tenstorrent_get_device_info_out &device_info) { - return is_wormhole(device_info.device_id); -} - -bool is_wormhole_b0(const uint16_t device_id, const uint16_t revision_id) { - return (is_wormhole(device_id) && (revision_id == 0x01)); -} - -bool is_blackhole(const tenstorrent_get_device_info_out &device_info) { - return is_blackhole(device_info.device_id); -} - - template void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; @@ -407,246 +176,6 @@ uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_dev } -int find_device(const uint16_t device_id) { - // returns device id if found, otherwise -1 - char device_name[sizeof(device_name_pattern) + std::numeric_limits::digits10]; - std::snprintf(device_name, sizeof(device_name), device_name_pattern, (unsigned int)device_id); - int device_fd = ::open(device_name, O_RDWR | O_CLOEXEC); - LOG2 ("find_device() open call returns device_fd: %d for device_name: %s (device_id: %d)\n", device_fd, device_name, device_id); - return device_fd; -} - -// Open a unique device_id per host memory channel (workaround for ttkmd < 1.21 support for more than 1 pin per fd) -void TTDevice::open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels) { - for (int ch = 0; ch < num_host_mem_channels; ch++) { - log_debug(LogSiliconDriver, "Opening device_fd_per_host_ch device index: {} ch: {} (num_host_mem_channels: {})", index, ch, num_host_mem_channels); - int device_fd_for_host_mem = find_device(index); - if (device_fd_for_host_mem == -1) { - throw std::runtime_error(std::string("Failed opening a host memory device handle for device ") + std::to_string(index)); 
- } - device_fd_per_host_ch.push_back(device_fd_for_host_mem); - } -} - -int get_revision_id(TTDevice *dev); - -tt::ARCH detect_arch(TTDevice *dev) { - if (is_grayskull(dev->device_info.device_id)) { - return tt::ARCH::GRAYSKULL; - } else if (is_wormhole_b0(dev->device_info.device_id, get_revision_id(dev))) { - return tt::ARCH::WORMHOLE_B0; - } else if (is_wormhole(dev->device_info.device_id)) { - return tt::ARCH::WORMHOLE; - } else if (is_blackhole(dev->device_info.device_id)) { - return tt::ARCH::BLACKHOLE; - } else { - throw std::runtime_error(std::string("Unknown device id.")); - } -} - -tt::ARCH detect_arch(PCIdevice *pci_device) { - return pci_device->hdev->get_arch(); -} - -tt::ARCH detect_arch(uint16_t device_id) { - tt::ARCH arch_name = tt::ARCH::Invalid; - if (find_device(device_id) == -1) { - WARN("---- tt_SiliconDevice::detect_arch did not find silcon device_id: %d\n", device_id); - return arch_name; - } - struct PCIdevice pci_device = ttkmd_open((DWORD)device_id, false); - - arch_name = detect_arch(&pci_device); - - ttkmd_close(pci_device); - return arch_name; -} - -void TTDevice::do_open() { - device_fd = find_device(index); - if (device_fd == -1) { - throw std::runtime_error(std::string("Failed opening a handle for device ") + std::to_string(index)); - } - - tenstorrent_get_device_info device_info; - memset(&device_info, 0, sizeof(device_info)); - device_info.in.output_size_bytes = sizeof(device_info.out); - - if (ioctl(device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { - throw std::runtime_error(std::string("Get device info failed on device ") + std::to_string(index) + "."); - } - - this->device_info = device_info.out; - - max_dma_buf_size_log2 = device_info.out.max_dma_buf_size_log2; - - struct { - tenstorrent_query_mappings query_mappings; - tenstorrent_mapping mapping_array[8]; - } mappings; - - memset(&mappings, 0, sizeof(mappings)); - mappings.query_mappings.in.output_mapping_count = 8; - - if (ioctl(device_fd, 
TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { - throw std::runtime_error(std::string("Query mappings failed on device ") + std::to_string(index) + "."); - } - - // Mapping resource to BAR - // Resource 0 -> BAR0 - // Resource 1 -> BAR2 - // Resource 2 -> BAR4 - tenstorrent_mapping bar0_uc_mapping; - tenstorrent_mapping bar0_wc_mapping; - tenstorrent_mapping bar2_uc_mapping; - tenstorrent_mapping bar2_wc_mapping; - tenstorrent_mapping bar4_uc_mapping; - tenstorrent_mapping bar4_wc_mapping; - - memset(&bar0_uc_mapping, 0, sizeof(bar0_uc_mapping)); - memset(&bar0_wc_mapping, 0, sizeof(bar0_wc_mapping)); - memset(&bar2_uc_mapping, 0, sizeof(bar2_uc_mapping)); - memset(&bar2_wc_mapping, 0, sizeof(bar2_wc_mapping)); - memset(&bar4_uc_mapping, 0, sizeof(bar4_uc_mapping)); - memset(&bar4_wc_mapping, 0, sizeof(bar4_wc_mapping)); - - for (unsigned int i = 0; i < mappings.query_mappings.in.output_mapping_count; i++) { - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_UC) { - bar0_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { - bar0_wc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_UC) { - bar2_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE1_WC) { - bar2_wc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_UC) { - bar4_uc_mapping = mappings.mapping_array[i]; - } - - if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE2_WC) { - bar4_wc_mapping = mappings.mapping_array[i]; - } - - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", - mappings.mapping_array[i].mapping_id, - (void *)mappings.mapping_array[i].mapping_base, - mappings.mapping_array[i].mapping_size); - } - - if (bar0_uc_mapping.mapping_id != 
TENSTORRENT_MAPPING_RESOURCE0_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR0 UC mapping."); - } - - auto wc_mapping_size = is_blackhole(device_info.out) ? BH_BAR0_WC_MAPPING_SIZE : GS_BAR0_WC_MAPPING_SIZE; - - // Attempt WC mapping first so we can fall back to all-UC if it fails. - if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { - bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_wc_mapping.mapping_base); - if (bar0_wc == MAP_FAILED) { - bar0_wc_size = 0; - bar0_wc = nullptr; - } - } - - if (bar0_wc) { - // The bottom part of the BAR is mapped WC. Map the top UC. - bar0_uc_size = bar0_uc_mapping.mapping_size - wc_mapping_size; - bar0_uc_offset = wc_mapping_size; - } else { - // No WC mapping, map the entire BAR UC. - bar0_uc_size = bar0_uc_mapping.mapping_size; - bar0_uc_offset = 0; - } - - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_uc_mapping.mapping_base + bar0_uc_offset); - - if (bar0_uc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR0 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - if (!bar0_wc) { - bar0_wc = bar0_uc; - } - - if (is_wormhole(device_info.out)) { - if (bar4_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR4 UC mapping."); - } - - this->system_reg_mapping_size = bar4_uc_mapping.mapping_size; - - this->system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_uc_mapping.mapping_base); - - if (this->system_reg_mapping == MAP_FAILED) { - throw std::runtime_error(std::string("BAR4 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - this->system_reg_start_offset = (512 - 16) * 1024*1024; - 
this->system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(is_blackhole(device_info.out)) { - if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR2 UC mapping."); - } - - // Using UnCachable memory mode. This is used for accessing registers on Blackhole. - this->bar2_uc_size = bar2_uc_mapping.mapping_size; - this->bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar2_uc_mapping.mapping_base); - - if (this->bar2_uc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR2 UC memory mapping failed for device ") + std::to_string(index) + "."); - } - - if (bar4_wc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_WC) { - throw std::runtime_error(std::string("Device ") + std::to_string(index) + " has no BAR4 WC mapping."); - } - - // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. - // WC doesn't guarantee write ordering but has better performance. - this->bar4_wc_size = bar4_wc_mapping.mapping_size; - this->bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_wc_mapping.mapping_base); - - if (this->bar4_wc == MAP_FAILED) { - throw std::runtime_error(std::string("BAR4 WC memory mapping failed for device ") + std::to_string(index) + "."); - } - } - pci_domain = device_info.out.pci_domain; - pci_bus = device_info.out.bus_dev_fn >> 8; - pci_device = PCI_SLOT(device_info.out.bus_dev_fn); - pci_function = PCI_FUNC(device_info.out.bus_dev_fn); - - arch = detect_arch(this); - architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch)); - - // GS+WH: ARC_SCRATCH[6], BH: NOC NODE_ID - this->read_checking_offset = is_blackhole(device_info.out) ? 
BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; -} - -void set_debug_level(int dl) { - g_DEBUG_LEVEL = dl; -} - -std::uint64_t pci_dma_buffer_get_physical_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pDma, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pDma); -} - -std::uint64_t pci_dma_buffer_get_user_addr(DMAbuffer &dma_buffer) { - log_assert (dma_buffer.pBuf, "DMA Buffer not initialized"); - return reinterpret_cast(dma_buffer.pBuf); -} - -DWORD ttkmd_init() { return 0; } // 0 on success -DWORD ttkmd_uninit() { return 0; } // 0 on success - bool is_char_dev(const dirent *ent, const char *parent_dir) { if (ent->d_type == DT_UNKNOWN || ent->d_type == DT_LNK) { char name[2 * NAME_MAX + 2]; @@ -708,267 +237,93 @@ std::vector ttkmd_scan() { return found_devices; } -int get_config_space_fd(TTDevice *dev) { - if (dev->sysfs_config_fd == -1) { - static const char pattern[] = "/sys/bus/pci/devices/0000:%02x:%02x.%u/config"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - dev->sysfs_config_fd = open(buf, O_RDWR); - - if (dev->sysfs_config_fd == -1) { - dev->sysfs_config_fd = open(buf, O_RDONLY); - } - } - - return dev->sysfs_config_fd; -} - -int get_revision_id(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/revision"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream revision_file(buf); - std::string revision_string; - if (std::getline(revision_file, revision_string)) { - return std::stoi(revision_string, nullptr, 0); - } else { - throw std::runtime_error("Revision ID read failed for device"); - } -} - -int get_link_width(TTDevice *dev) { - - static const char pattern[] = 
"/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_width"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkwidth_file(buf); - std::string linkwidth_string; - if (std::getline(linkwidth_file, linkwidth_string)) { - return std::stoi(linkwidth_string, nullptr, 0); - } else { - throw std::runtime_error("Link width read failed for device"); - } -} - -int get_link_speed(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/current_link_speed"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream linkspeed_file(buf); - std::string linkspeed_string; - int linkspeed; - if (std::getline(linkspeed_file, linkspeed_string) && sscanf(linkspeed_string.c_str(), "%d", &linkspeed) == 1) { - return linkspeed; - } else { - throw std::runtime_error("Link speed read failed for device"); - } -} - -int get_numa_node(TTDevice *dev) { - - static const char pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/numa_node"; - char buf[sizeof(pattern)]; - std::snprintf(buf, sizeof(buf), pattern, - (unsigned int)dev->pci_domain, (unsigned int)dev->pci_bus, (unsigned int)dev->pci_device, (unsigned int)dev->pci_function); - - std::ifstream num_node_file(buf); - std::string numa_node_string; - if (std::getline(num_node_file, numa_node_string)) { - return std::stoi(numa_node_string, nullptr, 0); - } else { - return -1; - } -} - -std::uint64_t read_bar0_base(TTDevice *dev) { - const std::uint64_t bar_address_mask = ~(std::uint64_t)0xF; - unsigned int bar0_config_offset = 0x10; - - std::uint64_t bar01; - if (pread(get_config_space_fd(dev), &bar01, sizeof(bar01), bar0_config_offset) != sizeof(bar01)) { - return 0; - } - - return bar01 & 
bar_address_mask; -} - -DMAbuffer allocate_dma_buffer(TTDevice *ttdev, unsigned int buffer_index, std::size_t size) { - tenstorrent_allocate_dma_buf allocate_dma_buf; - - if (size > std::numeric_limits::max()) { - throw std::runtime_error(std::string("Requested DMA buffer size (" + std::to_string(allocate_dma_buf.in.requested_size) - + ") bytes exceeds interface size limit for device " + std::to_string(ttdev->index) + ", with error: " + std::strerror(errno))); - } +// bool is_hardware_hung(const TTDevice *dev) { +// volatile const void *addr = reinterpret_cast(dev->bar0_uc) + (dev->get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - dev->bar0_uc_offset; +// std::uint32_t scratch_data = *reinterpret_cast(addr); - memset(&allocate_dma_buf, 0, sizeof(allocate_dma_buf)); - allocate_dma_buf.in.requested_size = std::max(size, getpagesize()); - allocate_dma_buf.in.buf_index = buffer_index; - - if (ioctl(ttdev->device_fd, TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF, &allocate_dma_buf) == -1) { - throw std::runtime_error(std::string("DMA buffer allocation failed (") + std::to_string(allocate_dma_buf.in.requested_size) - + " bytes) for device " + std::to_string(ttdev->index) + "."); - } - - void *mapping = mmap(NULL, allocate_dma_buf.out.size, PROT_READ | PROT_WRITE, MAP_SHARED, ttdev->device_fd, allocate_dma_buf.out.mapping_offset); - - log_trace(tt::LogSiliconDriver, "DMA buffer succeeded with size {} offset {} phy_addr {}", allocate_dma_buf.out.size, allocate_dma_buf.out.mapping_offset, allocate_dma_buf.out.physical_address); - - if (mapping == MAP_FAILED) { - throw std::runtime_error(std::string("DMA buffer memory mapping failed for device ") + std::to_string(ttdev->index) + "."); - } - - DMAbuffer dmabuf; - dmabuf.pBuf = mapping; - dmabuf.pDma = allocate_dma_buf.out.physical_address; - dmabuf.size = allocate_dma_buf.out.size; - - ttdev->dma_buffer_mappings.push_back(dmabuf); - - return dmabuf; -} - -PCIdevice ttkmd_open(DWORD device_id, bool sharable /* 
= false */) -{ - (void)sharable; // presently ignored - - auto ttdev = std::make_unique(TTDevice::open(device_id)); - - PCIdevice device; - device.id = device_id; - device.hdev = ttdev.get(); - device.vendor_id = ttdev->device_info.vendor_id; - device.device_id = ttdev->device_info.device_id; - device.subsystem_vendor_id = ttdev->device_info.subsystem_vendor_id; - device.subsystem_id = ttdev->device_info.subsystem_id; - device.dwBus = ttdev->pci_bus; - device.dwSlot = ttdev->pci_device; - device.dwFunction = ttdev->pci_function; - device.BAR_addr = read_bar0_base(ttdev.get()); - device.BAR_size_bytes = ttdev->bar0_uc_size; - device.revision_id = get_revision_id(ttdev.get()); - ttdev.release(); - - return device; -} - -int ttkmd_close(struct PCIdevice &device) { - delete static_cast(device.hdev); - - return 0; -} - -template -volatile T* register_address(const TTDevice *dev, std::uint32_t register_offset) { - void *reg_mapping; - if (dev->system_reg_mapping != nullptr && register_offset >= dev->system_reg_start_offset) { - register_offset -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && register_offset < dev->bar0_wc_size) { - reg_mapping = dev->bar0_wc; - } else { - register_offset -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - return reinterpret_cast(static_cast(reg_mapping) + register_offset); -} +// return (scratch_data == 0xffffffffu); +// } -bool is_hardware_hung(const TTDevice *dev) { - volatile const void *addr = reinterpret_cast(dev->bar0_uc) + (dev->get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - dev->bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); +// bool reset_by_sysfs(TTDevice *dev) { - return (scratch_data == 0xffffffffu); -} +// const char *virtual_env = getenv("VIRTUAL_ENV"); +// if (virtual_env == nullptr) +// return false; -bool reset_by_sysfs(TTDevice *dev) { +// std::string reset_helper_path = virtual_env; 
+// reset_helper_path += "/bin/reset-helper"; - const char *virtual_env = getenv("VIRTUAL_ENV"); - if (virtual_env == nullptr) - return false; +// std::string busid = std::to_string(dev->pci_bus); - std::string reset_helper_path = virtual_env; - reset_helper_path += "/bin/reset-helper"; +// dev->suspend_before_device_reset(); - std::string busid = std::to_string(dev->pci_bus); +// char *argv[3]; +// argv[0] = const_cast(reset_helper_path.c_str()); +// argv[1] = const_cast(busid.c_str()); +// argv[2] = nullptr; - dev->suspend_before_device_reset(); +// pid_t reset_helper_pid; +// if (posix_spawn(&reset_helper_pid, reset_helper_path.c_str(), nullptr, nullptr, argv, environ) != 0) +// return false; - char *argv[3]; - argv[0] = const_cast(reset_helper_path.c_str()); - argv[1] = const_cast(busid.c_str()); - argv[2] = nullptr; +// siginfo_t reset_helper_status; +// if (waitid(P_PID, reset_helper_pid, &reset_helper_status, WEXITED) != 0) +// return false; - pid_t reset_helper_pid; - if (posix_spawn(&reset_helper_pid, reset_helper_path.c_str(), nullptr, nullptr, argv, environ) != 0) - return false; +// if (reset_helper_status.si_status != 0) +// return false; - siginfo_t reset_helper_status; - if (waitid(P_PID, reset_helper_pid, &reset_helper_status, WEXITED) != 0) - return false; - - if (reset_helper_status.si_status != 0) - return false; +// dev->resume_after_device_reset(); - dev->resume_after_device_reset(); - - return true; -} +// return true; +// } -bool reset_by_ioctl(TTDevice *dev) { - struct tenstorrent_reset_device reset_device; - memset(&reset_device, 0, sizeof(reset_device)); +// bool reset_by_ioctl(TTDevice *dev) { +// struct tenstorrent_reset_device reset_device; +// memset(&reset_device, 0, sizeof(reset_device)); - reset_device.in.output_size_bytes = sizeof(reset_device.out); - reset_device.in.flags = 0; +// reset_device.in.output_size_bytes = sizeof(reset_device.out); +// reset_device.in.flags = 0; - if (ioctl(dev->device_fd, TENSTORRENT_IOCTL_RESET_DEVICE, 
&reset_device) == -1) { - return false; - } +// if (ioctl(dev->device_fd, TENSTORRENT_IOCTL_RESET_DEVICE, &reset_device) == -1) { +// return false; +// } - return (reset_device.out.result == 0); -} +// return (reset_device.out.result == 0); +// } -bool auto_reset_board(TTDevice *dev) { - return ((reset_by_ioctl(dev) || reset_by_sysfs(dev)) && !is_hardware_hung(dev)); -} +// bool auto_reset_board(TTDevice *dev) { +// return ((reset_by_ioctl(dev) || reset_by_sysfs(dev)) && !is_hardware_hung(dev)); +// } -void detect_ffffffff_read(TTDevice *dev, std::uint32_t data_read = 0xffffffffu) { - if (g_READ_CHECKING_ENABLED && data_read == 0xffffffffu && is_hardware_hung(dev)) { - std::uint32_t scratch_data = *register_address(dev, dev->read_checking_offset); +// void detect_ffffffff_read(TTDevice *dev, std::uint32_t data_read = 0xffffffffu) { +// if (g_READ_CHECKING_ENABLED && data_read == 0xffffffffu && is_hardware_hung(dev)) { +// std::uint32_t scratch_data = *register_address(dev, dev->read_checking_offset); - if (auto_reset_board(dev)) { - throw std::runtime_error("Read 0xffffffff from PCIE: auto-reset succeeded."); - } else { - throw std::runtime_error("Read 0xffffffff from PCIE: you should reset the board."); - } - } -} +// if (auto_reset_board(dev)) { +// throw std::runtime_error("Read 0xffffffff from PCIE: auto-reset succeeded."); +// } else { +// throw std::runtime_error("Read 0xffffffff from PCIE: you should reset the board."); +// } +// } +// } -inline void record_access (const char* where, uint32_t addr, uint32_t size, bool turbo, bool write, bool block, bool endline) { - LOG2 ("%s PCI_ACCESS %s 0x%8x %8d bytes %s %s%s", where, write ? "WR" : "RD", addr, size, turbo ? "TU" : " ", block ? "BLK" : " ", endline ? "\n" : "" ); -} +// inline void record_access (const char* where, uint32_t addr, uint32_t size, bool turbo, bool write, bool block, bool endline) { +// LOG2 ("%s PCI_ACCESS %s 0x%8x %8d bytes %s %s%s", where, write ? "WR" : "RD", addr, size, turbo ? 
"TU" : " ", block ? "BLK" : " ", endline ? "\n" : "" ); +// } -inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool endline = true) { - // Prints each byte in a buffer - if (g_DEBUG_LEVEL > 1) { - uint8_t *b = (uint8_t *)(buffer_addr); - for (uint32_t i = 0; i < len_bytes; i++) { - LOG2 (" [0x%x] = 0x%x (%u) ", i, b[i], b[i]); - } - if (endline) { - LOG2 ("\n"); - } - } -} +// inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool endline = true) { +// // Prints each byte in a buffer +// if (g_DEBUG_LEVEL > 1) { +// uint8_t *b = (uint8_t *)(buffer_addr); +// for (uint32_t i = 0; i < len_bytes; i++) { +// LOG2 (" [0x%x] = 0x%x (%u) ", i, b[i], b[i]); +// } +// if (endline) { +// LOG2 ("\n"); +// } +// } +// } // Custom device memcpy. This is only safe for memory-like regions on the device (Tensix L1, DRAM, ARC CSM). // Both routines assume that misaligned accesses are permitted on host memory. @@ -978,390 +333,89 @@ inline void print_buffer (const void* buffer_addr, uint32_t len_bytes = 16, bool // 2. syseng#3487 WH GDDR5 controller has a bug when 1-byte writes are temporarily adjacent // to 2-byte writes. We avoid ever performing a 1-byte write to the device. This only affects to device. -void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) { - typedef std::uint32_t copy_t; - - // Start by aligning the destination (device) pointer. If needed, do RMW to fix up the - // first partial word. - volatile copy_t *dp; - - std::uintptr_t dest_addr = reinterpret_cast(dest); - unsigned int dest_misalignment = dest_addr % sizeof(copy_t); - - if (dest_misalignment != 0) { - // Read-modify-write for the first dest element. 
- dp = reinterpret_cast(dest_addr - dest_misalignment); - - copy_t tmp = *dp; - - auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); - num_bytes -= leading_len; - src = static_cast(src) + leading_len; - - *dp++ = tmp; - - } else { - dp = static_cast(dest); - } - - // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); - std::size_t num_words = num_bytes / sizeof(copy_t); - - for (std::size_t i = 0; i < num_words; i++) - *dp++ = *sp++; - - // Finally copy any sub-word trailer, again RMW on the destination. - auto trailing_len = num_bytes % sizeof(copy_t); - if (trailing_len != 0) { - copy_t tmp = *dp; - - std::memcpy(&tmp, sp, trailing_len); - - *dp++ = tmp; - } -} - -void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { - typedef std::uint32_t copy_t; - - // Start by aligning the source (device) pointer. - const volatile copy_t *sp; - - std::uintptr_t src_addr = reinterpret_cast(src); - unsigned int src_misalignment = src_addr % sizeof(copy_t); - - if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); - - copy_t tmp = *sp++; - - auto leading_len = std::min(sizeof(tmp) - src_misalignment, num_bytes); - std::memcpy(dest, reinterpret_cast(&tmp) + src_misalignment, leading_len); - num_bytes -= leading_len; - dest = static_cast(dest) + leading_len; - - } else { - sp = static_cast(src); - } - - // Copy the source-aligned middle. - copy_t *dp = static_cast(dest); - std::size_t num_words = num_bytes / sizeof(copy_t); - - for (std::size_t i = 0; i < num_words; i++) - *dp++ = *sp++; - - // Finally copy any sub-word trailer. 
- auto trailing_len = num_bytes % sizeof(copy_t); - if (trailing_len != 0) { - copy_t tmp = *sp; - std::memcpy(dp, &tmp, trailing_len); - } -} - -void read_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES > 0) { - record_access ("read_block_a", byte_addr, num_bytes, true, false, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, false); - memcpy (buffer_addr, (void*)host_user_addr, transfered_bytes); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } - - record_access("read_block_b", byte_addr, num_bytes, false, false, true, false); // addr, size, turbo, write, block, endline - - void *reg_mapping; - if (dev->bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { - byte_addr -= BAR0_BH_SIZE; - reg_mapping = dev->bar4_wc; - } - else if (dev->system_reg_mapping != nullptr && byte_addr >= dev->system_reg_start_offset) { - byte_addr -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && byte_addr < dev->bar0_wc_size) { - reg_mapping = dev->bar0_wc; - } else { - byte_addr -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - const void *src = reinterpret_cast(reg_mapping) + byte_addr; - void *dest = reinterpret_cast(buffer_addr); - -#ifndef DISABLE_ISSUE_3487_FIX - memcpy_from_device(dest, src, num_bytes); -#else -#ifdef FAST_MEMCPY - - if ((num_bytes % 32 == 0) && ((intptr_t(dest) & 31) == 0) && 
((intptr_t(src) & 31) == 0)) - memcpy_from_device(dest, src, num_bytes); - { - // Faster memcpy version.. about 8x currently compared to pci_read above - fastMemcpy(dest, src, num_bytes); - } - else -#else - // ~4x faster than pci_read above, but works for all sizes and alignments - memcpy(dest, src, num_bytes); -#endif -#endif - - if (num_bytes >= sizeof(std::uint32_t)) { - detect_ffffffff_read(dev, *reinterpret_cast(dest)); - } - print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); -} - -void write_block(TTDevice *dev, uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - if (num_bytes >= g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES && g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES > 0) { - record_access ("write_block_a", byte_addr, num_bytes, true, true, true, true); // addr, size, turbo, write, block, endline - - DMAbuffer &transfer_buffer = dev->dma_transfer_buffer; - - uint64_t host_phys_addr = pci_dma_buffer_get_physical_addr (transfer_buffer); - uint64_t host_user_addr = pci_dma_buffer_get_user_addr (transfer_buffer); - while (num_bytes > 0) { - uint32_t transfered_bytes = std::min(num_bytes, dma_buf_size); - memcpy ( (void*)host_user_addr, buffer_addr, transfered_bytes); - pcie_dma_transfer_turbo (dev, byte_addr, host_phys_addr, transfered_bytes, true); - num_bytes -= transfered_bytes; - byte_addr += transfered_bytes; - buffer_addr += transfered_bytes; - } - return; - } - - record_access("write_block_b", byte_addr, num_bytes, false, true, true, false); // addr, size, turbo, write, block, endline - - void *reg_mapping; - if (dev->bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { - byte_addr -= BAR0_BH_SIZE; - reg_mapping = dev->bar4_wc; - } - else if (dev->system_reg_mapping != nullptr && byte_addr >= dev->system_reg_start_offset) { - byte_addr -= dev->system_reg_offset_adjust; - reg_mapping = dev->system_reg_mapping; - } else if (dev->bar0_wc != dev->bar0_uc && byte_addr < dev->bar0_wc_size) { - 
reg_mapping = dev->bar0_wc; - } else { - byte_addr -= dev->bar0_uc_offset; - reg_mapping = dev->bar0_uc; - } - - void *dest = reinterpret_cast(reg_mapping) + byte_addr; - const void *src = reinterpret_cast(buffer_addr); -#ifndef DISABLE_ISSUE_3487_FIX - memcpy_to_device(dest, src, num_bytes); -#else -#ifdef FAST_MEMCPY - memcpy_to_device(dest, src, num_bytes); - if ((num_bytes % 32 == 0) && ((intptr_t(dest) & 31) == 0) && ((intptr_t(src) & 31) == 0)) - - { - // Faster memcpy version.. about 8x currently compared to pci_read above - fastMemcpy(dest, src, num_bytes); - } - else -#else - // ~4x faster than pci_read above, but works for all sizes and alignments - memcpy(dest, src, num_bytes); -#endif -#endif - print_buffer (buffer_addr, std::min((uint64_t)g_NUM_BYTES_TO_PRINT, num_bytes), true); -} - -void read_checking_enable(bool enable = true) { - g_READ_CHECKING_ENABLED = enable; -} - -// Read/write to the configuration space of the device -// pData is a pointer to a buffer (see memory module) -DWORD read_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { +// void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) { +// typedef std::uint32_t copy_t; - if (pread(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } +// // Start by aligning the destination (device) pointer. If needed, do RMW to fix up the +// // first partial word. +// volatile copy_t *dp; - return 0; -} +// std::uintptr_t dest_addr = reinterpret_cast(dest); +// unsigned int dest_misalignment = dest_addr % sizeof(copy_t); -DWORD write_cfg(TTDevice *dev, DWORD byte_offset, uint64_t pData, DWORD num_bytes) { +// if (dest_misalignment != 0) { +// // Read-modify-write for the first dest element. 
+// dp = reinterpret_cast(dest_addr - dest_misalignment); - if (pwrite(get_config_space_fd(dev), reinterpret_cast(pData), num_bytes, byte_offset) != num_bytes) { - throw std::runtime_error("Config space read failed for device "); - } +// copy_t tmp = *dp; - return 0; -} +// auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); -DMAbuffer pci_allocate_dma_buffer(TTDevice *dev, uint32_t size) { +// std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); +// num_bytes -= leading_len; +// src = static_cast(src) + leading_len; - uint32_t page_size = getpagesize(); - uint32_t page_aligned_size = (size + page_size - 1) & ~(page_size - 1); +// *dp++ = tmp; - DMAbuffer ret_val = allocate_dma_buffer(dev, dev->next_dma_buf++, page_aligned_size); - LOG1 ("Allocated DMA buffer at 0x%lx 0x%lx size: %u\n", ret_val.pBuf, ret_val.pDma, size); - return ret_val; -} - -void pcie_init_dma_transfer_turbo (PCIdevice* dev) { - // From SHA 8cf7ff1bc7b3886a: - if (detect_arch(dev) == tt::ARCH::WORMHOLE_B0) { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c8; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } else { - c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET = 0x1fef84c0; // chip.AXI.get_path_info("ARC_CSM.ARC_PCIE_DMA_REQUEST") - } - c_DMA_TRIGGER_ADDRESS = 0x1ff30074; // chip.AXI.get_path_info("ARC_RESET.SCRATCH[5]") - c_ARC_MISC_CNTL_ADDRESS = 0x1ff30100; // chip.AXI.get_path_info("ARC_RESET.ARC_MISC_CNTL") -} - -void set_use_dma(bool msi, uint32_t dma_block_size_read_threshold_bytes, uint32_t dma_block_size_write_threshold_bytes) { - g_USE_MSI_FOR_DMA = msi; - g_DMA_BLOCK_SIZE_READ_THRESHOLD_BYTES = dma_block_size_read_threshold_bytes; - g_DMA_BLOCK_SIZE_WRITE_THRESHOLD_BYTES = dma_block_size_write_threshold_bytes; -} - -void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len) { - while (word_len-- != 0) { - *dest++ = *src++; - } -} - -void write_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, const void *data) 
{ - record_access("write_regs", byte_addr, word_len * sizeof(uint32_t), false, true, false, false); - - volatile uint32_t *dest = register_address(dev, byte_addr); - const uint32_t *src = reinterpret_cast(data); - - write_regs(dest, src, word_len); - - LOG2(" REG "); - print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); -} +// } else { +// dp = static_cast(dest); +// } -void write_tlb_reg(TTDevice *dev, uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size) { - record_access("write_tlb_reg", byte_addr, tlb_cfg_reg_size, false, true, false, false); - - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); - - volatile uint64_t *dest_qw = register_address(dev, byte_addr); - volatile uint32_t *dest_extra_dw = register_address(dev, byte_addr+8); -#if defined(__ARM_ARCH) || defined(__riscv) - // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. - tt_driver_atomics::mfence(); -#endif - *dest_qw = value_lower; - if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); - *dest_extra_dw = p_value_upper[0]; - } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. - - LOG2(" TLB "); - print_buffer (&value_lower, sizeof(value_lower), true); - if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); - print_buffer (p_value_upper, sizeof(uint32_t), true); - } -} +// // Copy the destination-aligned middle. 
+// const copy_t *sp = static_cast(src); +// std::size_t num_words = num_bytes / sizeof(copy_t); -void read_regs(TTDevice *dev, uint32_t byte_addr, uint32_t word_len, void *data) { - record_access("read_regs", byte_addr, word_len * sizeof(uint32_t), false, false, false, false); +// for (std::size_t i = 0; i < num_words; i++) +// *dp++ = *sp++; - const volatile uint32_t *src = register_address(dev, byte_addr); - uint32_t *dest = reinterpret_cast(data); +// // Finally copy any sub-word trailer, again RMW on the destination. +// auto trailing_len = num_bytes % sizeof(copy_t); +// if (trailing_len != 0) { +// copy_t tmp = *dp; - while (word_len-- != 0) { - uint32_t temp = *src++; - memcpy(dest++, &temp, sizeof(temp)); - } - LOG2(" REG "); - print_buffer (data, std::min(g_NUM_BYTES_TO_PRINT, word_len * 4), true); -} +// std::memcpy(&tmp, sp, trailing_len); -void handle_dma_timeout(TTDevice *dev, uint32_t size_bytes, bool write) { - detect_ffffffff_read(dev); - throw std::runtime_error(std::string("DMA transfer timeout: ") - + std::to_string(size_bytes) - + (write ? " byte write." : " byte read.")); -} -uint32_t pcie_dma_transfer_turbo (TTDevice *dev, uint32_t chip_addr, uint32_t host_phys_addr, uint32_t size_bytes, bool write) { - // c_timer t (""); +// *dp++ = tmp; +// } +// } - // t.now_in ("1. DMA setup"); +// void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes) { +// typedef std::uint32_t copy_t; - if (c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET == 0) { - throw std::runtime_error ("pcie_init_dma_transfer_turbo must be called before pcie_dma_transfer_turbo"); - } +// // Start by aligning the source (device) pointer. +// const volatile copy_t *sp; - arc_pcie_ctrl_dma_request_t req = { - .chip_addr = chip_addr, - .host_phys_addr = host_phys_addr, - .completion_flag_phys_addr = static_cast(pci_dma_buffer_get_physical_addr(dev->dma_completion_flag_buffer)), - .size_bytes = size_bytes, - .write = (write ? 1U : 0U), - .pcie_msi_on_done = g_USE_MSI_FOR_DMA ? 
1U : 0U, - .pcie_write_on_done = g_USE_MSI_FOR_DMA ? 0U : 1U, - .trigger = 1U, - .repeat = 1 - }; +// std::uintptr_t src_addr = reinterpret_cast(src); +// unsigned int src_misalignment = src_addr % sizeof(copy_t); - volatile uint32_t *complete_flag = (uint32_t *)pci_dma_buffer_get_user_addr(dev->dma_completion_flag_buffer); - *complete_flag = 0; +// if (src_misalignment != 0) { +// sp = reinterpret_cast(src_addr - src_misalignment); - // Configure the DMA engine - msi_interrupt_received = false; - write_regs (dev, c_CSM_PCIE_CTRL_DMA_REQUEST_OFFSET, sizeof(req) / sizeof(uint32_t), &req); +// copy_t tmp = *sp++; - // Trigger ARC interrupt 0 on core 0 - int arc_misc_cntl_value = 0; +// auto leading_len = std::min(sizeof(tmp) - src_misalignment, num_bytes); +// std::memcpy(dest, reinterpret_cast(&tmp) + src_misalignment, leading_len); +// num_bytes -= leading_len; +// dest = static_cast(dest) + leading_len; - // NOTE: Ideally, we should read the state of this register before writing to it, but that - // casues a lot of delay (reads have huge latencies) - arc_misc_cntl_value |= (1 << 16); // Cause IRQ0 on core 0 - write_regs (dev, c_ARC_MISC_CNTL_ADDRESS, 1, &arc_misc_cntl_value); +// } else { +// sp = static_cast(src); +// } - if (!g_USE_MSI_FOR_DMA) { - // t.now_in ("2. DMA poll"); - int wait_loops = 0; - while (true) { - // The complete flag is set ty by ARC (see src/hardware/soc/tb/arc_fw/lib/pcie_dma.c) - if (*complete_flag == 0xfaca) break; - wait_loops++; - } - // LOG2 ("Waited %d iterations\n", wait_loops); - } else { - // t.now_in ("2. DMA wait for MSI"); - while (msi_interrupt_received == false) - ; - } +// // Copy the source-aligned middle. 
+// copy_t *dp = static_cast(dest); +// std::size_t num_words = num_bytes / sizeof(copy_t); - return 0; // TODO: status -} +// for (std::size_t i = 0; i < num_words; i++) +// *dp++ = *sp++; -void print_device_info (struct PCIdevice &d) { - LOG1("PCIEIntfId 0x%x\n", d.id); - LOG1("VID:DID 0x%x:0x%x\n", d.vendor_id, d.device_id); - LOG1("SubVID:SubID 0x%x:0x%x\n", d.subsystem_vendor_id, d.subsystem_id); - LOG1("BSF %x:%x:%x\n", d.dwBus, d.dwSlot, d.dwFunction); - LOG1("BAR 0x%llx size: %dMB\n", d.BAR_addr, d.BAR_size_bytes / 1024 / 1024); -} +// // Finally copy any sub-word trailer. +// auto trailing_len = num_bytes % sizeof(copy_t); +// if (trailing_len != 0) { +// copy_t tmp = *sp; +// std::memcpy(dp, &tmp, trailing_len); +// } +// } // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- @@ -1424,9 +478,9 @@ namespace { }; } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, +dynamic_tlb set_dynamic_tlb(TTDevice *dev, unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { - auto architecture_implementation = dev->hdev->get_architecture_implementation(); + auto architecture_implementation = dev->get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } @@ -1436,8 +490,8 @@ dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair s tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); - auto translated_start_coords = harvested_coord_translation.at(dev -> logical_id).at(start); - auto translated_end_coords = harvested_coord_translation.at(dev -> logical_id).at(end); + auto translated_start_coords = harvested_coord_translation.at(dev->logical_id).at(start); + auto translated_end_coords = harvested_coord_translation.at(dev->logical_id).at(end); uint32_t tlb_address = address / tlb_config.size; uint32_t local_offset = address % tlb_config.size; uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); @@ -1454,24 +508,23 @@ dynamic_tlb set_dynamic_tlb(PCIdevice* dev, unsigned int tlb_index, tt_xy_pair s // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (dev->hdev->get_arch() == tt::ARCH::BLACKHOLE) ? false : true, + .static_vc = (dev->get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, }.apply_offset(tlb_config.offset); LOG1("set_dynamic_tlb() with tlb_index: %d tlb_index_offset: %d dynamic_tlb_size: %dMB tlb_base: 0x%x tlb_cfg_reg: 0x%x\n", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); // write_regs(dev -> hdev, tlb_cfg_reg, 2, &tlb_data); - write_tlb_reg(dev->hdev, tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); + dev->write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); return { tlb_base + local_offset, tlb_config.size - local_offset }; } -dynamic_tlb set_dynamic_tlb(PCIdevice *dev, unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = TLB_DATA::Relaxed) { +dynamic_tlb set_dynamic_tlb(TTDevice *dev, unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = TLB_DATA::Relaxed) { return set_dynamic_tlb(dev, tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb set_dynamic_tlb_broadcast(PCIdevice *dev, unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = TLB_DATA::Relaxed) { +dynamic_tlb set_dynamic_tlb_broadcast(TTDevice *dev, unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = TLB_DATA::Relaxed) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid - return set_dynamic_tlb (dev, tlb_index, start, end, - address, true, harvested_coord_translation, ordering); + return set_dynamic_tlb(dev, tlb_index, start, end, address, true, harvested_coord_translation, ordering); } bool tt_SiliconDevice::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, 
uint64_t tlb_size, std::uint32_t chip) { @@ -1528,25 +581,16 @@ void tt_SiliconDevice::initialize_interprocess_mutexes(int pci_interface_id, boo void tt_SiliconDevice::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { m_pci_log_level = 0; - m_dma_buf_size = 0; LOG1("---- tt_SiliconDevice::tt_SiliconDevice\n"); - static int unique_driver_id = 0; - driver_id = unique_driver_id++; // Set the log level for debugging const char* pci_log_level = std::getenv("TT_PCI_LOG_LEVEL"); if (pci_log_level) { m_pci_log_level = atoi (pci_log_level); } - set_debug_level(m_pci_log_level); + g_DEBUG_LEVEL = m_pci_log_level; LOG1 ("TT_PCI_LOG_LEVEL=%d\n", m_pci_log_level); - const char* dma_buf_size = std::getenv("TT_PCI_DMA_BUF_SIZE"); - if (dma_buf_size) { - m_dma_buf_size = atoi (dma_buf_size); - } - LOG1 ("TT_PCI_DMA_BUF_SIZE=%d\n", m_dma_buf_size); - // Don't buffer stdout. setbuf(stdout, NULL); @@ -1556,18 +600,17 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to tt_SiliconDevice constructor now."); for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - m_pci_device_map.insert({logical_device_id, new struct PCIdevice}); - struct PCIdevice* pci_device = m_pci_device_map.at(logical_device_id); - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - *pci_device = ttkmd_open ((DWORD) pci_interface_id, false); - pci_device->logical_id = logical_device_id; + if 
(!m_pci_device_map.count(logical_device_id)) { + log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); + m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + } + auto dev = m_pci_device_map.at(logical_device_id).get(); - m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pci_device->device_id, pci_device->revision_id); - if (arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { + m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, dev->pcie_device_id, dev->pcie_revision_id); + if (dev->get_arch() == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); m_num_host_mem_channels = 1; @@ -1577,7 +620,7 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->device_id, pci_device->revision_id); if (g_SINGLE_PIN_PAGE_PER_FD_WORKAROND) { - pci_device->hdev->open_hugepage_per_host_mem_ch(m_num_host_mem_channels); + dev->open_hugepage_per_host_mem_ch(m_num_host_mem_channels); } // Initialize these. Used to be in header file. @@ -1589,11 +632,8 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target initialize_interprocess_mutexes(pci_interface_id, clean_system_resources); - if (!skip_driver_allocs) - print_device_info (*pci_device); - // MT: Initial BH - hugepages will fail init - // For using silicon driver without workload to query mission mode params, no need for hugepage/dmabuf. + // For using silicon driver without workload to query mission mode params, no need for hugepage. 
if (!skip_driver_allocs){ bool hugepages_initialized = init_hugepage(logical_device_id); // Large writes to remote chips require hugepages to be initialized. @@ -1601,13 +641,11 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target if(target_remote_chips.size()) { log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); } - uint16_t channel = 0; // Single channel sufficient for this? - if (not hugepage_mapping.at(logical_device_id).at(channel)) { - init_dmabuf(logical_device_id); + if (not hugepage_mapping.at(logical_device_id).at(0)) { + log_warning(LogSiliconDriver, "No hugepage mapping at device {}", logical_device_id); } } harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map - archs_in_cluster.push_back(detect_arch(logical_to_physical_device_id_map.at(logical_device_id))); } for(const chip_id_t& chip : target_devices_in_cluster) { @@ -1618,9 +656,6 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } } -bool tt_SiliconDevice::noc_translation_en() { - return translation_tables_en; -} bool tt_SiliconDevice::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } @@ -1820,17 +855,6 @@ void tt_SiliconDevice::populate_cores() { } } -std::unordered_map tt_SiliconDevice::get_harvesting_masks_from_harvested_rows(std::unordered_map> harvested_rows) { - std::unordered_map harvesting_masks = {}; - for(const auto& chip : harvested_rows) { - uint32_t harvesting_mask_per_chip = 0; - harvesting_masks.insert({chip.first, 0}); - for(const auto& row : chip.second) { - harvesting_masks.at(chip.first) |= (1 << row); - } - } - return harvesting_masks; -} std::vector tt_SiliconDevice::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal 
for GS and WH log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); @@ -1901,26 +925,27 @@ void tt_SiliconDevice::perform_harvesting_and_populate_soc_descriptors(const std void tt_SiliconDevice::check_pcie_device_initialized(int device_id) { - struct PCIdevice* pci_device = get_pci_device(device_id); + TTDevice *pci_device = get_pci_device(device_id); + tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { - if (!is_grayskull(pci_device->device_id)) { - throw std::runtime_error("Attempted to run grayskull configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::GRAYSKULL) { + throw std::runtime_error("Attempted to run grayskull configured tt_device on " + get_arch_str(device_arch)); } } else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { - if (!is_wormhole(pci_device->device_id)) { - throw std::runtime_error("Attempted to run wormhole configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::WORMHOLE && device_arch != tt::ARCH::WORMHOLE_B0) { + throw std::runtime_error("Attempted to run wormhole configured tt_device on " + get_arch_str(device_arch)); } } else if (arch_name == tt::ARCH::BLACKHOLE) { - if (!is_blackhole(pci_device->device_id)) { - throw std::runtime_error("Attempted to run blackhole configured tt_device on " + get_arch_str(detect_arch(pci_device))); + if (device_arch != tt::ARCH::BLACKHOLE) { + throw std::runtime_error("Attempted to run blackhole configured tt_device on " + get_arch_str(device_arch)); } } else { throw std::runtime_error("Unsupported architecture: " + get_arch_str(arch_name)); } - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + auto architecture_implementation = pci_device->get_architecture_implementation(); // MT Initial BH - Add check for 
blackhole once access to ARC registers is setup through TLBs if (arch_name != tt::ARCH::BLACKHOLE) { @@ -2040,60 +1065,24 @@ void tt_SiliconDevice::initialize_pcie_devices() { check_pcie_device_initialized(device_it.first); } - // If requires multi-channel or doesn't support mmio-p2p, init iatus without p2p. - if (m_num_host_mem_channels <= 1 && arch_name == tt::ARCH::GRAYSKULL) { - init_pcie_iatus(); - } else { - // TODO: Implement support for multiple host channels on BLACKHOLE. - log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), - "More channels are not yet supported for Blackhole"); - init_pcie_iatus_no_p2p(); - } + // TODO: Implement support for multiple host channels on BLACKHOLE. + log_assert(!(arch_name == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); + init_pcie_iatus(); init_membars(); - - // https://yyz-gitlab.local.tenstorrent.com/ihamer/ll-sw/issues/25 - // Note: using pcie dma while device is idle is safe, mixing p2p is unsafe, see issue above - // TODO: disable pcie dma if p2p traffic is present, ie. 
chip-to-chip or chip-to-host - - for (auto &device_it : m_pci_device_map){ - struct PCIdevice* pci_device = device_it.second; - auto device_id = pci_device->device_id; - // MT Initial BH - Don't use PCIe DMA - bool enable_pcie_dma; - if (arch_name == tt::ARCH::BLACKHOLE) { - enable_pcie_dma = false; - } else { - enable_pcie_dma = m_dma_buf_size>0; - } - // Use DMA only for transfers that cross the size thresholds (empirically determined) - if (enable_pcie_dma) { - try { - log_trace(LogSiliconDriver, "Enable PCIE DMA with bufsize {}", m_dma_buf_size); - set_use_dma (false, 128, 0); // use dma for reads only - init_dma_turbo_buf(pci_device); - } catch (const std::exception &e) { - log_trace(LogSiliconDriver, "Disable PCIE DMA, fallback to MMIO transfers due to exepction {}", e.what()); - set_use_dma (false, 0, 0); - uninit_dma_turbo_buf(pci_device); - } - } else { - log_trace(LogSiliconDriver, "Disable PCIE DMA"); - } - } } -void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(struct PCIdevice *device, const TensixSoftResetOptions &soft_resets) { +void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(TTDevice *device, const TensixSoftResetOptions &soft_resets) { LOG1("---- tt_SiliconDevice::broadcast_tensix_risc_reset\n"); auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; LOG1("== For all tensix set soft-reset for %s risc cores.\n", TensixSoftResetOptionsToString(valid).c_str()); - auto architecture_implementation = device->hdev->get_architecture_implementation(); + auto architecture_implementation = device->get_architecture_implementation(); auto [soft_reset_reg, _] = set_dynamic_tlb_broadcast(device, architecture_implementation->get_reg_tlb(), architecture_implementation->get_tensix_soft_reset_addr(), harvested_coord_translation, tt_xy_pair(0, 0), tt_xy_pair(architecture_implementation->get_grid_size_x() - 1, architecture_implementation->get_grid_size_y() - 1 - num_rows_harvested.at(device -> logical_id)), TLB_DATA::Posted); - write_regs(device->hdev, 
soft_reset_reg, 1, &valid); + device->write_regs(soft_reset_reg, 1, &valid); tt_driver_atomics::sfence(); } @@ -2178,24 +1167,11 @@ std::vector tt_SiliconDevice::detect_available_device_ids() { return detected_device_ids; } -static bool check_dram_core_exists(const std::vector> &all_dram_cores, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel : all_dram_cores) { - for (auto dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { - struct PCIdevice* pci_device = get_pci_device(device_id); - TTDevice* dev = pci_device->hdev; +std::function tt_SiliconDevice::get_fast_pcie_static_tlb_write_callable(int device_id) { + TTDevice* dev = get_pci_device(device_id); - const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr, uint32_t dma_buf_size) { - write_block(dev, byte_addr, num_bytes, buffer_addr, dma_buf_size); + const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { + dev->write_block(byte_addr, num_bytes, buffer_addr); }; return callable; @@ -2210,8 +1186,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *pci_device = get_pci_device(target.chip); - auto *dev = pci_device->hdev; + auto *dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -2231,9 +1206,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { } void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(target.chip); - TTDevice *dev = pci_device->hdev; - + 
TTDevice *dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); // LOG1("---- tt_SiliconDevice::write_device_memory to chip:%lu %lu-%lu at 0x%x size_in_bytes: %d small_access: %d\n", @@ -2251,19 +1224,19 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 - write_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->write_block((tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - write_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->write_block(tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + auto [mapped_address, tlb_size] = set_dynamic_tlb(dev, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + dev->write_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2276,8 +1249,7 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t 
size_in_bytes, const std::string& fallback_tlb) { // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. LOG1("---- tt_SiliconDevice::read_device_memory to chip:%lu %lu-%lu at 0x%x size_in_bytes: %d\n", target.chip, target.x, target.y, address, size_in_bytes); - struct PCIdevice* pci_device = get_pci_device(target.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); @@ -2294,20 +1266,20 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 - read_block(dev, (tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->read_block((tlb_offset + address % tlb_size) + BAR0_BH_SIZE, size_in_bytes, buffer_addr); } else { - read_block(dev, tlb_offset + address % tlb_size, size_in_bytes, buffer_addr, m_dma_buf_size); + dev->read_block(tlb_offset + address % tlb_size, size_in_bytes, buffer_addr); } LOG1 (" read_block called with tlb_offset: %d, tlb_size: %d\n", tlb_offset, tlb_size); } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + auto [mapped_address, tlb_size] = set_dynamic_tlb(dev, tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = 
std::min((uint64_t)size_in_bytes, tlb_size); - read_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + dev->read_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; address += transfer_size; @@ -2317,7 +1289,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std } } -void tt_SiliconDevice::read_dma_buffer( +void tt_SiliconDevice::read_buffer( void* mem_ptr, std::uint32_t address, std::uint16_t channel, @@ -2330,20 +1302,18 @@ void tt_SiliconDevice::read_dma_buffer( if(hugepage_mapping.at(src_device_id).at(channel)) { user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } else if (buf_mapping) { - user_scratchspace = static_cast(buf_mapping) + (address & DMA_MAP_MASK); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepages are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); err_msg += " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)"; throw std::runtime_error(err_msg); } - LOG1("---- tt_SiliconDevice::read_dma_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); + LOG1("---- tt_SiliconDevice::read_buffer (src_device_id: %d, ch: %d) from 0x%lx\n", src_device_id, channel, user_scratchspace); memcpy(mem_ptr, user_scratchspace, size_in_bytes); } -void tt_SiliconDevice::write_dma_buffer( +void tt_SiliconDevice::write_buffer( const void *mem_ptr, std::uint32_t size, std::uint32_t address, @@ -2352,43 +1322,34 @@ void tt_SiliconDevice::write_dma_buffer( void * user_scratchspace = nullptr; if(hugepage_mapping.at(src_device_id).at(channel)) { - log_assert(size <= HUGEPAGE_REGION_SIZE, "write_dma_buffer data has larger 
size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); + log_assert(size <= HUGEPAGE_REGION_SIZE, "write_buffer data has larger size {} than destination buffer {}", size, HUGEPAGE_REGION_SIZE); log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_mapping.at(src_device_id).at(channel), (address & HUGEPAGE_MAP_MASK), channel, size); user_scratchspace = static_cast(hugepage_mapping.at(src_device_id).at(channel)) + (address & HUGEPAGE_MAP_MASK); - } - else if(buf_mapping) { - log_assert(size <= DMA_BUF_REGION_SIZE, "write_dma_buffer data has larger size {} than destination buffer {}", size, DMA_BUF_REGION_SIZE); - log_debug(LogSiliconDriver, "Using DMA Buffer at address {} offset {} size {}", - buf_mapping, - address, - size); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - user_scratchspace = reinterpret_cast(buf_mapping); } else { - std::string err_msg = "write_dma_buffer: Hugepage or DMAbuffer are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); + std::string err_msg = "write_buffer: Hugepage are not allocated for src_device_id: " + std::to_string(src_device_id) + " ch: " + std::to_string(channel); throw std::runtime_error(err_msg); } memcpy(user_scratchspace, mem_ptr, size); } -uint32_t tt_SiliconDevice::get_power_state_arc_msg(struct PCIdevice* pci_device, tt_DevicePowerState state) { +uint32_t tt_SiliconDevice::get_power_state_arc_msg(TTDevice* pci_device, tt_DevicePowerState state) { uint32_t msg = 0xaa00; switch (state) { case BUSY: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_busy(); + msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_busy(); break; } case LONG_IDLE: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_long_idle(); + msg |= 
pci_device->get_architecture_implementation()->get_arc_message_arc_go_long_idle(); break; } case SHORT_IDLE: { - msg |= pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); + msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } default: throw std::runtime_error("Unrecognized power state."); @@ -2400,7 +1361,7 @@ void tt_SiliconDevice::set_pcie_power_state(tt_DevicePowerState state) { for (auto &device_it : m_pci_device_map){ int d = device_it.first; - struct PCIdevice* pci_device = device_it.second; + auto pci_device = device_it.second.get(); uint32_t msg = get_power_state_arc_msg(pci_device, state); std::stringstream ss; ss << state; @@ -2427,8 +1388,8 @@ int tt_SiliconDevice::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + TTDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); if (exit_code != 0) { throw std::runtime_error("Failed to get aiclk value with exit code " + std::to_string(exit_code)); } @@ -2444,72 +1405,24 @@ std::map tt_SiliconDevice::get_clocks() { return clock_freq_map; } -//! Simple test of communication to device/target. true if it passes. 
-// bool tt_SiliconDevice::test_write_read(tt_cxy_pair target) { -// WARN("---- tt_SiliconDevice::test_write_read not implemented\n"); -// return true; -// } - -// bool tt_SiliconDevice::test_write_speed (struct PCIdevice* pci_device) { -// TTDevice *dev = pci_device->hdev; - -// if (dev->bar0_uc == dev->bar0_wc) { -// WARN("---- tt_SiliconDevice::test_write_speed WC not configured\n"); -// } - -// std::byte fill_value{0x42}; -// std::vector write_buf(architecture_implementation->get_static_tlb_size(), fill_value); - -// auto before = std::chrono::high_resolution_clock::now(); -// for (std::uint32_t y = 1; y < architecture_implementation->get_grid_size_y(); y++) -// { -// for (std::uint32_t x = 1; x < architecture_implementation->get_grid_size_x(); x++) -// { -// auto tlb_index = map_core_to_tlb(tt_xy_pair(x, y)); -// if (tlb_index < 0) { continue; } - -// auto offset = tlb_index * architecture_implementation->get_static_tlb_size(); - -// memcpy(static_cast(dev->bar0_wc) + offset, write_buf.data(), write_buf.size()); -// } -// } -// auto after = std::chrono::high_resolution_clock::now(); - -// std::chrono::duration interval = after - before; - -// unsigned int write_bw = 120 * std::milli::den / interval.count(); - -// LOG1("---- tt_SiliconDevice::test_write_speed Wrote 120MB @ %u MB/s\n", write_bw); - -// return (write_bw >= 512); // L1 write BW scales with AICLK, for low AICLK it will be very slow. -// } - tt_SiliconDevice::~tt_SiliconDevice () { LOG1 ("---- tt_SiliconDevice::~tt_SiliconDevice\n"); - for(int i = 0; i < archs_in_cluster.size(); i++) { - if(archs_in_cluster[i] == tt::ARCH::WORMHOLE) { - log_warning(LogSiliconDriver, "Virtual device {} for this run is Wormhole A0. This architecture is now deprecated. 
Please use Wormhole B0 for testing.", i); - } - } cleanup_shared_host_state(); for (auto &device_it : m_pci_device_map){ chip_id_t device_id = device_it.first; + // TTDevice *dev = device_it.second.get(); for (int ch = 0; ch < m_num_host_mem_channels; ch ++) { if (hugepage_mapping.at(device_id).at(ch)) { munmap(hugepage_mapping.at(device_id).at(ch), hugepage_mapping_size.at(device_id).at(ch)); } } - - struct PCIdevice* pci_device = device_it.second; - - ttkmd_close (*pci_device); - delete pci_device; - pci_device = NULL; + + device_it.second.reset(); } m_pci_device_map.clear(); ndesc.reset(); @@ -2531,15 +1444,11 @@ std::optional> tt_SiliconDevice::get_tlb_data_fro return tlb_data; } -uint32_t tt_SiliconDevice::get_m_dma_buf_size() const { - return m_dma_buf_size; -} - void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb"); - struct PCIdevice* pci_device = get_pci_device(logical_device_id); + TTDevice *pci_device = get_pci_device(logical_device_id); set_dynamic_tlb(pci_device, tlb_index, core, address, harvested_coord_translation, ordering); - auto tlb_size = std::get<1>(pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value()); + auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } @@ -2547,167 +1456,39 @@ void tt_SiliconDevice::configure_tlb(chip_id_t logical_device_id, tt_xy_pair cor void tt_SiliconDevice::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { log_assert(ordering == 
TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in tt_SiliconDevice::configure_tlb."); log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in tt_SiliconDevice::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); - dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; -} -// This function checks that all TLBs are properly setup. It should return 0 if all is good (i.e. if init_pcie_tlb is called prior) -// int tt_SiliconDevice::test_pcie_tlb_setup (struct PCIdevice* pci_device) { - // LOG1("---- tt_SiliconDevice::test_pcie_tlb_setup\n"); - // uint64_t tlb_data; - // int ret_val; - // // Check static TLBs (only active Tensix cores for GS ... Active tensix cores + ethernet cores for WH) - // for (uint32_t y = 0; y < architecture_implementation->get_grid_size_y() - num_rows_harvested; y++) { - // for (uint32_t x = 0; x < architecture_implementation->get_grid_size_x(); x++) { - // int tlb_index = get_static_tlb_index(tt_xy_pair(x, y)); - // auto translated_coords = harvested_coord_translation.at(pci_device -> id).at(tt_xy_pair(x, y)); - // if (tlb_index < 0) { continue; } - - // auto tlb_data_attempt = architecture_implementation->get_tlb_data(tlb_index, TLB_DATA { - // .x_end = translated_coords.x, - // .y_end = translated_coords.y, - // }); - // if (!tlb_data_attempt.has_value()) { - // throw std::runtime_error("Error setting up (" + std::to_string(x) + ", " + std::to_string(y) + ") in pcie_tlb_test."); - // } - // uint64_t expected_tlb_data = tlb_data_attempt.value(); - - // uint32_t tlb_setup_addr = architecture_implementation->get_static_tlb_cfg_addr() + 8 * tlb_index; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data); - - // } - // } - 
- // // Check 16MB TLBs 1-16 for peer-to-peer communication with DRAM channel 0 - // uint64_t peer_dram_offset = architecture_implementation->get_dram_channel_0_peer2peer_region_start(); - // for (uint32_t tlb_id = 1; tlb_id < 17; tlb_id++) { - // auto tlb_data_expected = architecture_implementation->get_tlb_data(architecture_implementation->get_tlb_base_index_16m() + tlb_id, TLB_DATA { - // .local_offset = peer_dram_offset / architecture_implementation->get_dynamic_tlb_16m_size(), - // .x_end = architecture_implementation->get_dram_channel_0_x(), - // .y_end = architecture_implementation->get_dram_channel_0_y(), - // .ordering = TLB_DATA::Posted, - // .static_vc = true, - // }); - // uint64_t tlb_data_observed; - // uint32_t tlb_setup_addr = architecture_implementation->get_dynamic_tlb_16m_cfg_addr() + 8 * tlb_id; // Each tlb setup takes 2 dwords, hence 8 bytes - // read_regs(pci_device->hdev, tlb_setup_addr, 2, &tlb_data_observed); - // ret_val = (tlb_data_expected == tlb_data_observed) ? 0 : 1; - // if (ret_val != 0) return ret_val; - // peer_dram_offset += architecture_implementation->get_dynamic_tlb_16m_size(); - // } - // return ret_val; -//} - -// Set up IATU for peer2peer -// Consider changing this function -void tt_SiliconDevice::init_pcie_iatus() { - - int starting_device_id = m_pci_device_map.begin()->first; - int ending_device_id = m_pci_device_map.rbegin()->first; - int num_enabled_devices = m_pci_device_map.size(); - - LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d starting_device_id: %d ending_device_id: %d\n", num_enabled_devices, starting_device_id, ending_device_id); - log_assert(m_num_host_mem_channels <= 1, "Maximum of 1x 1GB Host memory channels supported."); - - // Requirement for ring topology in GS, but since WH can share below code, check it again here for mmio mapped devices, - // otherwise us/ds device calculations will not be correct. Don't expect to see this for Wormhole today. 
- log_assert((starting_device_id + num_enabled_devices - 1) == ending_device_id, "The set of workload mmio-mapped target_device_id's must be sequential, without gaps."); - - for (auto &src_device_it : m_pci_device_map){ - int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; - - uint32_t current_peer_region = 0; - const int num_peer_ids = 3; // 0=HOST, 1=UPSTREAM Device, 2=DOWNSTREAM Device, 3=Unused - for (int peer_id = 0; peer_id < num_peer_ids; peer_id++) { - - //TODO: migrate this to huge pages when that support is in - if (peer_id == 0){ - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Host. current_peer_region: %d\n", src_pci_id, peer_id, current_peer_region); - // Device to Host (peer_id==0) - const uint16_t host_memory_channel = 0; // Only single channel supported. - if (hugepage_mapping.at(src_pci_id).at(host_memory_channel)) { - iatu_configure_peer_region(src_pci_id, current_peer_region, hugepage_physical_address.at(src_pci_id).at(host_memory_channel), HUGEPAGE_REGION_SIZE); - host_channel_size.insert({(int)src_pci_device->logical_id, {HUGEPAGE_REGION_SIZE}}); - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, current_peer_region, buf_physical_addr, DMA_BUF_REGION_SIZE); - } - } else if (peer_id == 1 || peer_id == 2){ - // Device to Device (peer_id==1 : Upstream, peer_id==2 : Downstream) - // For determining upstream/downstream peers in ring topology - this matches is_target_device_downstream() in net2pipe - int upstream_peer_device_id = src_pci_id > starting_device_id ? src_pci_id - 1 : ending_device_id; - int downstream_peer_device_id = src_pci_id < (ending_device_id) ? src_pci_id + 1 : starting_device_id; - - int peer_device_id = peer_id == 1 ? 
upstream_peer_device_id : downstream_peer_device_id; - - struct PCIdevice* peer_pci_device = m_pci_device_map.at(peer_device_id); - uint64_t peer_BAR_addr = peer_pci_device->BAR_addr; - uint32_t peer_pci_interface_id = peer_pci_device->id; - uint32_t TLB1_16MB_OFFSET = 0; // Was 192MB offset to DRAM, now added by net2pipe since ATU maps to base of 512MB PCI Bar. - uint32_t PEER_REGION_SIZE = 1024 * 1024 * 1024; // Was 256MB. Want 512MB. Updated to 1024MB to match net2pipe more easily. - // FIXME - How to reduce PEER_REGION_SIZE=256 again, and make this still work? Need to make the ATU mappings non-contiguous 256MB chunks (every 1GB?) to match net2pipe? - - LOG2 ("Setting up src_pci_id: %d peer_id: %d to Device (upstream_peer_device_id: %d downstream_peer_device_id: %d) gives peer_device_id: %d (peer_pci_interface_id: %d) current_peer_region: %d\n", - src_pci_id, peer_id, upstream_peer_device_id, downstream_peer_device_id, peer_device_id, peer_pci_interface_id, current_peer_region ); - - iatu_configure_peer_region (src_pci_id, current_peer_region, peer_BAR_addr + TLB1_16MB_OFFSET, PEER_REGION_SIZE); - } - current_peer_region ++; - } - } + log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } // TT<->TT P2P support removed in favor of increased Host memory. 
-void tt_SiliconDevice::init_pcie_iatus_no_p2p() { - +void tt_SiliconDevice::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); - LOG1("---- tt_SiliconDevice::init_pcie_iatus_no_p2p() num_enabled_devices: %d\n", num_enabled_devices); + LOG1("---- tt_SiliconDevice::init_pcie_iatus() num_enabled_devices: %d\n", num_enabled_devices); log_assert(m_num_host_mem_channels <= g_MAX_HOST_MEM_CHANNELS, "Maximum of {} 1GB Host memory channels supported.", g_MAX_HOST_MEM_CHANNELS); for (auto &src_device_it : m_pci_device_map){ int src_pci_id = src_device_it.first; - struct PCIdevice* src_pci_device = src_device_it.second; + TTDevice* src_pci_device = src_device_it.second.get(); // Device to Host (multiple channels) for (int channel_id = 0; channel_id < m_num_host_mem_channels; channel_id++) { - // TODO - Try to remove DMA buffer support. if (hugepage_mapping.at(src_pci_id).at(channel_id)) { std::uint32_t region_size = HUGEPAGE_REGION_SIZE; if(channel_id == 3) region_size = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, src_pci_id); iatu_configure_peer_region(src_pci_id, channel_id, hugepage_physical_address.at(src_pci_id).at(channel_id), region_size); if(host_channel_size.find(src_pci_device->logical_id) == host_channel_size.end()) { - host_channel_size.insert({(int)src_pci_device->logical_id, {}}); + host_channel_size.insert({src_pci_device->logical_id, {}}); } host_channel_size.at(src_pci_device -> logical_id).push_back(region_size); - } else if(buf_mapping) { - log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to DMA buffer.", channel_id); - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - iatu_configure_peer_region(src_pci_id, channel_id, buf_physical_addr, DMA_BUF_REGION_SIZE); + } else { + std::string err_msg = "init_pcie_iatus: Hugepages are not allocated for src_pci_id: " + 
std::to_string(src_pci_id) + " ch: " + std::to_string(channel_id); + throw std::runtime_error(err_msg); } } } } -uint32_t tt_SiliconDevice::dma_allocation_size(chip_id_t src_device_id) -{ - - // Fall back to first device if no src_device_id is provided. Assumes all devices have the same size, which is true. - chip_id_t device_index = src_device_id == -1 ? m_pci_device_map.begin()->first : src_device_id; - - if (hugepage_mapping.at(device_index).at(0)) { - return HUGEPAGE_REGION_SIZE; - } else if (buf_mapping) { - return DMA_BUF_REGION_SIZE; - } else { - log_fatal("Nothing has been allocated yet"); - return 0; - } -} - - - - // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize) { @@ -2795,52 +1576,6 @@ int tt_SiliconDevice::open_hugepage_file(const std::string &dir, chip_id_t physi return fd; } -bool tt_SiliconDevice::init_dmabuf(chip_id_t device_id) { - if (buf_mapping == nullptr) { - - TTDevice *dev = m_pci_device_map.begin()->second->hdev; - - DMAbuffer buf = pci_allocate_dma_buffer(dev, DMA_BUF_REGION_SIZE); - buf_mapping = static_cast(reinterpret_cast(pci_dma_buffer_get_user_addr(buf))); - buf_physical_addr= pci_dma_buffer_get_physical_addr(buf); - } - return true; -} - -bool tt_SiliconDevice::init_dma_turbo_buf (struct PCIdevice* pci_device) { - // Allocate buffers for DMA transfer data and flag - pci_device->hdev->dma_completion_flag_buffer = pci_allocate_dma_buffer(pci_device->hdev, sizeof(uint64_t)); - pci_device->hdev->dma_transfer_buffer = pci_allocate_dma_buffer(pci_device->hdev, m_dma_buf_size); - pcie_init_dma_transfer_turbo(pci_device); - return true; -} - -bool tt_SiliconDevice::uninit_dma_turbo_buf (struct PCIdevice* pci_device) { - struct DMAbuffer &flag_buffer = pci_device->hdev->dma_completion_flag_buffer; - struct DMAbuffer &xfer_buffer = pci_device->hdev->dma_transfer_buffer; - if (flag_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); 
it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == flag_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(flag_buffer.pBuf, flag_buffer.size); - } - if (xfer_buffer.pBuf) { - for (auto it = pci_device->hdev->dma_buffer_mappings.begin(); it != pci_device->hdev->dma_buffer_mappings.end();) { - if (it->pBuf == xfer_buffer.pBuf) { - it = pci_device->hdev->dma_buffer_mappings.erase(it); - } else { - ++it; - } - } - munmap(xfer_buffer.pBuf, xfer_buffer.size); - } - return true; -} - // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""){ if (std::filesystem::exists(filename)){ @@ -2858,7 +1593,8 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { const std::size_t mapping_size = (std::size_t) HUGEPAGE_REGION_SIZE; // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization) - auto physical_device_id = m_pci_device_map.at(device_id)->id; + auto dev = m_pci_device_map.at(device_id).get(); + auto physical_device_id = dev->device_id; std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { @@ -2884,7 +1620,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { close(hugepage_fd); if (mapping == MAP_FAILED) { - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(m_pci_device_map.at(device_id)->device_id, m_pci_device_map.at(device_id)->revision_id); + uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(physical_device_id, dev->pcie_revision_id); WARN("---- ttSiliconDevice::init_hugepage: physical_device_id: %d ch: %d mapping hugepage failed. (errno: %s).\n", physical_device_id, ch, strerror(errno)); WARN("---- Possible hint: /proc/cmdline should have hugepages=N, nr_hugepages=N - (N = NUM_MMIO_TT_DEVICES * (is_grayskull ? 1 : 4). 
NUM_MMIO_DEVICES = %d\n", num_tt_mmio_devices_for_arch); print_file_contents("/proc/cmdline");\ @@ -2907,7 +1643,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { pin_pages.in.virtual_address = reinterpret_cast(mapping); pin_pages.in.size = mapping_size; - auto &fd = g_SINGLE_PIN_PAGE_PER_FD_WORKAROND ? m_pci_device_map.at(device_id)->hdev->device_fd_per_host_ch[ch] : m_pci_device_map.at(device_id)->hdev->device_fd; + auto &fd = g_SINGLE_PIN_PAGE_PER_FD_WORKAROND ? dev->device_fd_per_host_ch[ch] : dev->device_fd; if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { WARN("---- ttSiliconDevice::init_hugepage: physical_device_id: %d ch: %d TENSTORRENT_IOCTL_PIN_PAGES failed (errno: %s). Common Issue: Requires TTMKD >= 1.11, see following file contents...\n", physical_device_id, ch, strerror(errno)); @@ -2933,23 +1669,23 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { int tt_SiliconDevice::test_setup_interface () { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - TTDevice *dev = m_pci_device_map.begin()->second->hdev; + TTDevice *dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = set_dynamic_tlb(dev, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; uint32_t regval = 0; - read_regs(dev, mapped_reg, 1, ®val); + dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 
0 : 1; return ret_val; } else if (arch_name == tt::ARCH::WORMHOLE || arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - TTDevice *dev = m_pci_device_map.begin()->second->hdev; + TTDevice *dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = set_dynamic_tlb(dev, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; uint32_t regval = 0; - read_regs(dev, mapped_reg, 1, ®val); + dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; return ret_val; } @@ -2971,80 +1707,24 @@ int tt_SiliconDevice::test_setup_interface () { } } -// Code used to test non existent broadcast TLB -// Keep for now, in case we need to test broadcast TLB again. -// int tt_SiliconDevice::test_broadcast (int logical_device_id) { -// LOG1("---- tt_SiliconDevice::test_broadcast\n"); - -// int ret_val = 0; -// struct PCIdevice* pci_device = get_pci_device(logical_device_id); - -// assert (test_pcie_tlb_setup(pci_device) == 0); - -// std::vector fill_array (1024, 0); -// uint32_t broadcast_bar_offset = architecture_implementation->get_broadcast_tlb_index() * architecture_implementation->get_static_tlb_size(); -// LOG2 ("broadcast_bar_offset = 0x%x\n", broadcast_bar_offset); - -// uint64_t fill_array_ptr = (uint64_t)(&fill_array[0]); - -// // a. 
Fill with increasing numbers -// // -// for (size_t i = 0; i < fill_array.size(); i++) { -// fill_array[i] = i; -// } -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array[i] == i) ? 0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// // b. Test with zeroes -// // -// std::vector fill_array_zeroes (1024, 0); -// uint64_t fill_array_zeroes_ptr = (uint64_t)(&fill_array_zeroes[0]); -// write_block(pci_device->hdev, broadcast_bar_offset, fill_array.size() * sizeof (std::uint32_t), fill_array_zeroes_ptr, m_dma_buf_size); - -// // Check individual locations -// for (uint32_t xi = 0; xi < architecture_implementation->get_t6_x_locations().size(); xi++) { -// for (uint32_t yi = 0; yi < architecture_implementation->get_t6_y_locations().size(); yi++) { -// tt_cxy_pair read_loc(logical_device_id, architecture_implementation->get_t6_x_locations()[xi], architecture_implementation->get_t6_y_locations()[yi]); -// read_vector (fill_array, read_loc, 0, fill_array.size() * sizeof (fill_array_zeroes[0]) ); -// for (size_t i = 0; i < fill_array.size(); i++) { -// ret_val = (fill_array_zeroes[i] == 0) ? 
0 : 1; -// if (ret_val) return ret_val; -// } -// } -// } - -// return ret_val; -// } - void tt_SiliconDevice::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - TTDevice* dev = get_pci_device(logical_device_id)->hdev; + TTDevice *dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - write_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? } else { - write_regs (dev, addr, 1, &data); + dev->write_regs(addr, 1, &data); } } uint32_t tt_SiliconDevice::bar_read32 (int logical_device_id, uint32_t addr) { - TTDevice* dev = get_pci_device(logical_device_id)->hdev; + TTDevice* dev = get_pci_device(logical_device_id); uint32_t data; if (addr < dev->bar0_uc_offset) { - read_block (dev, addr, sizeof(data), reinterpret_cast(&data), m_dma_buf_size); + dev->read_block(addr, sizeof(data), reinterpret_cast(&data)); } else { - read_regs (dev, addr, 1, &data); + dev->read_regs(addr, 1, &data); } return data; } @@ -3058,12 +1738,12 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo } log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - struct PCIdevice* pci_device = get_pci_device(logical_device_id); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + TTDevice *pci_device = get_pci_device(logical_device_id); + auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. 
std::string msg_type = "ARC_MSG"; - const scoped_lock lock(*get_mutex(msg_type, pci_device->id)); + const scoped_lock lock(*get_mutex(msg_type, pci_device->device_id)); uint32_t fw_arg = arg0 | (arg1<<16); int exit_code = 0; @@ -3108,7 +1788,7 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo } } - detect_ffffffff_read(pci_device->hdev); + // detect_ffffffff_read(pci_device); return exit_code; } @@ -3118,8 +1798,8 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; if(peer_region_id == 3) region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - struct PCIdevice* pci_device = get_pci_device(logical_device_id); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); + TTDevice *pci_device = get_pci_device(logical_device_id); + auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. // TODO: Remove when ARC is implemented on BH. 
@@ -3139,15 +1819,15 @@ int tt_SiliconDevice::iatu_configure_peer_region (int logical_device_id, uint32_ uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - write_regs(reinterpret_cast(static_cast(pci_device->hdev->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); + 
pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); + pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); } else { bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); @@ -3193,8 +1873,8 @@ uint32_t tt_SiliconDevice::get_harvested_rows (int logical_device_id) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); + TTDevice *pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); @@ -3224,48 +1904,20 @@ void tt_SiliconDevice::enable_local_ethernet_queue(const chip_id_t &device_id, i } } -void *tt_SiliconDevice::channel_address(std::uint32_t offset, const tt_cxy_pair& target) { - log_assert(ndesc->is_chip_mmio_capable(target.chip), "Cannot call channel_address for non-MMIO device"); - struct PCIdevice* pci_device = get_pci_device(target.chip); - auto architecture_implementation = pci_device->hdev->get_architecture_implementation(); - std::uint64_t bar0_offset; - - // Temporary hack for blackhole bringup. - if (arch_name == tt::ARCH::BLACKHOLE) { - // We use BAR4 segment for mapping for Blackhole. 
- log_assert(tlbs_init, "TLBs were not initialized."); - std::int32_t tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); - auto [tlb_offset, tlb_size] = pci_device->hdev->get_architecture_implementation()->describe_tlb(tlb_index).value(); - - log_assert(pci_device->hdev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE, "BAR4 not initialized, or TLBs not initialized properly."); - return static_cast(pci_device->hdev->bar4_wc) + tlb_offset + offset; - } else { - // This hard-codes that we use 16MB TLB #1 onwards for the mapping. - bar0_offset = offset - architecture_implementation->get_dram_channel_0_peer2peer_region_start() - + architecture_implementation->get_dynamic_tlb_16m_base() + architecture_implementation->get_dynamic_tlb_16m_size(); - } - - return static_cast(pci_device->hdev->bar0_wc) + bar0_offset; -} - void *tt_SiliconDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { - if (hugepage_mapping.at(src_device_id).at(channel) != nullptr) { return static_cast(hugepage_mapping.at(src_device_id).at(channel)) + offset; - } else if(buf_mapping) { - // we failed when initializing huge pages, we are using a 1MB DMA buffer as a stand-in - return static_cast(buf_mapping) + offset; } else { return nullptr; } } // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. 
-inline struct PCIdevice* tt_SiliconDevice::get_pci_device(int device_id) const { +inline TTDevice* tt_SiliconDevice::get_pci_device(int device_id) const { if (!m_pci_device_map.count(device_id)){ throw std::runtime_error(std::string("device_id: " + std::to_string(device_id) + " attempted to be accessed, but is not enabled.")); } - return m_pci_device_map.at(device_id); + return m_pci_device_map.at(device_id).get(); } std::shared_ptr tt_SiliconDevice::get_mutex(const std::string& tlb_name, int pci_interface_id) { @@ -3273,46 +1925,6 @@ std::shared_ptr tt_SiliconDevice::get_mutex(co return hardware_resource_mutex_map.at(mutex_name); } - -std::unordered_map tt_SiliconDevice::get_logical_to_physical_mmio_device_id_map(std::vector physical_device_ids){ - - std::unordered_map logical_to_physical_mmio_device_id_map; - - LOG1("get_logical_to_physical_mmio_device_id_map() -- num_physical_devices: %d\n", physical_device_ids.size()); - - for (int logical_device_idx=0; logical_device_idx < physical_device_ids.size(); logical_device_idx++){ - logical_to_physical_mmio_device_id_map.insert({logical_device_idx, physical_device_ids.at(logical_device_idx)}); - } - - return logical_to_physical_mmio_device_id_map; - -} - - -// Get PCI bus_id info for looking up TT devices in hwloc to find associated CPU package. 
-std::map tt_SiliconDevice::get_physical_device_id_to_bus_id_map(std::vector physical_device_ids){ - - std::map physical_device_id_to_bus_id_map; - - for (auto &pci_interface_id : physical_device_ids){ - - auto ttdev = std::make_unique(TTDevice::open(pci_interface_id)); - - std::ostringstream pci_bsf; - pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_bus << ":"; - pci_bsf << std::hex << std::setw(2) << std::setfill('0') << (int) ttdev->pci_device << "."; - pci_bsf << std::hex << (int) ttdev->pci_function; - - std::string pci_bsf_str = pci_bsf.str(); - LOG2("get_physical_device_id_to_bus_id_map() -- pci_interface_id: %d BSF: %s\n", pci_interface_id, pci_bsf_str.c_str()); - physical_device_id_to_bus_id_map.insert({pci_interface_id, pci_bsf_str}); - - } - - return physical_device_id_to_bus_id_map; - -} - uint64_t tt_SiliconDevice::get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) { uint64_t result = chip_y; uint64_t noc_addr_local_bits_mask = (1UL << eth_interface_params.noc_addr_local_bits) - 1; @@ -3345,7 +1957,6 @@ bool tt_SiliconDevice::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_ * * Relevant functions: * - write_to_non_mmio_device - * - rolled_write_to_non_mmio_device * - read_from_non_mmio_device * * The non-MMIO read/write functions (excluding the `*_epoch_cmd` variants) are responsible for the @@ -3442,8 +2053,7 @@ void tt_SiliconDevice::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); int& active_core_for_txn = non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; @@ -3579,282 +2189,6 @@ void tt_SiliconDevice::write_to_non_mmio_device( } } - -// Specialized function for small epoch commands: -// 1) uses separate eth cores than other non-mmio transfers hence does not require mutex -// 2) does not have the code paths for transfers larger than 32kB (1024 cmds) -// 3) only reads erisc_q_ptrs_epoch once, or when the queues are full -// 4) only updates wptr on eth command queues for the last epoch command or when the queue is full or when switching eth cores based on eth-ordered-writes policy, or when -// eth-ordered-writes are not supported but current write must be ordered (flush prev wrptr). -// 5) When eth-ordered-write not supported, allow flush to be used as ordering mechanism when ordering is requested via arg. When eth-ordered-write is supported, always use it -// and ensure ordering to same remote chip destinations by always using same remote xfer eth core for a given destination based on noc xy. Must ensure wrptr is flushed on -// switch of eth cores, and copy of rdptr/wrptr maintained on host for each eth xfer core. -void tt_SiliconDevice::write_to_non_mmio_device_send_epoch_cmd(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!non_mmio_transfer_cores_customized, "{} cannot be used if ethernet cores for host->cluster transfers are customized. 
The default Ethernet Core configuration must be used.", __FUNCTION__); - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - const auto &mmio_capable_chip = ndesc->get_closest_mmio_capable_chip(core.chip); - const auto target_chip = ndesc->get_chip_locations().at(core.chip); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - - // read all eth queue ptrs for the first time, and initialize wrptr_updated bool for strict ordering. - if (!erisc_q_ptrs_initialized) { - for (int core_epoch = EPOCH_ETH_CORES_START_ID; core_epoch < EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS + EPOCH_ETH_CORES_START_ID; core_epoch++) { - erisc_q_ptrs_epoch[core_epoch].reserve(eth_interface_params.remote_update_ptr_size_bytes*2/sizeof(uint32_t)); - read_device_memory(erisc_q_ptrs_epoch[core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - erisc_q_wrptr_updated[core_epoch] = false; - erisc_q_ptrs_initialized = true; - } - } - - std::vector erisc_command(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - routing_cmd_t *new_cmd = (routing_cmd_t *)&erisc_command[0]; - std::vector data_block; - - // Two mechanisms for ordering depending on eth fw version. - if (use_ethernet_ordered_writes) { - // Feature in this function to ensure ordering via eth-ordered-writes by using same eth core for all epoch writes to same dest noc xy. 
- auto &soc_desc = get_soc_descriptor(mmio_capable_chip); - int core_id = core.x * soc_desc.grid_size.y + core.y; - int new_active_core_epoch = (core_id % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID; - - // Switch eth cores, and if wrptr was not flushed to device for previous eth core, do it now. - if (new_active_core_epoch != active_core_epoch) { - if (!erisc_q_wrptr_updated[active_core_epoch]) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } - active_core_epoch = new_active_core_epoch; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - } - } else if (ordered_with_prev_remote_write) { - // Flush used as ordering mechanism when eth ordered writes are unsupported. If previous write requires flush, - // handle it here before setting flush_non_mmio for the current write. - if (!erisc_q_wrptr_updated[active_core_epoch]) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } - wait_for_non_mmio_flush(); - } - - flush_non_mmio = true; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - bool use_dram = size_in_bytes > 256 * DATA_WORD_SIZE ? true : false; - uint32_t max_block_size = use_dram ? 
host_address_params.eth_routing_block_size : eth_interface_params.max_block_size; - uint32_t block_size; - - // Ethernet ordered writes must originate from same erisc core, so prevent updating active core here. - while (is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - if (!use_ethernet_ordered_writes){ - active_core_epoch++; - log_assert(active_core_epoch - EPOCH_ETH_CORES_START_ID >= 0, "Invalid ERISC core for sending epoch commands"); - active_core_epoch = ((active_core_epoch - EPOCH_ETH_CORES_START_ID) % EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS) + EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_epoch]; - } - read_device_memory(erisc_q_ptrs_epoch[active_core_epoch].data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - } - - uint32_t req_wr_ptr = erisc_q_ptrs_epoch[active_core_epoch][0] & eth_interface_params.cmd_buf_size_mask; - if (address & 0x1F) { // address not 32-byte aligned - // can send it in one transfer, no need to break it up - log_assert(size_in_bytes == DATA_WORD_SIZE, "Non-mmio cmd queue update is too big"); - block_size = DATA_WORD_SIZE; - } else { - // can send it in one transfer, no need to break it up - log_assert(size_in_bytes <= max_block_size, "Non-mmio cmd queue update is too big. size_in_bytes: {} exceeds max_block_size: {}", size_in_bytes, max_block_size); - block_size = size_in_bytes; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - if (use_ethernet_ordered_writes) { - req_flags |= eth_interface_params.cmd_ordered; - } - - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_epoch * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. - - // send the data - if (req_flags & eth_interface_params.cmd_data_block) { - // Copy data to sysmem or device DRAM for Block mode - if (use_dram) { - req_flags |= eth_interface_params.cmd_data_block_dram; - resp_flags |= eth_interface_params.cmd_data_block_dram; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_to_sysmem(data_block, host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); - } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; - size_buffer_to_capacity(data_block, block_size); - memcpy(&data_block[0], mem_ptr, block_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); - } - tt_driver_atomics::sfence(); - } - - // send the write request - log_assert((req_flags & eth_interface_params.cmd_data_block) ? (address & 0x1F) == 0 : true, "Block mode address must be 32-byte aligned."); - - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = req_flags & eth_interface_params.cmd_data_block ? 
block_size : *mem_ptr; - new_cmd->flags = req_flags; - if (use_dram) { - new_cmd->src_addr_tag = host_dram_block_addr; - } - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - - // update the wptr only if the eth queue is full or for the last command - erisc_q_ptrs_epoch[active_core_epoch][0] = (erisc_q_ptrs_epoch[active_core_epoch][0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - if (last_send_epoch_cmd || is_non_mmio_cmd_q_full(erisc_q_ptrs_epoch[active_core_epoch][0], erisc_q_ptrs_epoch[active_core_epoch][4])) { - std::vector erisc_q_wptr = { erisc_q_ptrs_epoch[active_core_epoch][0] }; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - erisc_q_wrptr_updated[active_core_epoch] = true; - } else { - erisc_q_wrptr_updated[active_core_epoch] = false; - } -} - -/* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above - */ -void tt_SiliconDevice::rolled_write_to_non_mmio_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, uint32_t unroll_count) { - using data_word_t = uint32_t; - constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - - std::string write_tlb = "LARGE_WRITE_TLB"; - std::string read_tlb = "LARGE_READ_TLB"; - std::string empty_tlb = ""; - translate_to_noc_table_coords(core.chip, core.y, core.x); - - const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); - - - std::vector erisc_command; - std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); - - std::vector data_block = std::vector(size_in_bytes / DATA_WORD_SIZE); - - routing_cmd_t *new_cmd; - - flush_non_mmio = true; - uint32_t transfer_size = size_in_bytes * unroll_count; - uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; - - // - // MUTEX ACQUIRE (NON-MMIO) - // do not locate any ethernet core reads/writes before this acquire - // - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); - - if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); - } - - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - int& active_core_for_txn = non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - - uint32_t offset = 0; - - bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr.resize(1); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - - uint32_t unroll_offset = 0; - - while (offset < transfer_size) { - while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); - } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); - - log_assert(((address + offset) & 0x1F) == 0, "Base address + offset in incorrect range!"); - - uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - - uint32_t req_flags = eth_interface_params.cmd_data_block_dram | eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req; - timestamp = 0; - - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * host_address_params.eth_routing_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
- - memcpy(data_block.data(), mem_ptr, size_in_bytes); - uint32_t byte_increment = data_block.size() * DATA_WORD_SIZE; - uint32_t host_mem_offset = 0; - uint32_t i = 0; - for (i = 0; (i + unroll_offset) < unroll_count; i++) { - if ((host_mem_offset + byte_increment) > host_address_params.eth_routing_block_size) { - break; - } - data_block[0] = i + unroll_offset; - write_to_sysmem(data_block, host_dram_block_addr + host_mem_offset, host_dram_channel, mmio_capable_chip_logical); - host_mem_offset += byte_increment; - } - unroll_offset += i; - tt_driver_atomics::sfence(); - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); - new_cmd->data = host_mem_offset; - new_cmd->flags = req_flags; - new_cmd->src_addr_tag = host_dram_block_addr; - - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); - tt_driver_atomics::sfence(); - erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - std::vector erisc_q_wptr; - erisc_q_wptr.resize(1); - erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); - tt_driver_atomics::sfence(); - offset += host_mem_offset; - - // If there is more data to send and this command will make the q full, switch to next Q. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
- // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - - if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { - active_core_for_txn++; - uint32_t update_mask_for_chip = (remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1); - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn], eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); - erisc_q_rptr[0] = erisc_q_ptrs[4]; - } - } -} - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above @@ -3889,8 +2223,7 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock( - *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); @@ -4186,15 +2519,14 @@ std::unordered_map>>& tt_SiliconDevice:: void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. 
- struct PCIdevice* pci_device = get_pci_device(chip); + TTDevice *pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - TTDevice *dev = pci_device->hdev; const uint8_t* buffer_addr = static_cast(mem_ptr); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); while(size_in_bytes > 0) { auto [mapped_address, tlb_size] = set_dynamic_tlb_broadcast(pci_device, tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); - write_block(dev, mapped_address, transfer_size, buffer_addr, m_dma_buf_size); + pci_device->write_block(mapped_address, transfer_size, buffer_addr); size_in_bytes -= transfer_size; addr += transfer_size; @@ -4419,18 +2751,18 @@ int tt_SiliconDevice::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_ } void tt_SiliconDevice::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(mem_ptr, size, addr, channel, src_device_id); + write_buffer(mem_ptr, size, addr, channel, src_device_id); } void tt_SiliconDevice::write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { - write_dma_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); + write_buffer(vec.data(), vec.size() * sizeof(uint32_t), addr, channel, src_device_id); } void tt_SiliconDevice::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { - read_dma_buffer(mem_ptr, addr, channel, size, src_device_id); + read_buffer(mem_ptr, addr, channel, size, src_device_id); } void tt_SiliconDevice::read_from_sysmem(std::vector &vec, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { size_buffer_to_capacity(vec, size); - 
read_dma_buffer(vec.data(), addr, channel, size, src_device_id); + read_buffer(vec.data(), addr, channel, size, src_device_id); } void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { @@ -4450,7 +2782,7 @@ void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordere cores_synced.insert(core); } else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + log_info(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); } } } @@ -4462,7 +2794,7 @@ void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordere void tt_SiliconDevice::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->id)); + const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->device_id)); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } @@ -4544,7 +2876,7 @@ void tt_SiliconDevice::dram_membar(const chip_id_t chip, const std::string& fall } } -void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); if(target_is_mmio_capable) { if (fallback_tlb 
== "REG_TLB") { @@ -4552,74 +2884,29 @@ void tt_SiliconDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cx } else { write_device_memory(mem_ptr, size, core, addr, fallback_tlb); } - } - else if (!send_epoch_cmd) { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - // as long as epoch commands are sent single-threaded, no need to acquire mutex - log_assert(!(size % 4), "Epoch commands must be 4 byte aligned!"); - write_to_non_mmio_device_send_epoch_cmd((uint32_t*)mem_ptr, size, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); } } - -void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - // Overloaded device writer that accepts a vector - write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - - -void tt_SiliconDevice::write_epoch_cmd_to_device(const uint32_t *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { - write_device_memory(mem_ptr, size_in_bytes, core, addr, fallback_tlb); - } else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - write_to_non_mmio_device_send_epoch_cmd(mem_ptr, 
size_in_bytes, core, addr, last_send_epoch_cmd, ordered_with_prev_remote_write); - } -} - -void tt_SiliconDevice::write_epoch_cmd_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { +void tt_SiliconDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { // Overloaded device writer that accepts a vector - write_epoch_cmd_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_SiliconDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - log_assert(!(size_in_bytes % 4), "{} only supports 4-byte aligned data", __FUNCTION__); - bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); - - if (target_is_mmio_capable) { - for (int i=0; i 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); - rolled_write_to_non_mmio_device(mem_ptr, size_in_bytes, core, addr, unroll_count); - } -} - -void tt_SiliconDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - rolled_write_to_device(vec.data(), vec.size() * sizeof(uint32_t), unroll_count, core, addr, fallback_tlb); + write_to_device(vec.data(), vec.size() * sizeof(uint32_t), core, addr, fallback_tlb); } void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(core.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const 
scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - read_regs(dev, mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); + pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); if(aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. @@ -4629,11 +2916,10 @@ void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - struct PCIdevice* pci_device = get_pci_device(core.chip); - TTDevice *dev = pci_device->hdev; + TTDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device -> id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); LOG1 (" dynamic tlb_index: %d\n", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); @@ -4643,7 +2929,7 @@ void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pa // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } - write_regs(dev, mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); + pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } void tt_SiliconDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const 
std::string& fallback_tlb) { @@ -4694,7 +2980,7 @@ void tt_SiliconDevice::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair & int tt_SiliconDevice::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); + TTDevice *pci_device = get_pci_device(mmio_capable_chip_logical); return remote_arc_msg(chip, get_power_state_arc_msg(pci_device, device_state), true, 0, 0, 1, NULL, NULL); } @@ -4718,7 +3004,7 @@ void tt_SiliconDevice::enable_remote_ethernet_queue(const chip_id_t& chip, int t void tt_SiliconDevice::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { if(arch_name == tt::ARCH::GRAYSKULL) { for (auto &device_it : m_pci_device_map) { - broadcast_pcie_tensix_risc_reset(device_it.second, soft_resets); + broadcast_pcie_tensix_risc_reset(device_it.second.get(), soft_resets); } } else { @@ -4792,14 +3078,14 @@ void tt_SiliconDevice::deassert_resets_and_set_power_state() { if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second->hdev->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); + arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); } if(ndesc != nullptr) { for(const chip_id_t& chip : target_devices_in_cluster) { if(!ndesc -> is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - struct PCIdevice* pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->hdev->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + auto pci_device = 
get_pci_device(mmio_capable_chip_logical); + remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); } } enable_ethernet_queue(30); @@ -4905,22 +3191,8 @@ std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, s return host_channel_size.at(device_id).at(channel); } -std::uint32_t tt_SiliconDevice::get_pcie_speed(std::uint32_t device_id) { - int link_width = 0; - int link_speed = 0; - if (ndesc->is_chip_mmio_capable(device_id)) { - PCIdevice *pci_device = get_pci_device(device_id); - link_width = get_link_width(pci_device->hdev); - link_speed = get_link_speed(pci_device->hdev); - log_debug(LogSiliconDriver, "Device {} PCIe link width: x{}, speed: {} Gb/s", device_id, link_width, link_speed); - } else { - log_debug(LogSiliconDriver, "Device {} is NOT a PCIe device, width: x{}, speed: {} Gb/s", device_id, link_width, link_speed); - } - return (link_width * link_speed); -} - std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { - return get_numa_node(get_pci_device(device_id)->hdev); + return get_pci_device(device_id)->numa_node; } std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device() const { diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp index 1649bf70..9f275668 100644 --- a/device/tt_silicon_driver_common.hpp +++ b/device/tt_silicon_driver_common.hpp @@ -9,19 +9,6 @@ #include #include - -typedef struct { - uint32_t chip_addr; - uint32_t host_phys_addr; - uint32_t completion_flag_phys_addr; - uint32_t size_bytes : 28; - uint32_t write : 1; - uint32_t pcie_msi_on_done : 1; - uint32_t pcie_write_on_done : 1; - uint32_t trigger : 1; - uint32_t repeat; -} arc_pcie_ctrl_dma_request_t; // 5 * 4 = 20B - enum class TensixSoftResetOptions: std::uint32_t { NONE = 0, BRISC = ((std::uint32_t) 1 << 11), diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 
4320b3ef..60958372 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -199,98 +199,20 @@ int tt_SocDescriptor::get_num_dram_channels() const { return num_channels; } -std::vector tt_SocDescriptor::get_dram_chan_map() { - std::vector chan_map; - for (unsigned int i = 0; i < dram_cores.size(); i++) { - chan_map.push_back(i); - } - return chan_map; -}; - bool tt_SocDescriptor::is_worker_core(const tt_xy_pair &core) const { return ( routing_x_to_worker_x.find(core.x) != routing_x_to_worker_x.end() && routing_y_to_worker_y.find(core.y) != routing_y_to_worker_y.end()); } -tt_xy_pair tt_SocDescriptor::get_worker_core(const tt_xy_pair &core) const { - tt_xy_pair worker_xy = { - static_cast(routing_x_to_worker_x.at(core.x)), static_cast(routing_y_to_worker_y.at(core.y))}; - return worker_xy; -} - -tt_xy_pair tt_SocDescriptor::get_routing_core(const tt_xy_pair& core) const { - tt_xy_pair routing_xy = { - static_cast(worker_log_to_routing_x.at(core.x)), static_cast(worker_log_to_routing_y.at(core.y))}; - return routing_xy; -} - tt_xy_pair tt_SocDescriptor::get_core_for_dram_channel(int dram_chan, int subchannel) const { return this->dram_cores.at(dram_chan).at(subchannel); }; -tt_xy_pair tt_SocDescriptor::get_pcie_core(int pcie_id) const { - return this->pcie_cores.at(pcie_id); -}; - bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end(); } -bool tt_SocDescriptor::is_dram_core(const tt_xy_pair &core) const { - static std::unordered_set cores = {}; - if (cores.empty()) { - for (const std::vector &dram_chan : this->dram_cores) { - for (const tt_xy_pair &subchannel : dram_chan) { - cores.insert(subchannel); - } - } - } - return cores.find(core) != cores.end(); -} - -int tt_SocDescriptor::get_channel_of_ethernet_core(const tt_xy_pair &core) const { - return this->ethernet_core_channel_map.at(core); -} - -int tt_SocDescriptor::get_num_dram_subchans() 
const { - int num_chan = 0; - for (const std::vector &core : this->dram_cores) { - num_chan += core.size(); - } - return num_chan; -} - -int tt_SocDescriptor::get_num_dram_blocks_per_channel() const { - int num_blocks = 0; - if (arch == tt::ARCH::GRAYSKULL) { - num_blocks = 1; - } else if (arch == tt::ARCH::WORMHOLE) { - num_blocks = 2; - } else if (arch == tt::ARCH::WORMHOLE_B0) { - num_blocks = 2; - } else if (arch == tt::ARCH::BLACKHOLE) { - num_blocks = 2; - } - return num_blocks; -} - -// Note: same as t_SiliconDevice::get_pcie_base_addr_from_device -uint64_t tt_SocDescriptor::get_noc2host_offset(uint16_t host_channel) const { - - const std::uint64_t PEER_REGION_SIZE = (1024 * 1024 * 1024); - - if (arch == tt::ARCH::GRAYSKULL) { - return (host_channel * PEER_REGION_SIZE); - }else if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) { - return (host_channel * PEER_REGION_SIZE) + 0x800000000; - } else if (arch == tt::ARCH::BLACKHOLE) { - return (host_channel * PEER_REGION_SIZE) + (1ULL << 60); - } else { - throw std::runtime_error("Unsupported architecture"); - } -} - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::JAWBRIDGE) { out << "jawbridge"; diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 2be98749..87ea1799 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -23,8 +23,6 @@ namespace YAML { class Node; } -static constexpr std::size_t DEFAULT_DRAM_SIZE_PER_CORE = 8 * 1024 * 1024; - std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); static inline std::string get_arch_str(const tt::ARCH arch_name){ @@ -132,18 +130,9 @@ class tt_SocDescriptor { uint64_t dram_bank_size; int get_num_dram_channels() const; - std::vector get_dram_chan_map(); bool is_worker_core(const tt_xy_pair &core) const; - tt_xy_pair get_worker_core(const tt_xy_pair& core) const; - tt_xy_pair get_routing_core(const tt_xy_pair& core) const; tt_xy_pair 
get_core_for_dram_channel(int dram_chan, int subchannel) const; - tt_xy_pair get_pcie_core(int pcie_id = 0) const; - bool is_dram_core(const tt_xy_pair& core) const; bool is_ethernet_core(const tt_xy_pair& core) const; - int get_channel_of_ethernet_core(const tt_xy_pair &core) const; - int get_num_dram_subchans() const; - int get_num_dram_blocks_per_channel() const; - uint64_t get_noc2host_offset(uint16_t host_channel) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; diff --git a/device/tt_umd.hpp b/device/tt_umd.hpp new file mode 100644 index 00000000..0fc95860 --- /dev/null +++ b/device/tt_umd.hpp @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" +#include "tt_silicon_driver_common.hpp" +#include "device/tt_cluster_descriptor_types.h" +#include "device/tlb.h" +#include "device/tt_io.hpp" + +#include "pci_device.hpp" + +void write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); +void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); +void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); +void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); +void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); +void write_to_sysmem(std::vector& vec, uint64_t addr, uint16_t channel, chip_id_t src_device_id); +void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); +void read_from_sysmem(std::vector &vec, uint64_t 
addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + +void wait_for_non_mmio_flush(); +void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); +void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); +void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + +void assert_risc_reset(); +void assert_risc_reset_at_core(tt_cxy_pair core); +void deassert_risc_reset(); +void deassert_risc_reset_at_core(tt_cxy_pair core); + +std::map get_clocks(); +std::set get_target_remote_device_ids(); +std::uint32_t get_num_host_channels(std::uint32_t device_id); +std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); +void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel); // should prob be a get? +std::uint64_t get_pcie_base_addr_from_device(); +std::unordered_map get_harvesting_masks_for_soc_descriptors(); +std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); +std::vector detect_available_device_ids(); // should move all calls of this completely into umd + +// Fast-dispatch workaround :( +std::function get_fast_pcie_static_tlb_write_callable(int device_id); +tt::Writer get_static_tlb_writer(tt_cxy_pair target); + diff --git a/device/tt_versim_device.cpp b/device/tt_versim_device.cpp deleted file mode 100644 index e7ac7506..00000000 --- a/device/tt_versim_device.cpp +++ /dev/null @@ -1,323 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - - -#include "tt_device.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include -#include -#include -#include - -#include "yaml-cpp/yaml.h" - -// TODO: Remove dependency on command_assembler + soc -#include "command_assembler/soc.h" -#include "device/tt_cluster_descriptor.h" -namespace CA = CommandAssembler; - - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; - } - soc.SetNodeProperties(node.noc_coord, node); - } -} - -//////// -// Device Versim -//////// - -#include "device.h" -#include "sim_interactive.h" -#include - -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } -} - -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - 
-tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); -} - -void tt_VersimDevice::close_device() { - stop(); -} - -void tt_VersimDevice::start( - std::vector plusargs, - std::vector dump_cores, - bool no_checkers, - bool /*init_device*/, - bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; - - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } - - std::vector vcd_cores; - - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. mainly bypasses arch_configs etc from llir. 
We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End - - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); - - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); - - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); - - p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - - std::cout << "Versim Device: Done start " << std::endl; -} - -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} - -// bool tt_VersimDevice::run() { -// std::cout << "Versim Device: Run " << std::endl; - -// // Run Versim main_loop -// versim::startup_versim_main_loop(*versim); - -// return true; -// } - -void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); -} - -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) { - // This function deasserts reset on the full 
versim device (don't need core level granularity for versim) - deassert_risc_reset(); -} - -void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); - - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); - - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); -} - -void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); -} - -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - CommandAssembler::memory CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); -} - -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, 
(uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); -} - -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} -void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips -} - -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this -} - -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - - CommandAssembler::xy_pair CA_target(core.x, core.y); - - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); 
- vec = result; -} - -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - - CommandAssembler::xy_pair CA_target(core.x, core.y); - - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); -} - -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; -} - -std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; -} - -std::set tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; -} - - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { - bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; - } - } - } - return false; -} - -int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } -int tt_VersimDevice::detect_number_of_chips() { return 1; } - -bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } -bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -// Meant to breakout running functions for simulator -bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << 
std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); - } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; -} - -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} - -void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; -} - -void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { - dram_address_params = dram_address_params_; -} - -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); -} - -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now -} - -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { - // Host buffers not allocated for Versim Devices - return 0; -} - -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { - // Host buffers not allocated for Versim Devices - return 0; -} \ No newline at end of file diff --git a/device/tt_versim_stub.cpp b/device/tt_versim_stub.cpp deleted file mode 100644 index 27c69f80..00000000 --- a/device/tt_versim_stub.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - -#include "tt_device.h" - -#include -#include -#include -#include - -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); -} - -tt_VersimDevice::~tt_VersimDevice () {} - -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); - return soc_descriptor_per_chip; -} - -int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } -int tt_VersimDevice::detect_number_of_chips() { return 0; } - -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} -void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, 
const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} -void tt_VersimDevice::wait_for_non_mmio_flush() {} - -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} - -void tt_VersimDevice::start( - std::vector plusargs, - std::vector dump_cores, - bool no_checkers, - bool /*init_device*/, - bool /*skip_driver_allocs*/ -) {} - -void tt_VersimDevice::deassert_risc_reset() {} -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::assert_risc_reset() {} -void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} - -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; -// void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} - -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} - -bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; -} - -bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } -bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} - -bool tt_VersimDevice::stop() { return true; } - -void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} -void tt_VersimDevice::set_device_dram_address_params(const 
tt_device_dram_address_params& dram_address_params_) {} - -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} - -std::map tt_VersimDevice::get_clocks() {return std::map();} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} - diff --git a/device/wormhole/impl_device.hpp b/device/wormhole/impl_device.hpp deleted file mode 100644 index 227cac48..00000000 --- a/device/wormhole/impl_device.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "device/tt_silicon_driver_common.hpp" - -// See src/t6ifc/t6py/packages/tenstorrent/data/wormhole/pci/tlb.yaml -// local_offset: [ 0, 15, 0, "36-bit address prefix, prepended to the 20 LSBs of issued address to form a 56-bit NOC address. The 1MB TLB #n corresponds to the 1MB MMIO range starting at (0x0 + N*0x100000)."] -// x_end : [ 0, 21, 16, "" ] -// y_end : [ 0, 27, 22, "" ] -// x_start : [ 0, 33, 28, "" ] -// y_start : [ 0, 39, 34, "" ] -// noc_sel: [ 0, 40, 40, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 41, 41, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 43, 42, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 44, 44, "linked"] - -// local_offset: [ 0, 14, 0, "35-bit address prefix, prepended to the 21 LSBs of issued address to form a 56-bit NOC address. 
The 2MB TLB #n corresponds to the 2MB MMIO range starting at (0x9C00000 + N*0x200000)."] -// x_end : [ 0, 20, 15, "" ] -// y_end : [ 0, 26, 21, "" ] -// x_start : [ 0, 32, 27, "" ] -// y_start : [ 0, 38, 33, "" ] -// noc_sel: [ 0, 39, 39, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 40, 40, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 42, 41, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 43, 43, "linked"] - -// local_offset: [ 0, 11, 0, "32-bit address prefix, prepended to the 24 LSBs of issued address to form a 56-bit NOC address. The 16MB TLB #n corresponds to the 16MB MMIO range starting at (0xB000000 + N*0x1000000)."] -// x_end : [ 0, 17, 12, "" ] -// y_end : [ 0, 23, 18, "" ] -// x_start : [ 0, 29, 24, "" ] -// y_start : [ 0, 35, 30, "" ] -// noc_sel: [ 0, 36, 36, "NOC select (1 = NOC1, 0 = NOC0)"] -// mcast: [ 0, 37, 37, "1 = multicast, 0 = unicast"] -// ordering: [ 0, 39, 38, "ordering mode (01 = strict (full AXI ordering), 00 = relaxed (no RAW hazard), 10 = posted (may have RAW hazard)"] -// linked: [ 0, 40, 40, "linked"] - -const auto TLB_1M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 16, - .y_end = 22, - .x_start = 28, - .y_start = 34, - .noc_sel = 40, - .mcast = 41, - .ordering = 42, - .linked = 44, - .static_vc = 45, - .static_vc_end = 46 -}; - -const auto TLB_2M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 15, - .y_end = 21, - .x_start = 27, - .y_start = 33, - .noc_sel = 39, - .mcast = 40, - .ordering = 41, - .linked = 43, - .static_vc = 44, - .static_vc_end = 45 -}; - -const auto TLB_16M_OFFSET = TLB_OFFSETS { - .local_offset = 0, - .x_end = 12, - .y_end = 18, - .x_start = 24, - .y_start = 30, - .noc_sel = 36, - .mcast = 37, - .ordering = 38, - .linked = 40, - .static_vc = 41, - .static_vc_end = 42 -}; diff --git a/device/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp similarity index 98% rename from 
device/wormhole_implementation.cpp rename to device/wormhole/wormhole_implementation.cpp index 9295e2de..96722311 100644 --- a/device/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "device/wormhole_implementation.h" +#include "wormhole_implementation.h" namespace tt::umd { diff --git a/device/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h similarity index 100% rename from device/wormhole_implementation.h rename to device/wormhole/wormhole_implementation.h diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index d6c938aa..23816841 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -12,7 +12,7 @@ #include #include -#include "device/blackhole_implementation.h" +#include "device/blackhole/blackhole_implementation.h" #include "device/tt_cluster_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp deleted file mode 100644 index e54fa8f0..00000000 --- a/tests/emulation/test_emulation_device.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "gtest/gtest.h" -#include "device/tt_soc_descriptor.h" -#include "device/tt_device.h" -#include "device/tt_emulation_device.h" - -TEST(EmulationDeviceGS, BasicEmuTest) { - tt_emulation_device device = tt_emulation_device("../../tests/soc_descs/grayskull_10x12.yaml"); - tt_device_params default_params; - - std::size_t phys_x = 1; - std::size_t phys_y = 1; - tt_xy_pair core = tt_xy_pair(phys_x, phys_y); - - uint32_t size = 16; - uint64_t l1_addr = 0x1000; - std::vector wdata(size); - std::vector rdata(size); - - try { - device.start_device(default_params); - - for (auto &byte : wdata) { - byte = rand(); - } - device.write_to_device(wdata, tt_cxy_pair(0, core), l1_addr, "l1"); - 
device.read_from_device(rdata, tt_cxy_pair(0, core), l1_addr, size, "l1"); - ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - - device.deassert_risc_reset(); - device.write_to_device(wdata, tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - device.assert_risc_reset(); - device.write_to_device(wdata, tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - - - } catch (const std::exception &e) { - std::cout << "Error: " << e.what() << std::endl; - } - device.close_device(); -} diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index f6bd28e8..ecf99862 100644 --- a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -76,13 +76,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .rolled_write = 0.2, .read = 0.4, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 0.40, .read = 0.4}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 
0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 30000), 
//WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 30000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .rolled_write = 0, .read = 0.1, .epoch_cmd_write = 0.8}, + transfer_type_weights_t{.write = 0.1, .read = 0.1}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index d8324f13..d890d8a9 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "tt_device.h" #include "device/tt_soc_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" diff --git a/tests/test_utils/stimulus_generators.hpp 
b/tests/test_utils/stimulus_generators.hpp index 094f06cb..6d35afb8 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -36,7 +36,7 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; -enum RemoteTransferType : uint8_t { WRITE = 0, ROLLED_WRITE, READ, EPOCH_CMD_WRITE }; +enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < typename SAMPLE_T, @@ -102,14 +102,6 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; -struct rolled_write_transfer_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - int unroll_count; - std::string tlb_to_use; - // (payload, 2, destination, address, tlb_to_use); -}; struct read_transfer_sample_t { destination_t destination; address_t address; @@ -117,17 +109,8 @@ struct read_transfer_sample_t { std::string tlb_to_use; // (payload.data(), destination, address, size, tlb_to_use); }; -struct write_epoch_cmd_sample_t { - destination_t destination; - address_t address; - transfer_size_t size_in_bytes; - std::string tlb_to_use; - bool last_epoch_command; - bool ordered_with_prev_remote_write; - // (payload.data(), size, destination, address, tlb_to_use, last_epoch_command, ordered_with_prev_remote_write); -}; -using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = std::tuple>; template < template @@ -267,25 +250,6 @@ template < template class WRITE_SIZE_DISTR_T, - template - class WRITE_EPOCH_CMD_DEST_DISTR_T, - template - class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template - class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, - - template - class ROLLED_WRITE_DEST_DISTR_T, - template - class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template - class ROLLED_WRITE_SIZE_DISTR_T, - 
template - class ROLLED_WRITE_UNROLL_DISTR_T, - template class READ_DEST_DISTR_T, template @@ -299,8 +263,6 @@ class TestGenerator { using transfer_type_generator_t = DefaultTransferTypeGenerator; // ConstrainedTemplateTemplateGenerator; using write_command_generator_t = WriteCommandGenerator; - using write_epoch_cmd_command_generator_t = WriteEpochCmdCommandGenerator; - using rolled_write_command_generator_t = RolledWriteCommandGenerator; using read_command_generator_t = ReadCommandGenerator; public: @@ -308,14 +270,10 @@ class TestGenerator { int seed, transfer_type_generator_t const& transfer_type_distribution, write_command_generator_t const& write_command_generator, - rolled_write_command_generator_t const& rolled_write_command_generator, - write_epoch_cmd_command_generator_t const& write_epoch_cmd_command_generator, read_command_generator_t const& read_command_generator) : generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - rolled_write_command_generator(rolled_write_command_generator), - write_epoch_cmd_command_generator(write_epoch_cmd_command_generator), read_command_generator(read_command_generator) { } @@ -338,34 +296,6 @@ class TestGenerator { .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; - case RemoteTransferType::ROLLED_WRITE: { - destination_t const& destination = rolled_write_command_generator.destination_generator.generate(); - address_t const& address = rolled_write_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = rolled_write_command_generator.size_generator.generate(); - int unroll_count = rolled_write_command_generator.unroll_generator.generate(); - return {transfer_type, rolled_write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .unroll_count = unroll_count, - .tlb_to_use = "LARGE_WRITE_TLB"}}; - } break; - - case RemoteTransferType::EPOCH_CMD_WRITE: { - destination_t const& 
destination = write_epoch_cmd_command_generator.destination_generator.generate(); - address_t const& address = write_epoch_cmd_command_generator.address_generator.generate(); - transfer_size_t const& size_in_bytes = write_epoch_cmd_command_generator.size_generator.generate(); - bool last_epoch_cmd = write_epoch_cmd_command_generator.last_cmd_generator.generate(); - bool ordered_with_prev_remote_write = write_epoch_cmd_command_generator.ordered_generator.generate(); - return {transfer_type, write_epoch_cmd_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_WRITE_TLB", - .last_epoch_command = last_epoch_cmd, - .ordered_with_prev_remote_write = ordered_with_prev_remote_write}}; - } break; - case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); @@ -388,22 +318,17 @@ class TestGenerator { transfer_type_generator_t transfer_type_distribution; write_command_generator_t write_command_generator; - rolled_write_command_generator_t rolled_write_command_generator; - write_epoch_cmd_command_generator_t write_epoch_cmd_command_generator; read_command_generator_t read_command_generator; }; struct transfer_type_weights_t { double write; - double rolled_write; double read; - double epoch_cmd_write; }; static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto rolled_write_transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 
1) / 32) + 1) * 32; assert(size > 0); return size;}; static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; template @@ -433,28 +358,12 @@ static void print_command(remote_transfer_sample_t const& command) { << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = - std::get(std::get<1>(command)); - std::cout << "Transfer type: ROLLED_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", unroll_count: " << command_args.unroll_count << std::endl; - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "Transfer type: EPOCH_CMD_WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes - << ", last_cmd: " << (command_args.last_epoch_command ? " True" : "False") - << ", ordered_w_prev_remote_write: " << (command_args.ordered_with_prev_remote_write ? 
" True" : "False") << std::endl; - } break; default: throw std::runtime_error("Invalid transfer type"); }; } @@ -479,14 +388,7 @@ static inline void dispatch_remote_transfer_command( write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args. - tlb_to_use, false, false); - } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); + driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); @@ -494,12 +396,6 @@ static inline void dispatch_remote_transfer_command( resize_payload(payload,command_args.size_in_bytes); driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_epoch_cmd_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; 
default: throw std::runtime_error("Invalid transfer type"); }; @@ -524,16 +420,9 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", false, false);" << std::endl; + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); } break; - case RemoteTransferType::ROLLED_WRITE: { - rolled_write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->rolled_write_to_device(payload, " << command_args.unroll_count << ", destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.rolled_write_to_device(payload, command_args.unroll_count, command_args.destination, command_args.address, command_args.tlb_to_use); - } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; @@ -541,15 +430,6 @@ static void 
print_command_executable_code(remote_transfer_sample_t const& comman std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); } break; - case RemoteTransferType::EPOCH_CMD_WRITE: { - write_epoch_cmd_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; - emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_epoch_cmd_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\", " << (command_args.last_epoch_command ? "true":"false") - << "\", " << (command_args.ordered_with_prev_remote_write ? 
"true":"false") << ");" << std::endl; - // driver.write_epoch_cmd_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, command_args.last_epoch_command, command_args.ordered_with_prev_remote_write); - } break; default: throw std::runtime_error("Invalid transfer type"); }; @@ -572,18 +452,6 @@ template< template class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, template class WRITE_SIZE_DISTR_T, - - template class ROLLED_WRITE_DEST_DISTR_T, - template class ROLLED_WRITE_ADDR_DISTR_T, - class ROLLED_WRITE_SIZE_DISTR_OUT_T, - template class ROLLED_WRITE_SIZE_DISTR_T, - template class ROLLED_WRITE_UNROLL_COUNT_DISTR_T, - - template class WRITE_EPOCH_CMD_DEST_DISTR_T, - template class WRITE_EPOCH_CMD_ADDR_DISTR_T, - template class WRITE_EPOCH_CMD_SIZE_DISTR_T, - class WRITE_EPOCH_CMD_LAST_CMD_DISTR_T, - class WRITE_EPOCH_CMD_ORDERED_DISTR_T, template class READ_DEST_DISTR_T, template class READ_ADDR_DISTR_T, @@ -598,8 +466,6 @@ void RunMixedTransfers( transfer_type_weights_t const& transfer_type_weights, WriteCommandGenerator const& write_command_generator, - RolledWriteCommandGenerator const& rolled_write_command_generator, - WriteEpochCmdCommandGenerator const& write_epoch_cmd_command_generator, ReadCommandGenerator const& read_command_generator, bool record_command_history = false, @@ -609,14 +475,12 @@ void RunMixedTransfers( auto test_generator = TestGenerator( seed, {seed, - {transfer_type_weights.write, transfer_type_weights.rolled_write, transfer_type_weights.read, transfer_type_weights.epoch_cmd_write}, + {transfer_type_weights.write, transfer_type_weights.read}, [](int transfer_type) -> RemoteTransferType { assert(transfer_type < 4); return static_cast(transfer_type); }}, write_command_generator, - rolled_write_command_generator, - write_epoch_cmd_command_generator, read_command_generator); if (record_command_history) { @@ -663,58 +527,6 @@ static ConstrainedTemplateTemplateGenerator 
destination_t { return core_index_to_location.at(dest); }); } - -static RolledWriteCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - transfer_size_t, - std::uniform_int_distribution, - std::uniform_int_distribution -> - build_dummy_rolled_write_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), rolled_write_transfer_size_aligner); - auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), [](int unroll_count) -> int { return unroll_count; }); - - return RolledWriteCommandGenerator( - dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator); -} - -static WriteEpochCmdCommandGenerator < - std::uniform_int_distribution, - std::uniform_int_distribution, - std::uniform_int_distribution, - std::bernoulli_distribution, - std::bernoulli_distribution -> build_dummy_write_epoch_cmd_command_generator(tt_SiliconDevice &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); - tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); - std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); - auto dest_generator = 
ConstrainedTemplateTemplateGenerator( - 0, - std::uniform_int_distribution(0, core_index_to_location.size() - 1), - [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); - auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); - auto ordered_generator = ConstrainedTemplateGenerator( - 0, std::bernoulli_distribution(1), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); - - return WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator); -} - static WriteCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, @@ -764,10 +576,6 @@ template< template class WRITE_SIZE_GENERATOR_T, template - class ROLLED_WRITE_SIZE_GENERATOR_T, - template - class WRITE_EPOCH_CMD_SIZE_GENERATOR_T, - template class READ_SIZE_GENERATOR_T, template class UNROLL_COUNT_GENERATOR_T @@ -780,9 +588,7 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights_t const& transfer_type_weights, ADDR_GENERATOR_T const& address_distribution, WRITE_SIZE_GENERATOR_T const& write_size_distribution, - ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution, - WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -802,12 +608,8 @@ void RunMixedTransfersUniformDistributions( auto addr_generator_32B_aligned = 
ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); auto write_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, write_size_distribution, transfer_size_aligner); - auto rolled_write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, rolled_write_size_distribution, rolled_write_transfer_size_aligner); auto read_size_generator = ConstrainedTemplateTemplateGenerator( seed + 2, read_size_distribution, transfer_size_aligner); - auto write_epoch_cmd_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_epoch_cmd_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); auto ordered_generator = ConstrainedTemplateGenerator( @@ -823,9 +625,6 @@ void RunMixedTransfersUniformDistributions( transfer_type_weights, WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), - RolledWriteCommandGenerator(dest_generator, addr_generator_32B_aligned, rolled_write_size_generator, unroll_count_generator), - WriteEpochCmdCommandGenerator( - dest_generator, addr_generator_32B_aligned, write_epoch_cmd_generator, last_epoch_cmd_generator, ordered_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), record_command_history, diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index df686dfa..6551b3cc 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -13,7 +13,7 @@ #include "host_mem_address_map.h" #include "device/tt_cluster_descriptor.h" -#include "device/wormhole_implementation.h" +#include "device/wormhole/wormhole_implementation.h" #include "tests/test_utils/generate_cluster_desc.hpp" void set_params_for_remote_txn(tt_SiliconDevice& device) { diff --git 
a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index 36c02914..96fef09a 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -73,13 +73,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.25}, + transfer_type_weights_t{.write = 0.25, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -108,13 +106,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, 
std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -129,13 +125,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -150,13 +144,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -171,13 +163,11 @@ TEST_F(WormholeNebulaX2TestFixture, 
MultithreadedMixedRemoteTransfersMediumSmall 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 1.0, .read = 0.0}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -206,13 +196,11 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) { 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.15, .rolled_write = 0, .read = 0.15, .epoch_cmd_write = 0.7}, + transfer_type_weights_t{.write = 0.15, .read = 0.15}, std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 300000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -247,11 +235,9 @@ TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinS 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + 
transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -279,13 +265,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .rolled_write = 0., .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.50, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -300,13 +284,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .rolled_write = 0.25, .read = 0.50, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.25, .read = 0.50}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - 
std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -321,13 +303,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .rolled_write = 0.25, .read = 0.25, .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0.5, .read = 0.25}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -342,13 +322,11 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .rolled_write = 0, .read = 0.0, .epoch_cmd_write = 0.0}, + transfer_type_weights_t{.write = 1.0, .read = 0.0}, std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(0x4, 3000), //ROLLED_WRITE_SIZE_GENERATOR_T const& rolled_write_size_distribution, std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution - std::uniform_int_distribution(0x4, 0x12), //WRITE_EPOCH_CMD_SIZE_GENERATOR_T const& write_epoch_cmd_size_distribution, 0.75, 0.75, std::uniform_int_distribution(0x4, 3000), 
//READ_SIZE_GENERATOR_T const& read_size_distribution, @@ -387,11 +365,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -404,11 +380,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .rolled_write = 0., .read = 0., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 1., .read = 0.}, WriteCommandGenerator(dest_generator, address_generator, write_size_generator), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), build_dummy_read_command_generator(*device), false, // Set to true if you want to emit the command history code to command line @@ -421,11 +395,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0, .read = 1.}, build_dummy_write_command_generator(*device), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), false, // Set to true if you want to emit the command history code to command line @@ -438,11 +410,9 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWrites 10000 * scale_number_of_tests, 0, 
- transfer_type_weights_t{.write = 0, .rolled_write = 0., .read = 1., .epoch_cmd_write = 0.}, + transfer_type_weights_t{.write = 0, .read = 1.}, build_dummy_write_command_generator(*device), - build_dummy_rolled_write_command_generator(*device), - build_dummy_write_epoch_cmd_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), false, // Set to true if you want to emit the command history code to command line