diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index cb10f8b3a33..e19369c0d6f 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -264,6 +264,7 @@ namespace tt::tt_metal{
         static vector vflags( Device::detect_num_available_devices() );
         chip_id_t id = device->id();
         TT_FATAL(id < command_queues.size(), "Invalid device {} detected", id);
+        TT_FATAL(device->is_initialized(), "Cannot access command queue for closed device {}", id);
         std::call_once(vflags[id], [&device](){
             command_queues[device->id()] = std::make_unique(device);
         });
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 2c1bf6fd420..4a2312ece57 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -279,8 +279,7 @@ bool Device::initialize(const std::vector& l1_bank_remap) {
     );
 
     // Create system memory writer for this device to have an associated interface to hardware command queue (i.e. hugepage)
-    const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
-    if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
         this->sysmem_writer = std::make_unique(
             this->id_,
             [&, this]() -> const std::set& { return this->dispatch_cores(); },
@@ -305,6 +304,9 @@ bool Device::close() {
     this->clear_l1_state();
     tt::Cluster::instance().l1_barrier(id_);
     allocator::clear(*this->allocator_);
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        this->sysmem_writer.reset(nullptr);
+    }
 
     this->active_devices_.deactivate_device(this->id_);
 
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 26e632594d1..818e31bdf11 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -658,7 +658,9 @@ CommandQueue::CommandQueue(Device* device) {
     vector pointers(CQ_START / sizeof(uint32_t), 0);
     pointers[0] = CQ_START >> 4;
 
-    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, 0);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
+    tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);
 
     this->device = device;
 }
@@ -699,15 +701,18 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
     uint32_t padded_page_size = align(buffer.page_size(), 32);
     uint32_t data_size_in_bytes = padded_page_size * num_pages;
 
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     if ((buffer.page_size() % 32) != 0) {
         // If page size is not 32B-aligned, we cannot do a contiguous copy
         uint32_t dst_address_offset = 0;
         for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_in_bytes; sysmem_address_offset += padded_page_size) {
-            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, 0);
+            tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, mmio_device_id, channel);
             dst_address_offset += buffer.page_size();
         }
     } else {
-        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, 0);
+        tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, mmio_device_id, channel);
     }
 }
 
@@ -801,15 +806,18 @@ void CommandQueue::finish() {
     FinishCommand command(this->device, *this->device->sysmem_writer);
     this->enqueue_command(command, false);
 
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
+
     // We then poll to check that we're done.
     uint32_t finish;
     do {
-        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+        tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
     } while (finish != 1);
 
     // Reset this value to 0 before moving on
     finish = 0;
-    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
+    tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
 }
 
 void CommandQueue::wrap() {
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp
index f1457430686..f3c6281077c 100644
--- a/tt_metal/impl/dispatch/command_queue_interface.hpp
+++ b/tt_metal/impl/dispatch/command_queue_interface.hpp
@@ -10,7 +10,9 @@ using namespace tt::tt_metal;
 
 inline uint32_t get_cq_rd_ptr(chip_id_t chip_id) {
     uint32_t recv;
-    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, chip_id);
+    chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id);
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id);
+    tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, mmio_device_id, channel);
     return recv;
 }
 
@@ -49,7 +51,8 @@ class SystemMemoryWriter {
     SystemMemoryWriter(chip_id_t device_id, const std::function &()>& dispatch_cores, const std::function& worker_from_logical) :
         device_id(device_id),
         m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
-        hugepage_start((char*) tt::Cluster::instance().host_dma_address(0, device_id, 0)), // TODO: Uplift this api to translate device_id to nearest mmio device id
+        hugepage_start(
+            (char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
         fast_write_callable(
             tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)),
         dispatch_cores_callable(dispatch_cores),
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 734a096955a..4b67ea6b2e3 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -117,6 +117,8 @@ void Cluster::generate_cluster_descriptor() {
 
 void Cluster::initialize_device_drivers() {
     for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
+        this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);
+
         this->open_driver(mmio_device_id, controlled_devices);
 
         tt_device_params default_params;
@@ -128,6 +130,20 @@ void Cluster::initialize_device_drivers() {
     }
 }
 
+void Cluster::assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set &controlled_device_ids) {
+    // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per MMIO device
+    // Metal currently assigns 1 channel per device. See https://github.com/tenstorrent-metal/tt-metal/issues/4087
+    TT_ASSERT(controlled_device_ids.size() <= 4, "Unable to assign each device to its own host memory channel!");
+    uint16_t channel = 0;
+    this->device_to_host_mem_channel_[mmio_device_id] = channel++;
+    for (const chip_id_t &device_id : controlled_device_ids) {
+        if (device_id == mmio_device_id) {
+            continue;
+        }
+        this->device_to_host_mem_channel_[device_id] = channel++;
+    }
+}
+
 void Cluster::get_metal_desc_from_tt_desc(
     const std::unordered_map &input,
     const std::unordered_map &per_chip_id_harvesting_masks) {
@@ -499,14 +515,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
     this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
 }
 
-void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    constexpr uint16_t channel = 0;
+void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id);
 }
 
-void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
-    // TODO: Uplift
-    constexpr uint16_t channel = 0;
+void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
+    TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
     this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id);
 }
 
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index 8b0b8ecd627..ad1560fce5e 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -95,8 +95,8 @@ class Cluster {
     void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
     void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
 
-    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
-    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
+    void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;
+    void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;
 
     int get_device_aiclk(const chip_id_t &chip_id) const;
 
@@ -127,6 +127,16 @@ class Cluster {
     // Returns connected ethernet core on the other chip
     std::tuple get_connected_ethernet_core(std::tuple eth_core) const;
 
+    // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is MMIO device it is returned.
+    chip_id_t get_associated_mmio_device(chip_id_t device_id) const {
+        return this->device_to_mmio_device_.at(device_id);
+    }
+
+    // Returns the host memory channel assigned to `device_id`; channels are unique only among devices controlled by the same MMIO device.
+    uint16_t get_assigned_channel_for_device(chip_id_t device_id) const {
+        return this->device_to_host_mem_channel_.at(device_id);
+    }
+
     uint32_t get_tensix_soft_reset_addr() const;
 
    private:
@@ -136,6 +146,7 @@ class Cluster {
     void detect_arch_and_target();
     void generate_cluster_descriptor();
     void initialize_device_drivers();
+    void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set &controlled_device_ids);
     void open_driver(chip_id_t mmio_device_id, const std::set &controlled_device_ids, const bool &skip_driver_allocs = false);
     void start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const;
 
@@ -164,6 +175,17 @@ class Cluster {
     // Save mapping of device id to associated MMIO device id for fast lookup
     std::unordered_map device_to_mmio_device_;
 
+    // Currently, each device is mapped to its own channel in host memory to enable fast dispatch
+    // Channels are unique within a group of devices all controlled by a particular MMIO device
+    // For example:
+    //      Two N300 cards where MMIO device IDs are 0, 1 and R chips are 2, 3
+    //      0 L controls 2 R and 1 L controls 3 R then, device_to_host_mem_channel_:
+    //          0 -> 0
+    //          2 -> 1
+    //          1 -> 0
+    //          3 -> 1
+    std::unordered_map device_to_host_mem_channel_;
+
     tt_device_dram_address_params dram_address_params = {
         DRAM_BARRIER_BASE
     };
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index 70b56ad34f4..d3ed04e9a2d 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -335,10 +335,6 @@ Device *CreateDevice(chip_id_t device_id, const std::vector& l1_bank_r
 }
 
 bool CloseDevice(Device *device) {
-    // TODO: ALMEET Needed to ensure that CQ doesn't contain a closed device
-    // if (detail::GLOBAL_CQ) {
-    //     detail::GLOBAL_CQ.reset(nullptr);
-    // }
     return device->close();
 }
 