Skip to content

Commit

Permalink
#2934: Assign devices to specific host mem channels
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Dec 4, 2023
1 parent facc2ac commit cb22adb
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 20 deletions.
1 change: 1 addition & 0 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ namespace tt::tt_metal{
static vector<std::once_flag> vflags( Device::detect_num_available_devices() );
chip_id_t id = device->id();
TT_FATAL(id < command_queues.size(), "Invalid device {} detected", id);
TT_FATAL(device->is_initialized(), "Cannot access command queue for closed device {}", id);
std::call_once(vflags[id], [&device](){
command_queues[device->id()] = std::make_unique<CommandQueue>(device); });

Expand Down
6 changes: 4 additions & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,7 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {
);

// Create system memory writer for this device to have an associated interface to hardware command queue (i.e. hugepage)
const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
this->sysmem_writer = std::make_unique<SystemMemoryWriter>(
this->id_,
[&, this]() -> const std::set<CoreCoord>& { return this->dispatch_cores(); },
Expand All @@ -305,6 +304,9 @@ bool Device::close() {
this->clear_l1_state();
tt::Cluster::instance().l1_barrier(id_);
allocator::clear(*this->allocator_);
if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
this->sysmem_writer.reset(nullptr);
}

this->active_devices_.deactivate_device(this->id_);

Expand Down
18 changes: 13 additions & 5 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,9 @@ CommandQueue::CommandQueue(Device* device) {
vector<uint32_t> pointers(CQ_START / sizeof(uint32_t), 0);
pointers[0] = CQ_START >> 4;

tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, 0);
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);

this->device = device;
}
Expand Down Expand Up @@ -699,15 +701,18 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
uint32_t padded_page_size = align(buffer.page_size(), 32);
uint32_t data_size_in_bytes = padded_page_size * num_pages;

chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());

if ((buffer.page_size() % 32) != 0) {
// If page size is not 32B-aligned, we cannot do a contiguous copy
uint32_t dst_address_offset = 0;
for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < data_size_in_bytes; sysmem_address_offset += padded_page_size) {
tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, 0);
tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, mmio_device_id, channel);
dst_address_offset += buffer.page_size();
}
} else {
tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, 0);
tt::Cluster::instance().read_sysmem(dst, data_size_in_bytes, command.read_buffer_addr, mmio_device_id, channel);
}
}

Expand Down Expand Up @@ -801,15 +806,18 @@ void CommandQueue::finish() {
FinishCommand command(this->device, *this->device->sysmem_writer);
this->enqueue_command(command, false);

chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());

// We then poll to check that we're done.
uint32_t finish;
do {
tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
} while (finish != 1);

// Reset this value to 0 before moving on
finish = 0;
tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, 0);
tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel);
}

void CommandQueue::wrap() {
Expand Down
7 changes: 5 additions & 2 deletions tt_metal/impl/dispatch/command_queue_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ using namespace tt::tt_metal;

// Reads the 32-bit value stored at HOST_CQ_READ_PTR in system memory for `chip_id` and returns it.
//
// The read goes through the MMIO device associated with `chip_id` and targets the host memory
// channel assigned to `chip_id`, so each device reads from its own channel.
// Both lookups use Cluster accessors that call `at()`; an unknown `chip_id` therefore throws.
inline uint32_t get_cq_rd_ptr(chip_id_t chip_id) {
    const auto &cluster = tt::Cluster::instance();
    const chip_id_t mmio_device_id = cluster.get_associated_mmio_device(chip_id);
    const uint16_t channel = cluster.get_assigned_channel_for_device(chip_id);

    // Initialised so a failed/short read never returns an indeterminate value.
    uint32_t recv = 0;
    cluster.read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_READ_PTR, mmio_device_id, channel);
    return recv;
}

Expand Down Expand Up @@ -49,7 +51,8 @@ class SystemMemoryWriter {
SystemMemoryWriter(chip_id_t device_id, const std::function<const std::set<CoreCoord> &()>& dispatch_cores, const std::function<CoreCoord (CoreCoord)>& worker_from_logical) :
device_id(device_id),
m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
hugepage_start((char*) tt::Cluster::instance().host_dma_address(0, device_id, 0)), // TODO: Uplift this api to translate device_id to nearest mmio device id
hugepage_start(
(char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
fast_write_callable(
tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)),
dispatch_cores_callable(dispatch_cores),
Expand Down
30 changes: 25 additions & 5 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ void Cluster::generate_cluster_descriptor() {

void Cluster::initialize_device_drivers() {
for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);

this->open_driver(mmio_device_id, controlled_devices);

tt_device_params default_params;
Expand All @@ -128,6 +130,25 @@ void Cluster::initialize_device_drivers() {
}
}

// Assigns one host memory channel to `mmio_device_id` and to each device in `controlled_device_ids`,
// storing the result in `device_to_host_mem_channel_`.
//
// The MMIO device always receives channel 0; every other device in the set receives the next free
// channel (1, 2, ...) in the set's iteration order. Numbering restarts at 0 on every call, so channels
// are unique only within the group controlled by one MMIO device.
void Cluster::assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids) {
    // g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per MMIO device
    // Metal currently assigns 1 channel per device. See https://github.com/tenstorrent-metal/tt-metal/issues/4087
    constexpr uint32_t max_host_mem_channels = 4;
    TT_ASSERT(controlled_device_ids.size() <= max_host_mem_channels, "Unable to assign each device to its own host memory channel!");

    uint16_t channel = 0;
    this->device_to_host_mem_channel_[mmio_device_id] = channel++;
    for (const chip_id_t device_id : controlled_device_ids) {
        // The MMIO device already owns channel 0; do not hand it a second channel.
        if (device_id != mmio_device_id) {
            this->device_to_host_mem_channel_[device_id] = channel++;
        }
    }
}

void Cluster::get_metal_desc_from_tt_desc(
const std::unordered_map<chip_id_t, tt_SocDescriptor> &input,
const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks) {
Expand Down Expand Up @@ -499,14 +520,13 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
}

void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
constexpr uint16_t channel = 0;
void Cluster::write_sysmem(const void* vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel, src_device_id);
}

void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const {
// TODO: Uplift
constexpr uint16_t channel = 0;
void Cluster::read_sysmem(void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel, size_in_bytes, src_device_id);
}

Expand Down
25 changes: 23 additions & 2 deletions tt_metal/llrt/tt_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ class Cluster {
void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;
void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const;

void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id) const;
void write_sysmem(const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;
void read_sysmem(void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const;

int get_device_aiclk(const chip_id_t &chip_id) const;

Expand Down Expand Up @@ -127,6 +127,15 @@ class Cluster {
// Returns connected ethernet core on the other chip
std::tuple<chip_id_t, CoreCoord> get_connected_ethernet_core(std::tuple<chip_id_t, CoreCoord> eth_core) const;

// Looks up the (logical) MMIO device ID that controls `device_id`; an MMIO device maps to itself.
// The lookup uses `at()`, so an ID that is absent from `device_to_mmio_device_` throws std::out_of_range.
chip_id_t get_associated_mmio_device(chip_id_t device_id) const {
    const auto &mmio_device_by_device = this->device_to_mmio_device_;
    return mmio_device_by_device.at(device_id);
}

// Returns the host memory channel recorded for `device_id` in `device_to_host_mem_channel_`.
// The lookup uses `at()`, so an ID with no recorded channel throws std::out_of_range.
uint16_t get_assigned_channel_for_device(chip_id_t device_id) const {
    const auto &channel_by_device = this->device_to_host_mem_channel_;
    return channel_by_device.at(device_id);
}

uint32_t get_tensix_soft_reset_addr() const;

private:
Expand All @@ -136,6 +145,7 @@ class Cluster {
void detect_arch_and_target();
void generate_cluster_descriptor();
void initialize_device_drivers();
void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids);
void open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs = false);
void start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const;

Expand Down Expand Up @@ -164,6 +174,17 @@ class Cluster {
// Save mapping of device id to associated MMIO device id for fast lookup
std::unordered_map<chip_id_t, chip_id_t> device_to_mmio_device_;

// Currently, each device is mapped to its own channel in host memory to enable fast dispatch
// Channels are unique within a group of devices all controlled by a particular MMIO device
// For example:
// Two N300 cards where MMIO device IDs are 0, 1 and R chips are 2, 3
// 0 L controls 2 R and 1 L controls 3 R then, device_to_host_mem_channel_:
// 0 -> 0
// 2 -> 1
// 1 -> 0
// 3 -> 1
std::unordered_map<chip_id_t, uint16_t> device_to_host_mem_channel_;

tt_device_dram_address_params dram_address_params = {
DRAM_BARRIER_BASE
};
Expand Down
4 changes: 0 additions & 4 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,6 @@ Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_r
}

// Closes `device` by delegating to Device::close() and returns that call's result.
// `device` is dereferenced unconditionally, so callers must pass a non-null pointer.
bool CloseDevice(Device *device) {
return device->close();
}

Expand Down

0 comments on commit cb22adb

Please sign in to comment.