Skip to content

Commit

Permalink
#8450: Establish tunnels originating from an mmio device. Determine t…
Browse files Browse the repository at this point in the history
…he remote chips as well as their order on the tunnel.
  • Loading branch information
ubcheema committed May 15, 2024
1 parent 5871c4c commit 4232c5d
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class CommandQueueSingleCardFixture : public ::testing::Test {
auto enable_remote_chip = getenv("TT_METAL_ENABLE_REMOTE_CHIP");
arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());

const chip_id_t mmio_device_id = 1;
const chip_id_t mmio_device_id = 0;
reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id});
if (enable_remote_chip) {
for (const auto &[id, device] : reserved_devices_) {
Expand Down
3 changes: 2 additions & 1 deletion tt_metal/common/core_descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id,
}

std::vector<RelativeCoreCoord> dispatch_cores;
for (const auto& core_node : desc_yaml["dispatch_cores"]) {
auto dispatch_cores_string = tt::Cluster::instance().is_galaxy_cluster() ? "tg_dispatch_cores" : "dispatch_cores";
for (const auto& core_node : desc_yaml[dispatch_cores_string]) {
RelativeCoreCoord coord = {};
if (core_node.IsSequence()) {
// Logical coord
Expand Down
10 changes: 9 additions & 1 deletion tt_metal/core_descriptors/wormhole_b0_80_arch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ galaxy:
dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]]

tg_dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]]

dispatch_core_type:
"tensix"

Expand Down Expand Up @@ -47,14 +50,19 @@ nebula_x1:

compute_with_storage_grid_range: # Logical only start and end [x, y]
start: [0, 0]
end: [7, 7]
end: [7, 3]

storage_cores:
[]

dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]]

tg_dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1],
[0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2],
[0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]]

dispatch_core_type:
"tensix"

Expand Down
71 changes: 70 additions & 1 deletion tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,74 @@ void Device::clear_l1_state() {
// TODO: clear idle eriscs as well
}

void Device::setup_tunnel_for_remote_devices() {
chip_id_t mmio_device_id = this->id_;
std::vector<std::vector<chip_id_t>> tunnel_chips = {};
uint32_t num_tunnels = tt::Cluster::instance().get_mmio_device_tunnel_count(mmio_device_id);
if (num_tunnels == 0) {
//no remote device conected to this mmio device.
return;
}

tunnel_chips.resize(num_tunnels);

auto tunnels_from_mmio = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id);
uint32_t index = 0;
for (auto tunnel : tunnels_from_mmio) {
for (auto remote_dev : tunnel) {
log_info(tt::LogMetal, "MMIO Device {} : Tunnel {} : Device {}", mmio_device_id, index, remote_dev);
}
index++;
}

/*
for (const chip_id_t &device_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(this->id_)) {
uint32_t tunnel_stop = tt::Cluster::instance().get_device_tunnel_depth(device_id);
if (tunnel_stop == 0) {
// mmio device. i.e start of tunnel.
tunnel_chips.resize(tt::Cluster::instance().get_mmio_device_max_tunnel_depth(device_id) + 1);// account for mmio device.
for (const chip_id_t &tunnel_dev_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(this->id_)) {
uint32_t stop_index = tt::Cluster::instance().get_device_tunnel_depth(tunnel_dev_id);
tunnel_chips[stop_index] = tunnel_dev_id;
log_info(tt::LogMetal, " MMIO Device {} : Controlled Device {} : stop_index {}", mmio_device_id, tunnel_dev_id, stop_index);
}
} else {
// a remote device.
// tunnel_stop hops away.
uint8_t num_hw_cqs = 1;
uint32_t cq_id = 0;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
CoreType dispatch_core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(mmio_device_id);
tt_cxy_pair prefetch_location = dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id);
tt_cxy_pair dispatch_location = dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device_id, channel, cq_id);
if (tunnel_stop == 1) {
//need to allocate mux/demux on mmio chip only once.
//all tunnel stops, share the same mux/demux on mmio chip.
tt_cxy_pair mux_location = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id);
tt_cxy_pair demux_location = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id);
CoreCoord mux_physical_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
CoreCoord demux_physical_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
}
TT_ASSERT(tunnel_chips[tunnel_stop] == device_id,
"Tunnel Stop Device Ids dont match. {} but it is expected to be on device {}", tunnel_chips[tunnel_stop], device_id);
TT_ASSERT(tunnel_stop != 0, "Tunnel Stop Cannot Be 0");
tt_cxy_pair us_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(tunnel_chips[tunnel_stop - 1], tunnel_chips[tunnel_stop], channel, cq_id);
CoreCoord tunneler_logical_core = CoreCoord(us_location.x, us_location.y);
TT_ASSERT(us_location.chip == tunnel_chips[tunnel_stop - 1],
"Upstream Tunneler is on device {} but it is expected to be on device {}", us_location.chip, tunnel_chips[tunnel_stop - 1]);
//get it from dispatch core manager. like tunneler_core.
CoreCoord r_tunneler_logical_core = std::get<1>(tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(us_location.chip, CoreCoord(us_location.x, us_location.y))));
CoreCoord r_tunneler_physical_core = tt::Cluster::instance().ethernet_core_from_logical_core(device_id, r_tunneler_logical_core); // should be device not this-> whichis mmio device.
CoreCoord tunneler_physical_core = this->ethernet_core_from_logical_core(us_location);
}
}
*/
}

// TODO (abhullar): Refactor this with #2593 to allow each target fast dispatch (FD) device to program their associated FD cores regardless of whether they are on the target device or not.
// Currently we have to program FD cores for the remote device when initializing the MMIO device because completion queue cores are on MMIO device
// and we don't have handle on MMIO device when initializing the remote device
Expand Down Expand Up @@ -501,6 +569,7 @@ void Device::compile_command_queue_programs() {
}
detail::CompileProgram(this, *command_queue_program_ptr);
this->command_queue_programs.push_back(std::move(command_queue_program_ptr));
this->setup_tunnel_for_remote_devices();
} else {
/////////////////Following section is for mmio device serving Remote Device
uint8_t num_hw_cqs = 1;
Expand Down Expand Up @@ -535,7 +604,7 @@ void Device::compile_command_queue_programs() {

tt_cxy_pair mux_location = dispatch_core_manager::get(num_hw_cqs).mux_core(device_id, channel, cq_id);
tt_cxy_pair demux_location = dispatch_core_manager::get(num_hw_cqs).demux_core(device_id, channel, cq_id);
tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(device_id, channel, cq_id);
tt_cxy_pair tunneler_location = dispatch_core_manager::get(num_hw_cqs).tunneler_core(mmio_device_id, device_id, channel, cq_id);
CoreCoord tunneler_logical_core = CoreCoord(tunneler_location.x, tunneler_location.y);
TT_ASSERT(tunneler_location.chip == mmio_device_id,
"Tunneler is on device {} but it is expected to be on device {}", tunneler_location.chip, mmio_device_id);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ class Device {
return tt::Cluster::instance().get_associated_mmio_device(this->id_) == this->id_;
}

void setup_tunnel_for_remote_devices();

uint32_t num_banks(const BufferType &buffer_type) const;
uint32_t bank_size(const BufferType &buffer_type) const;

Expand Down
17 changes: 11 additions & 6 deletions tt_metal/impl/dispatch/dispatch_core_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,22 @@ class dispatch_core_manager {
/// @param channel assigned to the command queue where commands are enqueued
/// @param cq_id ID of the command queue within the channel
/// @return tt_cxy_pair logical location (chip + core coordinate) of the ethernet tunnel core
const tt_cxy_pair &tunneler_core(chip_id_t device_id, uint16_t channel, uint8_t cq_id) {
const tt_cxy_pair &tunneler_core(chip_id_t upstream_device_id, chip_id_t device_id, uint16_t channel, uint8_t cq_id) {
dispatch_core_types_t &assignment = this->dispatch_core_assignments[device_id][channel][cq_id];
if (assignment.tunneler.has_value()) {
return assignment.tunneler.value();
}
TT_ASSERT(assignment.mux.has_value(), " Mux core not assigned for device {}. Must assign a Mux core before getting a tunneler core.", device_id);
//TT_ASSERT(assignment.mux.has_value(), " Mux core not assigned for device {}. Must assign a Mux core before getting a tunneler core.", device_id);

tt_cxy_pair tunneler_location = tt::Cluster::instance().get_eth_core_for_dispatch_core(
assignment.mux.value(), EthRouterMode::BI_DIR_TUNNELING, device_id);
assignment.tunneler = tunneler_location;
log_info(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", tunneler_location.str(), device_id);
//tt_cxy_pair tunneler_location = tt::Cluster::instance().get_eth_core_for_dispatch_core(
// assignment.mux.value(), EthRouterMode::BI_DIR_TUNNELING, device_id);

auto[us_core, ds_core] = tt::Cluster::instance().get_eth_tunnel_core(upstream_device_id, device_id, EthRouterMode::BI_DIR_TUNNELING);

assignment.tunneler = us_core;
assignment.tunneler_d = ds_core;

log_info(tt::LogMetal, "Allocated Tunneler Core: {} for Device {}", us_core.str(), device_id);
return assignment.tunneler.value();
}

Expand Down
89 changes: 85 additions & 4 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,6 @@ void Cluster::generate_cluster_descriptor() {
tt_ClusterDescriptor::create_for_grayskull_cluster(logical_mmio_device_ids, physical_mmio_device_ids);
} else {
this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(this->cluster_desc_path_);
YAML::Node yaml = YAML::LoadFile(this->cluster_desc_path_);
for (const auto &chip_id : this->cluster_desc_->get_all_chips()) {
if (this->cluster_desc_->get_board_type(chip_id) == BoardType::GALAXY) {
this->is_tg_cluster_ = true;
Expand Down Expand Up @@ -192,9 +191,8 @@ void Cluster::assert_risc_reset() {
void Cluster::assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids) {
// g_MAX_HOST_MEM_CHANNELS (4) is defined in tt_SiliconDevice and denotes the max number of host memory channels per MMIO device
// Metal currently assigns 1 channel per device. See https://github.com/tenstorrent/tt-metal/issues/4087
std::cout<<"controlled_device_ids.size() = "<<controlled_device_ids.size()<<std::endl;
// This should really be 9. By my TG has one gateway wormhole showing no connections to Galaxy.
// chip coordinates, 0, 1, 0, 0 has no connected ethernet links to Galaxy.

// TGG will have 16 remote chips + mmio chip accessed over one mmio chip.
TT_ASSERT(controlled_device_ids.size() <= 17, "Unable to assign each device to its own host memory channel!");
uint16_t channel = 0;
this->device_to_host_mem_channel_[mmio_device_id] = channel++;
Expand Down Expand Up @@ -643,6 +641,75 @@ std::unordered_map<chip_id_t, std::vector<CoreCoord>> Cluster::get_ethernet_core
return connected_chips;
}

// Returns the FD tunnels originating at `mmio_chip_id`: one vector per tunnel,
// each listing remote chip ids in hop order (entry 0 is one ethernet hop from
// the MMIO chip). Asserts that at least one tunnel exists and that all tunnels
// have the same depth. Returns {} if the chip has no ethernet connections or
// controls no remote devices.
std::vector<std::vector<chip_id_t>> Cluster::get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const {
std::vector<std::vector<chip_id_t>> tunnels_from_mmio = {};
const auto &all_eth_connections = this->cluster_desc_->get_ethernet_connections();
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(mmio_chip_id));

// No ethernet links at all from this chip: nothing to tunnel through.
if (all_eth_connections.find(mmio_chip_id) == all_eth_connections.end()) {
return {};
}

// Pool of chips served by this MMIO chip, excluding the MMIO chip itself.
std::set<chip_id_t> device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);

if (device_ids.size() == 0) {
return {};
}

// Pass 1: each remote chip directly ethernet-connected to the MMIO chip is the
// first stop of a distinct tunnel.
for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(mmio_chip_id)) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
if (device_ids.find(other_chip_id) != device_ids.end()) {
//mmio chip is connected to a remote chip in its mmio group.
//erase from the pool so multiple ethernet connections to same remote device do not
//pollute the counts.
device_ids.erase(other_chip_id);
std::vector<chip_id_t> first_stop = {other_chip_id};
auto it = std::find(tunnels_from_mmio.begin(), tunnels_from_mmio.end(), first_stop);
TT_ASSERT(it == tunnels_from_mmio.end(),"Duplicate first tunnel stop found when finding FD2 Tunnel devices.");
tunnels_from_mmio.push_back(first_stop);
}
}

log_info(tt::LogMetal, " Found {} FD Tunnels originating from MMIO Device {}", tunnels_from_mmio.size(), mmio_chip_id);

// Rebuild the pool, then drop the first-stop chips already placed on tunnels.
device_ids = get_devices_controlled_by_mmio_device(mmio_chip_id);
device_ids.erase(mmio_chip_id);

for (auto &tunnel : tunnels_from_mmio) {
TT_ASSERT(tunnel.size() == 1,"Tunnel depth must be 1 when it has only 1 stop in it.");
device_ids.erase(tunnel[0]);
}

// Pass 2: repeatedly extend each tunnel by one hop — a still-unplaced chip that
// is ethernet-connected to that tunnel's current tail — until the pool drains.
// NOTE(review): if some pooled chip is never reachable from any tunnel tail,
// `it` does not advance and, with TT_ASSERT compiled out, this loop will not
// terminate — confirm cluster descriptors guarantee full reachability.
bool tunneled_device_hit;
for (auto it = device_ids.begin(); it != device_ids.end();) {
tunneled_device_hit = false;
for (auto &dev_vec : tunnels_from_mmio) {
for (const auto &[eth_chan, connected_chip_chan] : all_eth_connections.at(dev_vec.back())) {
const auto &other_chip_id = std::get<0>(connected_chip_chan);
auto id_iter = device_ids.find(other_chip_id);
if (id_iter != device_ids.end()) {
// Next hop found for this tunnel; remove it from the pool. erase()
// returns the following iterator, keeping `it` valid for the outer loop.
it = device_ids.erase(id_iter);
dev_vec.push_back(other_chip_id);
tunneled_device_hit = true;
break;
}
}
}
TT_ASSERT(tunneled_device_hit || (it == device_ids.end()),"Loop Exit Error.");
}

TT_ASSERT(tunnels_from_mmio.size() != 0,"Must have at least 1 tunnel from MMIO Device.");
uint32_t tunnel_depth = tunnels_from_mmio[0].size();
log_info(tt::LogMetal, "Each FD Tunnel is {} deep.", tunnel_depth);

// Dispatch setup relies on every tunnel having uniform depth.
for (auto &dev_vec : tunnels_from_mmio)
TT_ASSERT(dev_vec.size() == tunnel_depth,"All tunnels from mmio device must have same depth. Found {}. Expected {}.", dev_vec.size(), tunnel_depth);

return tunnels_from_mmio;
}


// Ethernet cluster api
void Cluster::initialize_ethernet_sockets() {
for (const auto &chip_id : this->cluster_desc_->get_all_chips()) {
Expand Down Expand Up @@ -830,6 +897,20 @@ tt_cxy_pair Cluster::get_eth_core_for_dispatch_core(
return {};
}

// Finds the ethernet core pair forming a tunnel between two chips.
// Returns {upstream endpoint, downstream endpoint} as chip+core locations.
// Asserts (and returns empty pairs) when no routing core on the downstream
// chip with the requested mode connects back to the upstream chip.
std::tuple<tt_cxy_pair, tt_cxy_pair> Cluster::get_eth_tunnel_core(
    chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const {
    for (const auto &[ds_eth_core, ds_router_mode] : this->device_eth_routing_info_.at(downstream_chip_id)) {
        // One chip can tunnel bi-directionally to multiple peers, so resolve the
        // peer on the far side of this core and require it to be the upstream chip.
        const auto [peer_chip, peer_eth_core] =
            this->get_connected_ethernet_core(std::make_tuple(downstream_chip_id, ds_eth_core));
        const bool mode_matches = (ds_router_mode == mode);
        const bool peer_matches = (peer_chip == upstream_chip_id);
        if (mode_matches && peer_matches) {
            return {tt_cxy_pair(peer_chip, peer_eth_core), tt_cxy_pair(downstream_chip_id, ds_eth_core)};
        }
    }
    TT_ASSERT(false, "Cluster does not contain requested eth routing core");
    return {};
}

// TODO: ALLAN Can change to write one bit
void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing) const {
log_debug(tt::LogDevice, "Set internal routing bit {}", enable_internal_routing);
Expand Down
8 changes: 8 additions & 0 deletions tt_metal/llrt/tt_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ class Cluster {
tt_cxy_pair get_eth_core_for_dispatch_core(
tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const;

std::tuple<tt_cxy_pair, tt_cxy_pair> get_eth_tunnel_core(chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const;

// Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all devices in the
// cluster. When using multiple devices in a cluster, this should be the flow:
// CreateDevice(0)
Expand All @@ -169,6 +171,11 @@ class Cluster {
TT_ASSERT(this->devices_grouped_by_assoc_mmio_device_.count(mmio_device_id), "Expected device {} to be an MMIO device!", mmio_device_id);
return this->devices_grouped_by_assoc_mmio_device_.at(mmio_device_id);
}

// Returns vector of unique tunnels originating from mmio device.
// Each vector entry is another vector of remote devices on that tunnel.
std::vector<std::vector<chip_id_t>> get_tunnels_from_mmio_device(chip_id_t mmio_chip_id) const;

bool is_galaxy_cluster() const;

private:
Expand All @@ -194,6 +201,7 @@ class Cluster {
// Returns map of connected chip ids to active ethernet cores
std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped_by_connected_chips(
chip_id_t chip_id) const;

void initialize_ethernet_sockets();

ARCH arch_;
Expand Down
Loading

0 comments on commit 4232c5d

Please sign in to comment.