Skip to content

Commit

Permalink
Change tt_cluster to use single tt_SiliconDevice (#13949)
Browse files Browse the repository at this point in the history
Ticket
First step towards #13948

What's changed
Changing tt_cluster so that it uses a single tt_SiliconDevice (to be renamed to tt::umd::Cluster) for all connected chips.
In UMD, the following changes were made prior to this change:

Some API tests added to track current API usage.
Made a test which explicitly shows our current driver doesn't support multiple separate clusters of chips (e.g. two N300 cards not connected through ethernet).
chip_id added to setup_core_to_tlb_map, since each mmio chip has its own set of TLBs
Changed wait_for_non_mmio_flush to be able to accept chip_id, so it can run only for a specific remote chip.
chip_id added to get_pcie_base_addr_from_device
dynamic_tlb_config removed from constructor (this might be checked in with another PR before this one, since it is already in master)
Related UMD PRs
tenstorrent/tt-umd#166
tenstorrent/tt-umd#179
tenstorrent/tt-umd#165
tenstorrent/tt-umd#183
tenstorrent/tt-umd#131
tenstorrent/tt-umd#195
tenstorrent/tt-umd#247
  • Loading branch information
broskoTT authored Nov 3, 2024
1 parent 4924301 commit 97731c4
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 87 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@ uint32_t get_l1_size(tt::ARCH arch) {
constexpr uint32_t WH_L1_SIZE = 1499136;

uint32_t l1_size = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
l1_size = WH_L1_SIZE;
} else if (arch == tt::ARCH::GRAYSKULL) {
l1_size = GS_L1_SIZE;
Expand All @@ -701,7 +701,7 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_cl
double rpeak_tflops = 0.0f;
double clock = static_cast<double>(tt_npu_clock) / 1000;
uint32_t num_compute_core = grid_size.x * grid_size.y;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
rpeak_tflops =
WH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
} else if (arch == tt::ARCH::GRAYSKULL) {
Expand Down Expand Up @@ -777,7 +777,7 @@ CoreCoord get_core_range(
std::tuple<MathFidelity, bool> get_compute_params(tt::ARCH arch) {
MathFidelity math_fidelity = MathFidelity::HiFi4;
bool fp32_dest_acc_en = false;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
math_fidelity = MathFidelity::HiFi2;
// TODO: apply packer_l1_acc
// TODO: need to consider whether to set these variablias as arguments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/llrt/tlb_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_
}
}

device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); });
device_driver.setup_core_to_tlb_map(mmio_device_id, get_static_tlb_index);
}

} // namespace ll_api
98 changes: 44 additions & 54 deletions tt_metal/llrt/tt_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,18 +236,16 @@ void Cluster::generate_cluster_descriptor() {
void Cluster::initialize_device_drivers() {
for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices);
}

this->open_driver(mmio_device_id, controlled_devices);
this->open_driver();

tt_device_params default_params;
this->start_driver(mmio_device_id, default_params);
}
tt_device_params default_params;
this->start_driver(default_params);
}

void Cluster::assert_risc_reset() {
for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) {
this->get_driver(mmio_device_id).assert_risc_reset();
}
this->driver_->assert_risc_reset();
}

void Cluster::assign_mem_channels_to_devices(
Expand Down Expand Up @@ -276,40 +274,40 @@ void Cluster::get_metal_desc_from_tt_desc(
}
}

void Cluster::open_driver(
chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs) {
void Cluster::open_driver(const bool &skip_driver_allocs) {
const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_);

std::unique_ptr<tt_device> device_driver;
if (this->target_type_ == TargetDevice::Silicon) {
std::unordered_set<chip_id_t> all_chips = this->cluster_desc_->get_all_chips();
std::set<chip_id_t> all_chips_set(all_chips.begin(), all_chips.end());
// This is the target/desired number of mem channels per arch/device.
// Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than
// available. Metal currently assigns 1 channel per device
uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
if (is_tg_cluster_) {
num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS;
}
// Silicon driver will attempt to open this many hugepages as channels per mmio chip,
// and assert if workload uses more than available.
uint32_t num_host_mem_ch_per_mmio_device = std::min(HOST_MEM_CHANNELS, (uint32_t)all_chips_set.size());
// This will remove harvested rows from the soc descriptor
const bool perform_harvesting = true;
const bool clean_system_resources = true;
device_driver = std::make_unique<tt_SiliconDevice>(
sdesc_path,
this->cluster_desc_path_,
controlled_device_ids,
all_chips_set,
num_host_mem_ch_per_mmio_device,
skip_driver_allocs,
clean_system_resources,
perform_harvesting);
if (this->arch_ == tt::ARCH::WORMHOLE_B0 and not this->is_galaxy_cluster()) {
// Give UMD Limited access to eth cores 8 and 9 for Non-Galaxy Wormhole Clusters
device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {});
for (const auto &[mmio_device_id, _]: this->cluster_desc_->get_chips_with_mmio()) {
device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {});
}
}
device_driver->set_driver_host_address_params(host_address_params);
device_driver->set_driver_eth_interface_params(eth_interface_params);

// Adding this check is a workaround for current UMD bug that only uses this getter to populate private metadata
// that is later expected to be populated by unrelated APIs
TT_FATAL(device_driver->get_target_mmio_device_ids().size() == 1, "Only one target mmio device id allowed.");
// TT_FATAL(device_driver->get_target_mmio_device_ids().size() == 1, "Only one target mmio device id allowed.");
} else if (this->target_type_ == TargetDevice::Simulator) {
device_driver = std::make_unique<tt_SimulationDevice>(sdesc_path);
}
Expand All @@ -319,29 +317,28 @@ void Cluster::open_driver(

this->get_metal_desc_from_tt_desc(
device_driver->get_virtual_soc_descriptors(), device_driver->get_harvesting_masks_for_soc_descriptors());
this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver);
this->driver_ = std::move(device_driver);
}

void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const {
void Cluster::start_driver(tt_device_params &device_params) const {
device_params.init_device = true;

TT_FATAL(this->sdesc_per_chip_.size(), "Descriptor must be loaded. Try open_driver()");

if (this->target_type_ == TargetDevice::Silicon && device_params.init_device) {
ll_api::configure_static_tlbs(
this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id));
for (const auto [mmio_device_id, _]: this->cluster_desc_->get_chips_with_mmio()) {
ll_api::configure_static_tlbs(
this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), *this->driver_);
}
}

this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params);
this->driver_->start_device(device_params);
}

Cluster::~Cluster() {
log_info(tt::LogDevice, "Closing user mode device drivers");
for (const auto &[mmio_device_id, device_driver] : this->mmio_device_id_to_driver_) {
device_driver->close_device();
}
this->driver_->close_device();

this->mmio_device_id_to_driver_.clear();
this->sdesc_per_chip_.clear();
this->devices_grouped_by_assoc_mmio_device_.clear();
this->device_to_mmio_device_.clear();
Expand All @@ -350,11 +347,6 @@ Cluster::~Cluster() {
this->tunnels_from_mmio_device.clear();
}

tt_device &Cluster::get_driver(chip_id_t device_id) const {
chip_id_t mmio_device_id = this->device_to_mmio_device_.at(device_id);
return *(this->mmio_device_id_to_driver_.at(mmio_device_id));
}

std::unordered_map<chip_id_t, eth_coord_t> Cluster::get_user_chip_ethernet_coordinates() const {
auto user_chip_ethernet_coordinates = this->cluster_desc_->get_chip_locations();
if (this->is_galaxy_cluster()) {
Expand All @@ -380,7 +372,7 @@ uint32_t Cluster::get_harvested_rows(chip_id_t chip) const {
if (this->target_type_ == TargetDevice::Simulator) {
return 0;
} else {
return this->get_driver(chip).harvested_rows_per_target.at(chip);
return this->driver_->harvested_rows_per_target.at(chip);
}
}

Expand All @@ -406,7 +398,7 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const {
// get_clocks returns MMIO device ID -> clock frequency
// There is one driver per MMIO device, so we use that to index returned map
chip_id_t mmio_device_id = this->device_to_mmio_device_.at(chip_id);
return this->get_driver(chip_id).get_clocks().at(mmio_device_id);
return this->driver_->get_clocks().at(mmio_device_id);
}
TT_THROW("Cannot get frequency for device {} that is not initialized!", chip_id);
return 0;
Expand All @@ -415,13 +407,13 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const {
void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const {
const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip);
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord);
this->get_driver(virtual_chip_coord.chip).deassert_risc_reset_at_core(virtual_chip_coord);
this->driver_->deassert_risc_reset_at_core(virtual_chip_coord);
}

void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const {
const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip);
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord);
this->get_driver(virtual_chip_coord.chip).assert_risc_reset_at_core(virtual_chip_coord);
this->driver_->assert_risc_reset_at_core(virtual_chip_coord);
}

inline uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset) {
Expand Down Expand Up @@ -481,10 +473,9 @@ void Cluster::write_core(
tt::watcher_sanitize_host_noc_write(soc_desc, {core.x, core.y}, addr, sz_in_bytes);
}
tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core);
this->get_driver(chip_id).write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB");
if (this->get_driver(chip_id).get_target_remote_device_ids().find(virtual_core.chip) !=
this->get_driver(chip_id).get_target_remote_device_ids().end()) {
this->get_driver(chip_id).wait_for_non_mmio_flush();
this->driver_->write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB");
if (this->cluster_desc_->is_chip_remote(chip_id)) {
this->driver_->wait_for_non_mmio_flush(chip_id);
}
}

Expand All @@ -498,7 +489,7 @@ void Cluster::read_core(
}

tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core);
this->get_driver(chip_id).read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB");
this->driver_->read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB");
}

void Cluster::read_core(
Expand All @@ -516,10 +507,9 @@ void Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64
tt::watcher_sanitize_host_noc_write(soc_desc, {target.x, target.y}, addr, size_in_bytes);
}
tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target);
this->get_driver(chip_id).write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB");
if (this->get_driver(chip_id).get_target_remote_device_ids().find(virtual_target.chip) !=
this->get_driver(chip_id).get_target_remote_device_ids().end()) {
this->get_driver(chip_id).wait_for_non_mmio_flush();
this->driver_->write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB");
if (this->cluster_desc_->is_chip_remote(chip_id)) {
this->driver_->wait_for_non_mmio_flush(chip_id);
}
}

Expand All @@ -532,19 +522,19 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr
tt::watcher_sanitize_host_noc_read(soc_desc, {target.x, target.y}, addr, size_in_bytes);
}
tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target);
this->get_driver(chip_id).read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
this->driver_->read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB");
}

void Cluster::write_sysmem(
const void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id);
this->driver_->write_to_sysmem(vec, size_in_bytes, addr, channel & HOST_MEM_CHANNELS_MASK, src_device_id);
}

void Cluster::read_sysmem(
void *vec, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
this->get_driver(src_device_id).read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id);
this->driver_->read_from_sysmem(vec, addr, channel & HOST_MEM_CHANNELS_MASK, size_in_bytes, src_device_id);
}

void Cluster::verify_sw_fw_versions(
Expand Down Expand Up @@ -574,7 +564,7 @@ void Cluster::dram_barrier(chip_id_t chip_id) const {
for (uint32_t channel = 0; channel < this->get_soc_desc(chip_id).get_num_dram_channels(); channel++) {
dram_channels.insert(channel);
}
this->get_driver(chip_id).dram_membar(chip_id, "LARGE_WRITE_TLB", dram_channels);
this->driver_->dram_membar(chip_id, "LARGE_WRITE_TLB", dram_channels);
}

// L1 barrier is used to implement host-to-device synchronization and should be used when all previous writes to L1 need
Expand All @@ -583,26 +573,26 @@ void Cluster::dram_barrier(chip_id_t chip_id) const {
// binaries, metadata, and data to compute on are committed before launching kernels
void Cluster::l1_barrier(chip_id_t chip_id) const {
// Sets and resets L1 barrier of all tensix cores and ethernet cores
this->get_driver(chip_id).l1_membar(chip_id, "LARGE_WRITE_TLB");
this->driver_->l1_membar(chip_id, "LARGE_WRITE_TLB");
}

uint32_t Cluster::get_num_host_channels(chip_id_t device_id) const {
bool mmio_capable = this->cluster_desc_->is_chip_mmio_capable(device_id);
return mmio_capable ? this->get_driver(device_id).get_num_host_channels(device_id) : 0;
return mmio_capable ? this->driver_->get_num_host_channels(device_id) : 0;
}

uint32_t Cluster::get_host_channel_size(chip_id_t device_id, uint32_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(device_id));
return this->get_driver(device_id).get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK);
return this->driver_->get_host_channel_size(device_id, channel & HOST_MEM_CHANNELS_MASK);
}

void *Cluster::host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const {
TT_ASSERT(this->cluster_desc_->is_chip_mmio_capable(src_device_id));
return this->get_driver(src_device_id).host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK);
return this->driver_->host_dma_address(offset, src_device_id, channel & HOST_MEM_CHANNELS_MASK);
}

uint64_t Cluster::get_pcie_base_addr_from_device(chip_id_t chip_id) const {
return this->get_driver(chip_id).get_pcie_base_addr_from_device();
return this->driver_->get_pcie_base_addr_from_device(chip_id);
}

std::unordered_map<chip_id_t, std::vector<CoreCoord>> Cluster::get_ethernet_cores_grouped_by_connected_chips(
Expand Down
Loading

0 comments on commit 97731c4

Please sign in to comment.