Skip to content

Commit

Permalink
More sysmem tests (#294)
Browse files Browse the repository at this point in the history
### Issue
#293

### Description
Adds more unit tests for sysmem. I need a reliable way to ensure it is
functioning as intended as work ramps up to enable IOMMU and provide a
transition away from huge pages.

### List of the changes
* Adds a second sysmem test for Wormhole that examines more of the
address space.
* Adds a Grayskull version of the simple Wormhole sysmem test.
* Some whitespace cleanup

### Testing
Manually tested; code paths are in CI

### API Changes
There are no API changes in this PR.
  • Loading branch information
joelsmithTT authored Nov 14, 2024
1 parent 8985708 commit e03bc51
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 70 deletions.
76 changes: 67 additions & 9 deletions tests/grayskull/test_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) {
// Iterate over devices and only setup static TLBs for functional worker cores
auto& sdesc = device.get_virtual_soc_descriptors().at(i);
for(auto& core : sdesc.workers) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
Expand Down Expand Up @@ -146,19 +146,19 @@ TEST(SiliconDriverGS, StaticTLB_RW) {
return flat_index;
};
std::set<chip_id_t> target_devices = {0};

uint32_t num_host_mem_ch_per_mmio_device = 1;
Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true);
for(int i = 0; i < target_devices.size(); i++) {
// Iterate over devices and only setup static TLBs for worker cores
auto& sdesc = device.get_virtual_soc_descriptors().at(i);
for(auto& core : sdesc.workers) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
device.start_device(default_params);
device.deassert_risc_reset();
Expand Down Expand Up @@ -188,7 +188,7 @@ TEST(SiliconDriverGS, StaticTLB_RW) {
address += 0x20; // Increment by uint32_t size for each write
}
}
device.close_device();
device.close_device();
}

TEST(SiliconDriverGS, DynamicTLB_RW) {
Expand Down Expand Up @@ -239,7 +239,6 @@ TEST(SiliconDriverGS, MultiThreadedDevice) {

uint32_t num_host_mem_ch_per_mmio_device = 1;
Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true);

tt_device_params default_params;
device.start_device(default_params);
device.deassert_risc_reset();
Expand Down Expand Up @@ -299,7 +298,7 @@ TEST(SiliconDriverGS, MultiThreadedDevice) {

TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
// Have 2 threads read and write from a single device concurrently
// All (fairly large) transactions go through a static TLB.
// All (fairly large) transactions go through a static TLB.
// We want to make sure the memory barrier is thread/process safe.

// Memory barrier flags get sent to address 0 for all channels in this test
Expand All @@ -317,12 +316,12 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
uint32_t num_host_mem_ch_per_mmio_device = 1;

Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true);

for(int i = 0; i < target_devices.size(); i++) {
// Iterate over devices and only setup static TLBs for functional worker cores
auto& sdesc = device.get_virtual_soc_descriptors().at(i);
for(auto& core : sdesc.workers) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
Expand Down Expand Up @@ -404,3 +403,62 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run

device.close_device();
}

/**
 * Grayskull version of the simple Wormhole sysmem test.
 *
 * Checks host system memory in both directions through the PCIe core:
 * first the host fills sysmem and the contents are read back via the device
 * path, then new data is written via the device path and compared against
 * what the host sees in sysmem.
 */
TEST(SiliconDriverGS, SysmemTestWithPcie) {
    Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"),
                    "",     // empty cluster descriptor path (test_utils::GetClusterDescYAML() is not used)
                    {0},    // chip 0 only
                    1,      // a single "host memory channel"; today that is a 1G huge page
                    false,  // skip driver allocs: no, do not skip them
                    true,   // clean system resources: yes
                    true);  // perform harvesting: yes

    // Default-constructed parameters; nothing special is needed for this test.
    cluster.start_device(tt_device_params{});

    const chip_id_t chip = 0;
    const auto pcie_xy = cluster.get_soc_descriptor(chip).pcie_cores.at(0);
    const tt_cxy_pair pcie_core(chip, pcie_xy.x, pcie_xy.y);

    // Arbitrarily chosen; kept small so the test runs quickly.
    const size_t num_bytes = 0x4000;

    // On Grayskull NOC0 the PCIe core sits at (x=0, y=4).
    ASSERT_EQ(pcie_xy.x, 0);
    ASSERT_EQ(pcie_xy.y, 4);

    // The API gives no indication of how large the region behind this pointer
    // is. Currently a 1G hugepage backs it, but that is not evident from the
    // API and may change in the future.
    uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0);
    ASSERT_NE(sysmem, nullptr);

    uint64_t pcie_base = cluster.get_pcie_base_addr_from_device(chip);

    // Staging buffer: sysmem is first read into it, then written from it.
    std::vector<uint8_t> staging(num_bytes, 0x0);

    // 1) Host side: put random bytes into sysmem.
    test_utils::fill_with_random_bytes(sysmem, num_bytes);

    // 2) Device side: read sysmem into the staging buffer.
    cluster.read_from_device(staging.data(), pcie_core, pcie_base, staging.size(), "REG_TLB");

    // 3) The staging buffer must now hold exactly what sysmem holds.
    const std::vector<uint8_t> sysmem_after_read(sysmem, sysmem + num_bytes);
    ASSERT_EQ(staging, sysmem_after_read);

    // 4) Replace the staging buffer contents with fresh random bytes.
    test_utils::fill_with_random_bytes(staging.data(), num_bytes);

    // 5) Device side: write the staging buffer over sysmem.
    cluster.write_to_device(staging.data(), staging.size(), pcie_core, pcie_base, "REG_TLB");

    // 5b) Read sysmem back into a scratch buffer whose contents are never
    // inspected. The intent is to make sure the write above has completed
    // before sysmem is compared with the staging buffer.
    std::vector<uint8_t> scratch(num_bytes, 0x0);
    cluster.read_from_device(scratch.data(), pcie_core, pcie_base, scratch.size(), "REG_TLB");

    // 6) sysmem must now hold exactly what was written.
    const std::vector<uint8_t> sysmem_after_write(sysmem, sysmem + num_bytes);
    ASSERT_EQ(staging, sysmem_after_write);
}
4 changes: 3 additions & 1 deletion tests/microbenchmark/device_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include "device/tt_soc_descriptor.h"
#include "tests/test_utils/generate_cluster_desc.hpp"

using tt::umd::Cluster;

class uBenchmarkFixture : public ::testing::Test {
protected:
void SetUp() override {
Expand All @@ -34,7 +36,7 @@ class uBenchmarkFixture : public ::testing::Test {
// Iterate over devices and only setup static TLBs for functional worker cores
auto& sdesc = device->get_virtual_soc_descriptors().at(i);
for(auto& core : sdesc.workers) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device->configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE);
}
}
Expand Down
16 changes: 15 additions & 1 deletion tests/test_utils/device_test_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
#pragma once

#include <cstdint>
#include <vector>
#include <random>
#include <string>
#include <vector>

#include "cluster.h"

Expand All @@ -27,4 +28,17 @@ static void read_data_from_device(tt_device& device, std::vector<uint32_t> &vec,
device.read_from_device(vec.data(), core, addr, size, tlb_to_use);
}

/**
 * Fills the `n` bytes starting at `data` with pseudo-random values.
 *
 * The generator is a function-local static std::mt19937_64 seeded once from
 * std::random_device, so successive calls continue the same random stream.
 * Access to that shared generator is not synchronized here.
 *
 * Bytes are stored one at a time, so `data` may have any alignment and no
 * object of a wider type is ever written through the pointer.
 *
 * @param data Destination buffer; must be valid for `n` bytes (may be null
 *             only when `n` is 0, in which case nothing is written).
 * @param n    Number of bytes to fill.
 */
inline void fill_with_random_bytes(uint8_t* data, size_t n)
{
    static std::random_device rd;
    static std::mt19937_64 gen(rd());

    constexpr size_t word_bytes = sizeof(uint64_t);
    const size_t bulk_bytes = (n / word_bytes) * word_bytes;

    // Bulk: one 64-bit draw supplies eight consecutive bytes, low-order byte
    // first, which keeps the number of generator calls small.
    size_t pos = 0;
    while (pos < bulk_bytes) {
        uint64_t word = gen();
        for (size_t b = 0; b < word_bytes; ++b) {
            data[pos++] = static_cast<uint8_t>(word);
            word >>= 8;
        }
    }

    // Tail: fewer than eight bytes remain; each takes the low byte of a
    // fresh draw.
    for (; pos < n; ++pos) {
        data[pos] = static_cast<uint8_t>(gen());
    }
}

}
Loading

0 comments on commit e03bc51

Please sign in to comment.