Skip to content

Commit

Permalink
#8081: Fix watcher assuming NoC-0 when reporting errors
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-dma committed May 20, 2024
1 parent 4f3349a commit bfbc0f9
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
#include "common/bfloat16.hpp"
#include "hostdevcommon/common_runtime_address_map.h"

//////////////////////////////////////////////////////////////////////////////////////////
// A test for checking watcher NOC sanitization.
Expand All @@ -21,7 +22,7 @@ typedef enum sanitization_features {
SanitizeAlignmentDRAM
} watcher_features_t;

void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, bool is_eth_core, watcher_features_t feature) {
void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, bool is_eth_core, watcher_features_t feature, bool use_ncrisc = false) {
// Set up program
Program program = Program();
CoreCoord phys_core;
Expand Down Expand Up @@ -52,6 +53,8 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo

auto input_dram_noc_xy = input_dram_buffer->noc_coordinates();
auto output_dram_noc_xy = output_dram_buffer->noc_coordinates();
log_info("Input DRAM: {}", input_dram_noc_xy);
log_info("Output DRAM: {}", output_dram_noc_xy);

// A DRAM copy kernel, we'll feed it incorrect inputs to test sanitization.
KernelHandle dram_copy_kernel;
Expand All @@ -77,8 +80,8 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo
"tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp",
core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.processor = (use_ncrisc) ? tt_metal::DataMovementProcessor::RISCV_1 : tt_metal::DataMovementProcessor::RISCV_0,
.noc = (use_ncrisc) ? tt_metal::NOC::RISCV_1_default : tt_metal::NOC::RISCV_0_default,
.defines=dram_copy_kernel_defines
}
);
Expand Down Expand Up @@ -147,15 +150,27 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo
break;
case SanitizeAlignmentL1:
case SanitizeAlignmentDRAM:
{
// NoC-1 has a different coordinate for the same DRAM
const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
int noc = (use_ncrisc) ? 1 : 0;
CoreCoord target_phys_core = {
NOC_0_X(noc, soc_d.grid_size.x, input_dram_noc_xy.x),
NOC_0_Y(noc, soc_d.grid_size.y, input_dram_noc_xy.y)
};
string risc_name = (is_eth_core) ? "erisc" : "brisc";
if (use_ncrisc)
risc_name = "ncrisc";
expected = fmt::format(
"Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc0 tried to access DRAM core w/ physical coords {} DRAM[addr=0x{:08x},len=102400], misaligned with local L1[addr=0x{:08x}]",
"Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc{} tried to access DRAM core w/ physical coords {} DRAM[addr=0x{:08x},len=102400], misaligned with local L1[addr=0x{:08x}]",
device->id(),
(is_eth_core) ? "ethnet" : "worker",
core.x, core.y, phys_core.x, phys_core.y,
(is_eth_core) ? "erisc" : "brisc", input_dram_noc_xy.str(),
risc_name, noc, target_phys_core,
input_dram_buffer_addr,
l1_buffer_addr
);
}
break;
default:
log_warning(LogTest, "Unrecognized feature to test ({}), skipping...", feature);
Expand Down Expand Up @@ -251,6 +266,18 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAM) {
);
}

TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAMNCrisc) {
if (this->slow_dispatch_)
GTEST_SKIP();
this->RunTestOnDevice(
[](WatcherFixture *fixture, Device *device){
CoreCoord core{0, 0};
RunTestOnCore(fixture, device, core, false, SanitizeAlignmentDRAM, true);
},
this->devices_[0]
);
}

TEST_F(WatcherFixture, TestWatcherSanitizeEth) {
if (this->slow_dispatch_)
GTEST_SKIP();
Expand Down
4 changes: 4 additions & 0 deletions tt_metal/hostdevcommon/common_runtime_address_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,7 @@ constexpr static std::uint32_t DRAM_BARRIER_BASE = 0;
constexpr static std::uint32_t DRAM_ALIGNMENT = 32;
constexpr static std::uint32_t DRAM_BARRIER_SIZE = ((sizeof(uint32_t) + DRAM_ALIGNMENT - 1) / DRAM_ALIGNMENT) * DRAM_ALIGNMENT;
constexpr static std::uint32_t DRAM_UNRESERVED_BASE = DRAM_BARRIER_BASE + DRAM_BARRIER_SIZE; // Start of unreserved space

// Helper functions to convert NoC coordinates to NoC-0 coordinates, used in metal as "physical" coordinates.
#define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x-1-(x)))
#define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? (y) : (noc_size_y-1-(y)))
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/risc_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "limits.h"
#include "mod_div_lib.h"

#define NOC_X(x) (noc_index == 0 ? (x) : (noc_size_x-1-(x)))
#define NOC_Y(y) (noc_index == 0 ? (y) : (noc_size_y-1-(y)))
#define NOC_X(x) NOC_0_X(noc_index, noc_size_x, (x))
#define NOC_Y(y) NOC_0_Y(noc_index, noc_size_y, (y))

#define TILE_WORD_2_BIT ((256 + 64 + 32) >> 4)
#define TILE_WORD_4_BIT ((512 + 64 + 32) >> 4)
Expand Down
26 changes: 16 additions & 10 deletions tt_metal/impl/debug/watcher_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,14 @@ static void log_ring_buffer(Device *device, CoreCoord core) {
}
}

static std::pair<string, string> get_core_and_mem_type(Device *device, CoreCoord &phys_core) {
static std::pair<string, string> get_core_and_mem_type(Device *device, CoreCoord &noc_coord, int noc) {
// Get the physical coord from the noc coord
const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
CoreCoord phys_core = {
NOC_0_X(noc, soc_d.grid_size.x, noc_coord.x),
NOC_0_Y(noc, soc_d.grid_size.y, noc_coord.y)
};

CoreType core_type;
try {
core_type = device->core_type_from_physical_core(phys_core);
Expand Down Expand Up @@ -285,25 +292,24 @@ static string get_noc_target_str(
const debug_sanitize_noc_addr_msg_t* san
) {
string out = fmt::format("{} using noc{} tried to access ", get_riscv_name(core, san->which), noc);

if (san->multicast) {
CoreCoord target_phys_core_start = {NOC_MCAST_ADDR_START_X(san->noc_addr), NOC_MCAST_ADDR_START_Y(san->noc_addr)};
CoreCoord target_phys_core_end = {NOC_MCAST_ADDR_END_X(san->noc_addr), NOC_MCAST_ADDR_END_Y(san->noc_addr)};
auto type_and_mem = get_core_and_mem_type(device, target_phys_core_start);
CoreCoord target_phys_noc_core_start = {NOC_MCAST_ADDR_START_X(san->noc_addr), NOC_MCAST_ADDR_START_Y(san->noc_addr)};
CoreCoord target_phys_noc_core_end = {NOC_MCAST_ADDR_END_X(san->noc_addr), NOC_MCAST_ADDR_END_Y(san->noc_addr)};
auto type_and_mem = get_core_and_mem_type(device, target_phys_noc_core_start, noc);
out += fmt::format(
"{} core range w/ physical coords {}-{} {}",
type_and_mem.first,
target_phys_core_start.str(),
target_phys_core_end.str(),
target_phys_noc_core_start.str(),
target_phys_noc_core_end.str(),
type_and_mem.second
);
} else {
CoreCoord target_phys_core = {NOC_UNICAST_ADDR_X(san->noc_addr), NOC_UNICAST_ADDR_Y(san->noc_addr)};
auto type_and_mem = get_core_and_mem_type(device, target_phys_core);
CoreCoord target_phys_noc_core = {NOC_UNICAST_ADDR_X(san->noc_addr), NOC_UNICAST_ADDR_Y(san->noc_addr)};
auto type_and_mem = get_core_and_mem_type(device, target_phys_noc_core, noc);
out += fmt::format(
"{} core w/ physical coords {} {}",
type_and_mem.first,
target_phys_core.str(),
target_phys_noc_core.str(),
type_and_mem.second
);
}
Expand Down
11 changes: 4 additions & 7 deletions tt_metal/jit_build/genfiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,6 @@ void jit_build_genfiles_descriptors(const JitBuildEnv& env,
}
}

#define NOC_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x-1-(x)))
#define NOC_Y(noc_index, noc_size_y, y) (noc_index == 0 ? (y) : (noc_size_y-1-(y)))

std::string generate_bank_to_noc_coord_descriptor_string(
tt_xy_pair grid_size,
std::vector<CoreCoord>& dram_bank_map,
Expand Down Expand Up @@ -427,8 +424,8 @@ std::string generate_bank_to_noc_coord_descriptor_string(
for (unsigned int noc = 0; noc < 2; noc++) {
ss << " {" << "\t// noc=" << noc << endl;
for (unsigned int bank_id = 0; bank_id < dram_bank_map.size(); bank_id++) {
uint16_t noc_x = NOC_X(noc, grid_size.x, dram_bank_map[bank_id].x);
uint16_t noc_y = NOC_Y(noc, grid_size.y, dram_bank_map[bank_id].y);
uint16_t noc_x = NOC_0_X(noc, grid_size.x, dram_bank_map[bank_id].x);
uint16_t noc_y = NOC_0_Y(noc, grid_size.y, dram_bank_map[bank_id].y);
uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << (NOC_ADDR_LOCAL_BITS - 32);
ss << " " << xy << "," << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl;
}
Expand Down Expand Up @@ -482,8 +479,8 @@ std::string generate_bank_to_noc_coord_descriptor_string(
for (unsigned int noc = 0; noc < 2; noc++) {
ss << " {" << "\t// noc=" << noc << endl;
for (unsigned int bank_id = 0; bank_id < l1_bank_map.size(); bank_id++) {
uint16_t noc_x = NOC_X(noc, grid_size.x, l1_bank_map[bank_id].x);
uint16_t noc_y = NOC_Y(noc, grid_size.y, l1_bank_map[bank_id].y);
uint16_t noc_x = NOC_0_X(noc, grid_size.x, l1_bank_map[bank_id].x);
uint16_t noc_y = NOC_0_Y(noc, grid_size.y, l1_bank_map[bank_id].y);
uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << (NOC_ADDR_LOCAL_BITS - 32);
ss << " " << xy << "," << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl;
}
Expand Down

0 comments on commit bfbc0f9

Please sign in to comment.