Skip to content

Commit

Permalink
Ihamer/7468 inject noc delays (#8889)
Browse files Browse the repository at this point in the history
* #7468: add support for delays

* #7468: Bug fixes. Improved hashing.

* #7468: Add atomic transaction category

* #7468: documentation for debug delay

* #7468: Spelling, wording

* #7468: merge with main

* #7468: fix typo

* #7468: fix rst formatting

* #7468: Break core selection for each of transaction types

* #7468: test delay feature

* #7468: Fix to use hash function

Instead of get_feature_enabled, use get_feature_hash_string for hash
computation.

Comments and documentation wording as well.

* #7468: Update tracy to match main

* #7468: Save/restore target selection in test fixture

* Update docs/source/tt-metalium/tools/watcher.rst

Co-authored-by: David Ma <[email protected]>

* #7468: Fix invocation example

* #7468: Use riscv_wait instead of busy loop.

* #7468: Print warning on wrong core coordinates

* #7468: Fix end of lines

* #7468: Update the docs for the change from loop iterations to clk cycles

---------

Co-authored-by: David Ma <[email protected]>
  • Loading branch information
ihamer-tt and tt-dma authored May 31, 2024
1 parent 872db88 commit fbd0602
Show file tree
Hide file tree
Showing 15 changed files with 990 additions and 542 deletions.
21 changes: 21 additions & 0 deletions docs/source/tt-metalium/tools/watcher.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,24 @@ watcher log:
0x00000020,0x0000001f,0x0000001e,0x0000001d,0x0000001c,0x0000001b,0x0000001a,0x00000019,
0x00000018,0x00000017,0x00000016,0x00000015,0x00000014,0x00000013,0x00000012,0x00000011,
0x00000010,0x0000000f,0x0000000e,0x0000000d,0x0000000c,0x0000000b,0x0000000a]
Debug Delays
------------
Watcher can insert NOC transaction delays for debugging purposes. These delays can be specified by
transaction type and location. Environment variable `TT_METAL_WATCHER_DELAY` specifies the number
of clock cycles to wait for. Similarly to DPRINT, the delay can be set for all cores, or a
or a subset by setting environment variable `TT_METAL_*_DEBUG_DELAY_CORES`: x,y OR (x1,y1),(x2,y2),(x3,y3) OR (x1,y1)-(x2,y2) OR all.
The * can be one of: READ, WRITE or ATOMIC indicating whether the delays will be inserted before read, write or atomic NOC
transactions. Finally, the delay can be set for a specific RISCs (BRISC, NCRISC, TRISC0, TRISC1, TRISC2) through the
environment variable `TT_METAL_*_DEBUG_DELAY_RISCVS`: (one of: BR,NC,TR0,TR1,TR2); if not set, the delay
is applied to all RISCs.
Note that `TT_METAL_WATCHER` must be set and `TT_METAL_WATCHER_DISABLE_NOC_SANITIZE` must not be
set for the delays to be applied.

For example, the following command will run test_eltwise_binary with a delay of 10 iterations added to both READ and WRITE
transactions on BRISC core at location 0,0:

.. code-block::
TT_METAL_WATCHER=1 TT_METAL_WATCHER_DEBUG_DELAY=10 TT_METAL_READ_DEBUG_DELAY_CORES=0,0 TT_METAL_WRITE_DEBUG_DELAY_CORES=0,0 TT_METAL_READ_DEBUG_DELAY_RISCVS=BR TT_METAL_WRITE_DEBUG_DELAY_RISCVS=BR ./build/test/tt_metal/test_eltwise_binary
1 change: 1 addition & 0 deletions tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(UNIT_TESTS_COMMON_SRC
${CMAKE_CURRENT_SOURCE_DIR}/dram/test_dram.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_assert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize_delays.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_pause.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_ringbuf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_waypoint.cpp
Expand Down
28 changes: 14 additions & 14 deletions tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ class DPrintFixture: public CommonFixture {
// The core range (physical) needs to be set >= the set of all cores
// used by all tests using this fixture, so set dprint enabled for
// all cores and all devices
tt::llrt::OptionsG.set_dprint_enabled(true);
tt::llrt::OptionsG.set_dprint_all_cores(CoreType::WORKER, true);
tt::llrt::OptionsG.set_dprint_all_cores(CoreType::ETH, true);
tt::llrt::OptionsG.set_dprint_all_chips(true);
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true);
tt::llrt::OptionsG.set_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, true);
tt::llrt::OptionsG.set_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, true);
tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true);
// Send output to a file so the test can check after program is run.
tt::llrt::OptionsG.set_dprint_file_name(dprint_file_name);
tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name);
tt::llrt::OptionsG.set_test_mode_enabled(true);
watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled();
tt::llrt::OptionsG.set_watcher_enabled(false);
Expand All @@ -49,7 +49,7 @@ class DPrintFixture: public CommonFixture {
disabled[core_desc.dispatch_core_type].insert(core);
}
}
tt::llrt::OptionsG.set_dprint_disabled_cores(disabled);
tt::llrt::OptionsG.set_feature_disabled_cores(tt::llrt::RunTimeDebugFeatureDprint, disabled);

ExtraSetUp();

Expand All @@ -65,12 +65,12 @@ class DPrintFixture: public CommonFixture {
std::remove(dprint_file_name.c_str());

// Reset DPrint settings
tt::llrt::OptionsG.set_dprint_cores({});
tt::llrt::OptionsG.set_dprint_enabled(false);
tt::llrt::OptionsG.set_dprint_all_cores(CoreType::WORKER, false);
tt::llrt::OptionsG.set_dprint_all_cores(CoreType::ETH, false);
tt::llrt::OptionsG.set_dprint_all_chips(false);
tt::llrt::OptionsG.set_dprint_file_name("");
tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {});
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false);
tt::llrt::OptionsG.set_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, false);
tt::llrt::OptionsG.set_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, false);
tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false);
tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, "");
tt::llrt::OptionsG.set_test_mode_enabled(false);
tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled);
}
Expand All @@ -97,7 +97,7 @@ class DPrintFixtureDisableDevices: public DPrintFixture {
protected:
void ExtraSetUp() override {
// For this test, mute each devices using the environment variable
tt::llrt::OptionsG.set_dprint_all_chips(false);
tt::llrt::OptionsG.set_dprint_chip_ids({});
tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false);
tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {});
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <thread>
#include "common_fixture.hpp"
#include "impl/debug/watcher_server.hpp"
#include "llrt/rtoptions.hpp"

// A version of CommonFixture with watcher enabled
class WatcherFixture: public CommonFixture {
Expand Down Expand Up @@ -84,3 +85,40 @@ class WatcherFixture: public CommonFixture {
tt::watcher_clear_log();
}
};

// A version of WatcherFixture with read and write debug delays enabled
class WatcherDelayFixture : public WatcherFixture {
public:
tt::llrt::TargetSelection saved_target_selection[tt::llrt::RunTimeDebugFeatureCount];

std::map<CoreType, std::vector<CoreCoord>> delayed_cores;

void SetUp() override {
tt::llrt::OptionsG.set_watcher_debug_delay(5000000);
delayed_cores[CoreType::WORKER] = {{0, 0}, {1, 1}};

// Store the previous state of the watcher features
saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay);
saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay);
saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay);

// Enable read and write debug delay for the test core
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureReadDebugDelay, true);
tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureReadDebugDelay, delayed_cores);
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, true);
tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, delayed_cores);

// Call parent
WatcherFixture::SetUp();
}

void TearDown() override {
// Call parent
WatcherFixture::TearDown();

// Restore
tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay]);
tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay]);
tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay]);
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ TEST(DPrintErrorChecking, TestPrintInvalidCore) {
// device setup, but not the print server should simply ignore the invalid cores.
std::map<CoreType, std::vector<CoreCoord>> dprint_cores;
dprint_cores[CoreType::WORKER] = {{0, 0}, {1, 1}, {100, 100}};
tt::llrt::OptionsG.set_dprint_cores(dprint_cores); // Only (100, 100) is invalid.
tt::llrt::OptionsG.set_dprint_enabled(true);
tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, dprint_cores);
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true);

const int device_id = 0;
Device* device = nullptr;
Expand All @@ -27,6 +27,6 @@ TEST(DPrintErrorChecking, TestPrintInvalidCore) {
// We expect that even though illegal worker cores were requested, device setup did not hang.
// So just make sure that device setup worked and then close the device.
EXPECT_TRUE(device != nullptr);
tt::llrt::OptionsG.set_dprint_enabled(false);
tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false);
tt::tt_metal::CloseDevice(device);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "llrt/rtoptions.hpp"
#include "watcher_fixture.hpp"
#include "test_utils.hpp"
#include "llrt/llrt.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
#include "common/bfloat16.hpp"
#include "hostdevcommon/common_runtime_address_map.h"

//////////////////////////////////////////////////////////////////////////////////////////
// A test for checking watcher NOC sanitization.
//////////////////////////////////////////////////////////////////////////////////////////
using namespace tt;
using namespace tt::tt_metal;

// Incrementally populate a vector with bfloat16 values starting from a given float value
// and incrementing by 1.0f for each element.
void inc_populate(std::vector<std::uint32_t>& vec, float start_from) {
float val = start_from;
for (std::uint32_t i = 0; i < vec.size(); i++) {
bfloat16 num_1_bfloat16 = bfloat16(val);
val = val + 1.0f;
bfloat16 num_2_bfloat16 = bfloat16(val);
val = val + 1.0f;
vec.at(i) = pack_two_bfloat16_into_uint32(std::pair<bfloat16, bfloat16>(num_1_bfloat16, num_2_bfloat16));
}
}

void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord &core) {
tt_metal::Program program = tt_metal::CreateProgram();

const uint32_t SINGLE_TILE_SIZE = 2 * 1024;
const uint32_t NUM_TILES = 4;
const uint32_t DRAM_BUFFER_SIZE = SINGLE_TILE_SIZE * NUM_TILES; // NUM_TILES of FP16_B, hard-coded in the reader/writer kernels
const uint32_t PAGE_SIZE = DRAM_BUFFER_SIZE;

tt_metal::InterleavedBufferConfig dram_config{
.device=device,
.size = DRAM_BUFFER_SIZE,
.page_size = PAGE_SIZE,
.buffer_type = tt_metal::BufferType::DRAM
};

auto src0_dram_buffer = CreateBuffer(dram_config);
uint32_t dram_buffer_src0_addr = src0_dram_buffer->address();
auto src1_dram_buffer = CreateBuffer(dram_config);
uint32_t dram_buffer_src1_addr = src1_dram_buffer->address();
auto dst_dram_buffer = CreateBuffer(dram_config);
uint32_t dram_buffer_dst_addr = dst_dram_buffer->address();

auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates();
auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates();
auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();

uint32_t src0_cb_index = 0;
uint32_t num_input_tiles = 2;
tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src0_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src0_cb_index, SINGLE_TILE_SIZE);
auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);

uint32_t src1_cb_index = 1;
tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src1_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src1_cb_index, SINGLE_TILE_SIZE);
auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config);

uint32_t ouput_cb_index = 16; // output operands start at index 16
uint32_t num_output_tiles = 2;
tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * SINGLE_TILE_SIZE, {{ouput_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(ouput_cb_index, SINGLE_TILE_SIZE);
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);

auto binary_reader_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp",
core,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});

auto unary_writer_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/writer_unary.cpp",
core,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});

vector<uint32_t> compute_kernel_args = { };

std::map<string, string> binary_defines = {
{ "ELTWISE_OP", "add_tiles" },
{ "ELTWISE_OP_CODE", "0" }
};
auto eltwise_binary_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/compute/eltwise_binary.cpp",
core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines});

SetRuntimeArgs(
program,
eltwise_binary_kernel,
core,
{NUM_TILES, 1}
);

float constant = 0.0f;
float start_from = 0.0f;
std::vector<uint32_t> src0_vec = create_constant_vector_of_bfloat16(DRAM_BUFFER_SIZE, constant);
std::vector<uint32_t> src1_vec = create_constant_vector_of_bfloat16(DRAM_BUFFER_SIZE, 0.0f);
inc_populate(src1_vec, start_from);
std::vector<uint32_t> expected_vec = create_constant_vector_of_bfloat16(DRAM_BUFFER_SIZE, 0.0f);
inc_populate(expected_vec, start_from + constant);

CommandQueue& cq = device->command_queue();

EnqueueWriteBuffer(cq, std::ref(src0_dram_buffer), src0_vec, false);
EnqueueWriteBuffer(cq, std::ref(src1_dram_buffer), src1_vec, false);

vector<uint32_t> reader_args = {
dram_buffer_src0_addr,
(std::uint32_t)dram_src0_noc_xy.x,
(std::uint32_t)dram_src0_noc_xy.y,
NUM_TILES,
dram_buffer_src1_addr,
(std::uint32_t)dram_src1_noc_xy.x,
(std::uint32_t)dram_src1_noc_xy.y,
NUM_TILES,
0};

vector<uint32_t> writer_args = {
dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, NUM_TILES
};

SetRuntimeArgs(program, unary_writer_kernel, core, writer_args);
SetRuntimeArgs(program, binary_reader_kernel, core, reader_args);

EnqueueProgram(cq, program, false);
std::vector<uint32_t> result_vec;
EnqueueReadBuffer(cq, dst_dram_buffer, result_vec, true);

// Print the feedback generated by debug_delay functionality
std::vector<uint32_t> read_vec;

CoreCoord worker_core = fixture->delayed_cores[CoreType::WORKER][0]; // Just check that the first delayed core has the feedback set
CoreCoord phys_core = device->physical_core_from_logical_core({0,0}, CoreType::WORKER);
read_vec = tt::llrt::read_hex_vec_from_core (
device->id(),
phys_core,
GET_MAILBOX_ADDRESS_HOST(debug_insert_delays), sizeof(debug_insert_delays_msg_t));

log_info(tt::LogTest, "Read back debug_insert_delays: 0x{:x}", read_vec[0]);
EXPECT_TRUE((read_vec[0] >> 24) == 0x3);
}

TEST_F(WatcherDelayFixture, TestWatcherSanitizeInsertDelays) {
if (this->slow_dispatch_)
GTEST_SKIP();

this->RunTestOnDevice(
[](WatcherFixture *fixture, Device *device){
CoreCoord core{0, 0};
RunDelayTestOnCore(dynamic_cast<WatcherDelayFixture*>(fixture), device, core);
},
this->devices_[0]
);
}
2 changes: 2 additions & 0 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "risc_attribs.h"
#include "third_party/umd/device/tt_silicon_driver_common.hpp"
#include "debug/assert.h"
#include "dev_msgs.h"

extern uint8_t noc_index;

Expand Down Expand Up @@ -1602,6 +1603,7 @@ void noc_semaphore_inc(uint64_t addr, uint32_t incr) {
*/
DEBUG_STATUS("NSIW");
DEBUG_SANITIZE_NOC_ADDR(addr, 4);
DEBUG_INSERT_DELAY(TransactionAtomic);
noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, addr, NOC_UNICAST_WRITE_VC, incr, 31 /*wrap*/, false /*linked*/, false /*posted*/);
DEBUG_STATUS("NSID");
}
Expand Down
Loading

0 comments on commit fbd0602

Please sign in to comment.