diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/kernels/pull_from_pcie.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/kernels/pull_from_pcie.cpp index 9ae0f0adffb..9f94b540aaf 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/kernels/pull_from_pcie.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/kernels/pull_from_pcie.cpp @@ -17,7 +17,7 @@ void kernel_main() { volatile tt_l1_ptr uint32_t* done_address = reinterpret_cast(L1_UNRESERVED_BASE); while (done_address[0] == 0) { - uint64_t host_src_addr = get_noc_addr_helper(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y), pcie_read_ptr); + uint64_t host_src_addr = get_noc_addr_helper(NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y, NOC_INDEX), pcie_read_ptr); noc_async_read(host_src_addr, L1_UNRESERVED_BASE, read_sizeB); pcie_read_ptr += read_sizeB; if (pcie_read_ptr > pcie_base + pcie_sizeB) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/pcie_write_16b.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/pcie_write_16b.cpp index 05c4a338ff5..ac8945a4d6d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/pcie_write_16b.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/pcie_write_16b.cpp @@ -11,7 +11,7 @@ void kernel_main() { constexpr uint32_t base_pcie_dst_address = get_compile_time_arg_val(1); constexpr uint32_t num_16b_writes = get_compile_time_arg_val(2); - uint64_t pcie_core_noc_encoding = uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32; + uint64_t pcie_core_noc_encoding = uint64_t(NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y, NOC_INDEX)) << 32; uint32_t l1_src_address = base_l1_src_address; uint32_t pcie_dst_address = base_pcie_dst_address; diff --git a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h index 8b8e9ad1415..7f6529f9915 100644 --- a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h @@ -14,6 +14,9 @@ #define NOC_XY_ENCODING(x, y) \ ((((uint64_t)(y)) << (NOC_ADDR_LOCAL_BITS + NOC_ADDR_NODE_ID_BITS)) | (((uint64_t)(x)) << NOC_ADDR_LOCAL_BITS)) +#define NOC_XY_PCIE_ENCODING(x, y, noc_index) \ + NOC_XY_ENCODING(x, y) + #define NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end) \ ((((uint64_t)(x_start)) << (NOC_ADDR_LOCAL_BITS + 2 * NOC_ADDR_NODE_ID_BITS)) | \ (((uint64_t)(y_start)) << (NOC_ADDR_LOCAL_BITS + 3 * NOC_ADDR_NODE_ID_BITS)) | \ diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 91b1a26f8f3..12df89b03de 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -476,7 +476,7 @@ uint64_t get_l1_noc_addr(const uint32_t id, const uint32_t page_size, const uint } uint64_t get_system_memory_noc_addr(const uint32_t id, const uint32_t page_size, const uint32_t base_addr, const uint32_t offset = 0) { - constexpr static uint64_t pcie_core_noc_encoding = uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32; + uint64_t pcie_core_noc_encoding = uint64_t(NOC_XY_PCIE_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y), noc_index)) << 32; uint32_t addr = base_addr + page_size * id + offset; uint64_t noc_addr = pcie_core_noc_encoding | addr; return noc_addr; diff --git a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h index 3fa07c45294..ed13f98ea8f 100644 --- a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h +++ b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h @@ -12,6 +12,8 @@ // Address formats #define NOC_XY_ENCODING(x, y) ((((uint32_t)(y)) << (NOC_ADDR_NODE_ID_BITS)) | (((uint32_t)(x)))) +#define NOC_XY_PCIE_ENCODING(x, y, noc_index) NOC_XY_ENCODING(x, y) + #define NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end) \ ((x_start) << (2 * NOC_ADDR_NODE_ID_BITS)) | ((y_start) << (3 * NOC_ADDR_NODE_ID_BITS)) | (x_end) | \ ((y_end) << (NOC_ADDR_NODE_ID_BITS)) diff --git a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h index 0a2256ffeeb..f6b361d3ff3 100644 --- a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h @@ -9,13 +9,21 @@ #define PCIE_NOC_X 0 #define PCIE_NOC_Y 3 +#define PCIE_NOC1_X 9 +#define PCIE_NOC1_Y 8 + // Address formats #define NOC_XY_ENCODING(x, y) \ (((uint32_t)(y)) << ((NOC_ADDR_LOCAL_BITS % 32)+NOC_ADDR_NODE_ID_BITS)) | \ - (((uint32_t)(x)) << (NOC_ADDR_LOCAL_BITS % 32)) | ((x == PCIE_NOC_X and y == PCIE_NOC_Y) * 0x8) \ + (((uint32_t)(x)) << (NOC_ADDR_LOCAL_BITS % 32)) \ + +// Address formats +#define NOC_XY_PCIE_ENCODING(x, y, noc_index) \ + NOC_XY_ENCODING(x, y) | \ + ((noc_index ? (x == PCIE_NOC1_X and y == PCIE_NOC1_Y) : (x == PCIE_NOC_X and y == PCIE_NOC_Y)) * 0x8) \ #define NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end) \ - (((uint32_t)(x_start)) << ((NOC_ADDR_LOCAL_BITS % 32)+2*NOC_ADDR_NODE_ID_BITS)) | \ + (((uint32_t)(x_start)) << ((NOC_ADDR_LOCAL_BITS % 32)+2*NOC_ADDR_NODE_ID_BITS)) | \ (((uint32_t)(y_start)) << ((NOC_ADDR_LOCAL_BITS % 32)+3*NOC_ADDR_NODE_ID_BITS)) | \ (((uint32_t)(x_end)) << (NOC_ADDR_LOCAL_BITS % 32)) | \ (((uint32_t)(y_end)) << ((NOC_ADDR_LOCAL_BITS % 32)+NOC_ADDR_NODE_ID_BITS)) \ diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 75b525d0a91..8002bd01704 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -43,7 +43,7 @@ constexpr uint32_t is_h_variant = get_compile_time_arg_val(16); constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X, DOWNSTREAM_NOC_Y)); constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); -constexpr uint32_t pcie_noc_xy = uint32_t(NOC_XY_ENCODING(NOC_0_X(static_cast(NOC_INDEX), noc_size_x, PCIE_NOC_X), NOC_0_Y(static_cast(NOC_INDEX), noc_size_y, PCIE_NOC_Y))); +constexpr uint32_t pcie_noc_xy = uint32_t(NOC_XY_PCIE_ENCODING(NOC_0_X(static_cast(NOC_INDEX), noc_size_x, PCIE_NOC_X), NOC_0_Y(static_cast(NOC_INDEX), noc_size_y, PCIE_NOC_Y), NOC_INDEX)); constexpr uint32_t dispatch_cb_page_size = 1 << dispatch_cb_log_page_size; constexpr uint32_t completion_queue_end_addr = completion_queue_base_addr + completion_queue_size; diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 0ee658ad1c2..0124d992b2c 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -52,7 +52,7 @@ constexpr uint32_t is_h_variant = get_compile_time_arg_val(22); constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X, DOWNSTREAM_NOC_Y)); -constexpr uint32_t pcie_noc_xy = uint32_t(NOC_XY_ENCODING(NOC_0_X(static_cast(NOC_INDEX), noc_size_x, PCIE_NOC_X), NOC_0_Y(static_cast(NOC_INDEX), noc_size_y, PCIE_NOC_Y))); +constexpr uint32_t pcie_noc_xy = uint32_t(NOC_XY_PCIE_ENCODING(NOC_0_X(static_cast(NOC_INDEX), noc_size_x, PCIE_NOC_X), NOC_0_Y(static_cast(NOC_INDEX), noc_size_y, PCIE_NOC_Y), NOC_INDEX)); constexpr uint32_t downstream_cb_page_size = 1 << downstream_cb_log_page_size; constexpr uint32_t downstream_cb_end = downstream_cb_base + (1 << downstream_cb_log_page_size) * downstream_cb_pages; constexpr uint32_t prefetch_q_end = prefetch_q_base + prefetch_q_size; diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.hpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.hpp deleted file mode 100644 index 036316ee43a..00000000000 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.hpp +++ /dev/null @@ -1,674 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -// Common prefetch code for use by _hd, _h, _d prefetch variants - -#include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_metal/impl/dispatch/kernels/cq_common.hpp" - -extern const uint32_t scratch_db_top[2]; - - -template -FORCE_INLINE -void write_downstream(uint32_t& data_ptr, - uint32_t& downstream_data_ptr, - uint32_t length) { - - uint32_t remaining = cb_end - downstream_data_ptr; - if (length > remaining) { - if (remaining > 0) { - noc_async_write(data_ptr, get_noc_addr_helper(downstream_noc_xy, downstream_data_ptr), remaining); - data_ptr += remaining; - length -= remaining; - } - downstream_data_ptr = cb_base; - } - - noc_async_write(data_ptr, get_noc_addr_helper(downstream_noc_xy, downstream_data_ptr), length); - downstream_data_ptr += length; -} - -template -FORCE_INLINE -void read_from_pcie(volatile tt_l1_ptr uint16_t *& prefetch_q_rd_ptr, - uint32_t& pending_read_size, - uint32_t& fence, - uint32_t& pcie_read_ptr, - uint32_t cmd_ptr, - uint32_t size) { - - // Wrap cmddat_q - if (fence + size + preamble_size > cmddat_q_base + cmddat_q_size) { - // only wrap if there are no commands ready, otherwise we'll leave some on the floor - // TODO: does this matter for perf? - if (cmd_ptr != fence) { - return; - } - fence = cmddat_q_base; - } - - // Wrap pcie/hugepage - if (pcie_read_ptr + size > pcie_base + pcie_size) { - pcie_read_ptr = pcie_base; - } - - uint64_t host_src_addr = get_noc_addr_helper(NOC_XY_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y)), pcie_read_ptr); - noc_async_read(host_src_addr, fence + preamble_size, size); - pending_read_size = size + preamble_size; - pcie_read_ptr += size; - - *prefetch_q_rd_ptr = 0; - - // Tell host we read - *(volatile tt_l1_ptr uint32_t *) prefetch_q_rd_ptr_addr = (uint32_t)prefetch_q_rd_ptr; - - prefetch_q_rd_ptr++; - - // Wrap prefetch_q - if ((uint32_t)prefetch_q_rd_ptr == prefetch_q_end) { - prefetch_q_rd_ptr = (volatile tt_l1_ptr uint16_t*)prefetch_q_base; - } -} - -// This routine can be called in 8 states based on the boolean values cmd_ready, prefetch_q_ready, read_pending: -// - !cmd_ready, !prefetch_q_ready, !read_pending: stall on prefetch_q, issue read, read barrier -// - !cmd_ready, !prefetch_q_ready, read pending: read barrier (and re-evaluate prefetch_q_ready) -// - !cmd_ready, prefetch_q_ready, !read_pending: issue read, read barrier (XXXX +issue read after?) -// - !cmd_ready, prefetch_q_ready, read_pending: read barrier, issue read -// - cmd_ready, !prefetch_q_ready, !read_pending: exit -// - cmd_ready, !prefetch_q_ready, read_pending: exit (no barrier yet) -// - cmd_ready, prefetch_q_ready, !read_pending: issue read -// - cmd_ready, prefetch_q_ready, read_pending: exit (don't add latency to the in flight request) -// -// With WH tagging of reads: -// open question: should fetcher loop on prefetch_q_ready issuing reads until !prefetch_q_ready -// - !cmd_ready, !prefetch_q_ready, !read_pending: stall on prefetch_q, issue read, read barrier -// - !cmd_ready, !prefetch_q_ready, read pending: read barrier on oldest tag -// - !cmd_ready, prefetch_q_ready, !read_pending: issue read, read barrier (XXXX +retry after?) -// - !cmd_ready, prefetch_q_ready, read_pending: issue read, read barrier on oldest tag -// - cmd_ready, !prefetch_q_ready, !read_pending: exit -// - cmd_ready, !prefetch_q_ready, read_pending: exit (no barrier yet) -// - cmd_ready, prefetch_q_ready, !read_pending: issue and tag read -// - cmd_ready, prefetch_q_ready, read_pending: issue and tag read -template -void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_ptr) { - - static uint32_t pending_read_size = 0; - static volatile tt_l1_ptr uint16_t* prefetch_q_rd_ptr = (volatile tt_l1_ptr uint16_t*)prefetch_q_base; - - if (fence < cmd_ptr) { - DPRINT << "wrap cmd ptr1 " << fence << " " << cmd_ptr << ENDL(); - cmd_ptr = fence; - } - - bool cmd_ready = (cmd_ptr != fence); - uint32_t fetch_size = (uint32_t)*prefetch_q_rd_ptr << prefetch_q_log_minsize; - - if (fetch_size != 0 && pending_read_size == 0) { - DPRINT << "read1: " << (uint32_t)prefetch_q_rd_ptr << " " << " " << fence << " " << fetch_size << ENDL(); - read_from_pcie - (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size); - } - if (!cmd_ready) { - if (pending_read_size != 0) { - DPRINT << "barrier" << ENDL(); - noc_async_read_barrier(); - - // wrap the cmddat_q - if (fence < cmd_ptr) { - cmd_ptr = fence; - } - - fence += pending_read_size; - pending_read_size = 0; - // After the stall, re-check the host - fetch_size = (uint32_t)*prefetch_q_rd_ptr << prefetch_q_log_minsize; - if (fetch_size != 0) { - read_from_pcie - (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size); - } - } else { - // By here, prefetch_q_ready must be false - // Nothing to fetch, nothing pending, nothing available, stall on host - DEBUG_STATUS("HQW"); - DPRINT << "prefetcher stall" << ENDL(); - while ((fetch_size = *prefetch_q_rd_ptr) == 0); - DPRINT << "recurse" << ENDL(); - fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr); - DEBUG_STATUS("HQD"); - } - } -} - -template -uint32_t process_debug_cmd(uint32_t cmd_ptr) { - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - uint32_t checksum = 0; - uint32_t data_start = (uint32_t)cmd + sizeof(CQPrefetchCmd); - uint32_t *data = (uint32_t *)data_start; - uint32_t size = cmd->debug.size; - - uint32_t front_size = (size <= cmddat_end - data_start) ? size : cmddat_end - data_start; - for (uint32_t i = 0; i < front_size / sizeof(uint32_t); i++) { - checksum += *data++; - } - uint32_t back_size = size - front_size; - if (back_size > 0) { - data = (uint32_t *)cmddat_base; - for (uint32_t i = 0; i < back_size / sizeof(uint32_t); i++) { - checksum += *data++; - } - } - - if (checksum != cmd->debug.checksum) { - DEBUG_STATUS("!CHK"); - ASSERT(0); - } - - return cmd->debug.stride; -} - -template -static uint32_t process_relay_inline_cmd(uint32_t cmd_ptr, - uint32_t& dispatch_data_ptr) { - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - - uint32_t length = cmd->relay_inline.length; - uint32_t data_ptr = cmd_ptr + sizeof(CQPrefetchCmd); - - uint32_t npages = (length + cb_page_size - 1) >> cb_log_page_size; - - // Assume the downstream buffer is big relative to cmddat command size that we can - // grab what we need in one chunk - cb_acquire_pages(npages); - - uint32_t remaining = cmddat_end - data_ptr; - if (cmddat_wrap_enable && length > remaining) { - // wrap cmddat - write_downstream(data_ptr, dispatch_data_ptr, remaining); - length -= remaining; - data_ptr = cmddat_base; - } - - DPRINT << my_noc_xy << " " << dispatch_noc_xy << " " << cb_base << ENDL(); - write_downstream(data_ptr, dispatch_data_ptr, length); - - // Round to nearest page - dispatch_data_ptr += (cb_page_size - (dispatch_data_ptr & (cb_page_size - 1))) & (cb_page_size - 1); - - // XXXXX - painful syncing right now? move this into get_cmds - noc_async_writes_flushed(); - cb_release_pages(npages); - - return cmd->relay_inline.stride; -} - -// This version of inline sends inline data to the dispatcher but doesn't flush the page to the dispatcher -// This is used to assemble dispatcher commands when data comes out of band, eg, reading from DRAM -// That means this command is stateful, incorrect use will be...bad -// NOTE: this routine assumes we're sending a command header and that is LESS THAN A PAGE -template -static uint32_t process_relay_inline_noflush_cmd(uint32_t cmd_ptr, - uint32_t& dispatch_data_ptr) { - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - - uint32_t length = sizeof(CQDispatchCmd); - uint32_t data_ptr = cmd_ptr + sizeof(CQPrefetchCmd); - - cb_acquire_pages(1); - if (dispatch_data_ptr == cb_end) { - dispatch_data_ptr = cb_base; - } - noc_async_write(data_ptr, get_noc_addr_helper(dispatch_noc_xy, dispatch_data_ptr), length); - dispatch_data_ptr += length; - - return CQ_PREFETCH_CMD_BARE_MIN_SIZE; -} - -template -static uint32_t write_pages_to_dispatcher(uint32_t& dispatch_data_ptr, - uint32_t& scratch_write_addr, - uint32_t& amt_to_write) { - - uint32_t page_residual_space = dispatch_cb_page_size - (dispatch_data_ptr & (dispatch_cb_page_size - 1)); - uint32_t npages = (amt_to_write - page_residual_space + dispatch_cb_page_size + extra_space - 1) / dispatch_cb_page_size; - - // Grabbing all pages at once is ok if scratch_size < 3 * dispatch_cb_block_size - if (!test_for_nonzero || npages != 0) { - cb_acquire_pages(npages); - } - - uint64_t noc_addr = get_noc_addr_helper(dispatch_noc_xy, dispatch_data_ptr); - if (dispatch_data_ptr == dispatch_cb_end) { - dispatch_data_ptr = dispatch_cb_base; - } else if (dispatch_data_ptr + amt_to_write > dispatch_cb_end) { // wrap - uint32_t last_chunk_size = dispatch_cb_end - dispatch_data_ptr; - noc_async_write(scratch_write_addr, noc_addr, last_chunk_size); - dispatch_data_ptr = dispatch_cb_base; - scratch_write_addr += last_chunk_size; - amt_to_write -= last_chunk_size; - noc_addr = get_noc_addr_helper(dispatch_noc_xy, dispatch_data_ptr); - } - - noc_async_write(scratch_write_addr, noc_addr, amt_to_write); - dispatch_data_ptr += amt_to_write; - - return npages; -} - -// This fn prefetches data from DRAM memory and writes data to the dispatch core. -// Reading from DRAM has the following characteristics: -// - latency is moderately high ~400 cycles on WH -// - DRAM bw is ~maximized when page size reaches 2K -// - for kernel dispatch, it is expected that page sizes will often be <2K -// - for buffer writing, page sizes will vary -// - writing to dispatcher works best with 4K pages (2K pages cover overhead, 4K gives perf cushion) -// - writing a 4K page takes ~32*4=128 cycles -// - writing 4 4K pages is 512 cycles, close to parity w/ the latency of DRAM -// - to hide the latency (~12% overhead), assume we need to read ~32 pages=128K, double buffered -// - in other words, we'll never achieve high efficiency and always be (somewhat) latency bound -// Algorithm does: -// - read a batch from DRAM -// - loop: read a batch from DRAM while sending to dispatcher -// - send a batch to dispatcher -// The size of the first read should be based on latency. With small page sizes -// bandwidth will be low and we'll be DRAM bound (send to dispatcher is ~free). -// With larger pages we'll get closer to a bandwidth match -// The dispatch buffer is a ring buffer. -template -uint32_t process_relay_paged_cmd(uint32_t cmd_ptr, - uint32_t& dispatch_data_ptr) { - - // This ensures that a previous cmd using the scratch buf has finished - noc_async_writes_flushed(); - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - uint32_t page_id = cmd->relay_paged.start_page; - uint32_t base_addr = cmd->relay_paged.base_addr; - uint32_t page_size = cmd->relay_paged.page_size; - uint32_t pages = cmd->relay_paged.pages; - uint32_t read_length = pages * page_size; - - InterleavedAddrGen addr_gen; - addr_gen.bank_base_address = base_addr; - addr_gen.page_size = page_size; - - // First step - read into DB0 - uint32_t scratch_read_addr = scratch_db_top[0]; - uint32_t amt_to_read = (scratch_db_half_size > read_length) ? read_length : scratch_db_half_size; - uint32_t amt_read = 0; - while (amt_to_read >= page_size) { - uint64_t noc_addr = addr_gen.get_noc_addr(page_id); // XXXX replace this w/ walking the banks to save mul on GS - noc_async_read(noc_addr, scratch_read_addr, page_size); - scratch_read_addr += page_size; - page_id++; - amt_to_read -= page_size; - amt_read += page_size; - } - noc_async_read_barrier(); - - // Second step - read into DB[x], write from DB[x], toggle x, iterate - // Writes are fast, reads are slow - uint32_t db_toggle = 0; - uint32_t scratch_write_addr; - read_length -= amt_read; - while (read_length != 0) { - // This ensures that writes from prior iteration are done - // TODO(pgk); we can do better on WH w/ tagging - noc_async_writes_flushed(); - - db_toggle ^= 1; - scratch_read_addr = scratch_db_top[db_toggle]; - scratch_write_addr = scratch_db_top[db_toggle ^ 1]; - - uint32_t amt_to_write = amt_read; - amt_to_read = (scratch_db_half_size > read_length) ? read_length : scratch_db_half_size; - amt_read = 0; - while (amt_to_read >= page_size) { - uint64_t noc_addr = addr_gen.get_noc_addr(page_id); // XXXX replace this w/ walking the banks to save mul on GS - noc_async_read(noc_addr, scratch_read_addr, page_size); - scratch_read_addr += page_size; - page_id++; - amt_to_read -= page_size; - amt_read += page_size; - } - - // Third step - write from DB - uint32_t npages = write_pages_to_dispatcher< - 0, - false, - my_noc_xy, - my_dispatch_cb_sem_id, - dispatch_noc_xy, - dispatch_cb_base, - dispatch_cb_end, - dispatch_cb_page_size>(dispatch_data_ptr, scratch_write_addr, amt_to_write); - cb_release_pages(npages); - - read_length -= amt_read; - - // TODO(pgk); we can do better on WH w/ tagging - noc_async_read_barrier(); - } - - // Third step - write from DB - scratch_write_addr = scratch_db_top[db_toggle]; - uint32_t amt_to_write = amt_read; - uint32_t npages = write_pages_to_dispatcher< - CQ_DISPATCH_CMD_SIZE, - true, - my_noc_xy, - my_dispatch_cb_sem_id, - dispatch_noc_xy, - dispatch_cb_base, - dispatch_cb_end, - dispatch_cb_page_size>(dispatch_data_ptr, scratch_write_addr, amt_to_write); - - uint32_t pad_to_page = dispatch_cb_page_size - (dispatch_data_ptr & (dispatch_cb_page_size - 1)); - dispatch_data_ptr += pad_to_page; - - // One page was acquired w/ the cmd in CMD_RELAY_INLINE_NOFLUSH - cb_release_pages(npages + 1); - - return CQ_PREFETCH_CMD_BARE_MIN_SIZE; -} - -template -uint32_t process_relay_linear_cmd(uint32_t cmd_ptr, - uint32_t& dispatch_data_ptr) { - - // This ensures that a previous cmd using the scratch buf has finished - noc_async_writes_flushed(); - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - uint32_t noc_xy_addr = cmd->relay_linear.noc_xy_addr; - uint32_t read_addr = cmd->relay_linear.addr; - uint32_t length = cmd->relay_linear.length; - uint32_t read_length = length; - - // First step - read into DB0 - uint32_t scratch_read_addr = scratch_db_top[0]; - uint32_t amt_to_read = (scratch_db_half_size > read_length) ? read_length : scratch_db_half_size; - uint64_t noc_addr = get_noc_addr_helper(noc_xy_addr, read_addr); - noc_async_read(noc_addr, scratch_read_addr, amt_to_read); - read_addr += amt_to_read; - noc_async_read_barrier(); - - // Second step - read into DB[x], write from DB[x], toggle x, iterate - // Writes are fast, reads are slow - uint32_t db_toggle = 0; - uint32_t scratch_write_addr; - read_length -= amt_to_read; - while (read_length != 0) { - // This ensures that writes from prior iteration are done - // TODO(pgk); we can do better on WH w/ tagging - noc_async_writes_flushed(); - - db_toggle ^= 1; - scratch_read_addr = scratch_db_top[db_toggle]; - scratch_write_addr = scratch_db_top[db_toggle ^ 1]; - - uint32_t amt_to_write = amt_to_read; - amt_to_read = (scratch_db_half_size > read_length) ? read_length : scratch_db_half_size; - noc_addr = get_noc_addr_helper(noc_xy_addr, read_addr); - noc_async_read(noc_addr, scratch_read_addr, amt_to_read); - read_addr += amt_to_read; - - // Third step - write from DB - uint32_t npages = write_pages_to_dispatcher< - 0, - false, - my_noc_xy, - my_dispatch_cb_sem_id, - dispatch_noc_xy, - dispatch_cb_base, - dispatch_cb_end, - dispatch_cb_page_size>(dispatch_data_ptr, scratch_write_addr, amt_to_write); - - cb_release_pages(npages); - - read_length -= amt_to_read; - - // TODO(pgk); we can do better on WH w/ tagging - noc_async_read_barrier(); - } - - // Third step - write from DB - scratch_write_addr = scratch_db_top[db_toggle]; - uint32_t amt_to_write = amt_to_read; - uint32_t npages = write_pages_to_dispatcher< - CQ_DISPATCH_CMD_SIZE, - true, - my_noc_xy, - my_dispatch_cb_sem_id, - dispatch_noc_xy, - dispatch_cb_base, - dispatch_cb_end, - dispatch_cb_page_size>(dispatch_data_ptr, scratch_write_addr, amt_to_write); - - uint32_t pad_to_page = dispatch_cb_page_size - (dispatch_data_ptr & (dispatch_cb_page_size - 1)); - dispatch_data_ptr += pad_to_page; - - // One page was acquired w/ the cmd in CMD_RELAY_INLINE_NOFLUSH - cb_release_pages(npages + 1); - - return CQ_PREFETCH_CMD_BARE_MIN_SIZE; -} - -template -uint32_t process_stall(uint32_t cmd_ptr) { - - static uint32_t count = 0; - - count++; - - DEBUG_STATUS("PSW"); - volatile tt_l1_ptr uint32_t* sem_addr = - reinterpret_cast(get_semaphore(dispatch_sync_sem_id)); - while (*sem_addr != count); - DEBUG_STATUS("PSD"); - - return CQ_PREFETCH_CMD_BARE_MIN_SIZE; -} - -template -bool process_cmd(uint32_t cmd_ptr, - uint32_t& downstream_data_ptr, - uint32_t& stride) { - - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; - bool done = false; - - switch (cmd->base.cmd_id) { - case CQ_PREFETCH_CMD_RELAY_LINEAR: - DPRINT << "relay linear: " << cmd_ptr << ENDL(); - stride = process_relay_linear_cmd< - my_noc_xy, - my_downstream_cb_sem_id, - downstream_noc_xy, - downstream_cb_sem_id, - downstream_cb_base, - downstream_cb_end, - downstream_cb_page_size, - scratch_db_half_size>(cmd_ptr, downstream_data_ptr); - break; - - case CQ_PREFETCH_CMD_RELAY_PAGED: - DPRINT << "relay dram page: " << cmd_ptr << ENDL(); - if (cmd->relay_paged.is_dram) { - stride = process_relay_paged_cmd< - true, - my_noc_xy, - my_downstream_cb_sem_id, - downstream_noc_xy, - downstream_cb_sem_id, - downstream_cb_base, - downstream_cb_end, - downstream_cb_page_size, - scratch_db_half_size>(cmd_ptr, downstream_data_ptr); - } else { - stride = process_relay_paged_cmd< - false, - my_noc_xy, - my_downstream_cb_sem_id, - downstream_noc_xy, - downstream_cb_sem_id, - downstream_cb_base, - downstream_cb_end, - downstream_cb_page_size, - scratch_db_half_size>(cmd_ptr, downstream_data_ptr); - } - break; - - case CQ_PREFETCH_CMD_RELAY_INLINE: - DPRINT << "inline" << ENDL(); - stride = process_relay_inline_cmd< - cmddat_wrap_enable, - my_noc_xy, - my_downstream_cb_sem_id, - downstream_noc_xy, - downstream_cb_sem_id, - cmddat_base, - cmddat_end, - downstream_cb_base, - downstream_cb_end, - downstream_cb_log_page_size, - downstream_cb_page_size>(cmd_ptr, downstream_data_ptr); - break; - - case CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH: - DPRINT << "inline no flush" << ENDL(); - stride = process_relay_inline_noflush_cmd< - my_noc_xy, - my_downstream_cb_sem_id, - downstream_noc_xy, - downstream_cb_base, - downstream_cb_end>(cmd_ptr, downstream_data_ptr); - break; - - case CQ_PREFETCH_CMD_STALL: - DPRINT << "stall" << ENDL(); - stride = process_stall(cmd_ptr); - break; - - case CQ_PREFETCH_CMD_DEBUG: - DPRINT << "debug" << ENDL(); - stride = process_debug_cmd(cmd_ptr); - break; - - case CQ_PREFETCH_CMD_TERMINATE: - DPRINT << "terminating\n"; - done = true; - break; - - default: - DPRINT << "prefetch invalid command:" << (uint32_t)cmd->base.cmd_id << " " << cmd_ptr << " " << ENDL(); - DPRINT << HEX() << *(uint32_t*)cmd_ptr << ENDL(); - DPRINT << HEX() << *((uint32_t*)cmd_ptr+1) << ENDL(); - DPRINT << HEX() << *((uint32_t*)cmd_ptr+2) << ENDL(); - DPRINT << HEX() << *((uint32_t*)cmd_ptr+3) << ENDL(); - DPRINT << HEX() << *((uint32_t*)cmd_ptr+4) << ENDL(); - DEBUG_STATUS("!CMD"); - ASSERT(0); - } - - return done; -}