#0: Resolve trace issues exposed by using eth dispatch
 - stall_state was set to STALLED even when the exec_buf command wrapped around in
   the cmddat_q, so read_from_pcie was not called until exec_buf was run ...
   except exec_buf was never run, because it was never fetched
 - Resolve this by only setting the STALLED flag when there are no pending
   reads over PCIe -> the fetch_q entry for the exec_buf command has been read
   in full (see the sketch below)
 - Add multi_device trace tests using ethernet dispatch
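
For context, here is a minimal, host-compilable sketch of the stall-gating pattern this change introduces. It is not the kernel code: CmddatQ, the 4 KB queue size, and fetch_step() are illustrative stand-ins (fetch_step plays the role of the fetch_q_get_cmds / barrier_and_stall path), while read_from_pcie returning pending_read_size and the STALL_NEXT -> STALLED gate mirror the diff to cq_prefetch.cpp below.

```cpp
// Hedged sketch: a simplified, host-compilable model of the stall-gating fix,
// NOT the kernel code. CmddatQ, the queue size, and fetch_step() are
// illustrative; only the control flow mirrors cq_prefetch.cpp.
#include <cstdint>
#include <cstdio>

enum StallState : uint32_t { NOT_STALLED = 0, STALL_NEXT = 1, STALLED = 2 };

struct CmddatQ {
    uint32_t base = 0;
    uint32_t size = 4096;
    uint32_t fence = 0;    // end of the data already read into the queue
    uint32_t cmd_ptr = 0;  // next command the prefetcher has yet to process
};

// Models read_from_pcie after the change: returns the number of bytes whose
// PCIe read was actually issued (0 when a queue wrap forces an early exit).
uint32_t read_from_pcie(CmddatQ& q, uint32_t fetch_size) {
    uint32_t pending_read_size = 0;
    if (q.fence + fetch_size > q.base + q.size) {
        if (q.cmd_ptr != q.fence) {
            // Unread commands still sit between cmd_ptr and fence, so the
            // fence cannot wrap and no read can be issued yet.
            return pending_read_size;
        }
        q.fence = q.base;
    }
    pending_read_size = fetch_size;  // the kernel issues the PCIe read here
    return pending_read_size;
}

// Models the fixed caller: only move STALL_NEXT -> STALLED once the read for
// the command that requested the stall (e.g. exec_buf) has really been issued.
void fetch_step(CmddatQ& q, uint32_t fetch_size, StallState& stall_state) {
    uint32_t pending_read_size = read_from_pcie(q, fetch_size);
    if (stall_state == STALL_NEXT && pending_read_size != 0) {
        stall_state = STALLED;  // safe: the stalling command is in flight
    }
    // Before the fix, STALLED could be entered even when pending_read_size was
    // 0, leaving the prefetcher waiting on an exec_buf that was never fetched.
}

int main() {
    CmddatQ q;
    q.fence = 4000;    // queue is nearly full
    q.cmd_ptr = 3500;  // unread commands remain, so a wrap is not allowed yet
    StallState stall_state = STALL_NEXT;

    fetch_step(q, 512, stall_state);  // wrap needed but blocked: no read issued
    std::printf("stall_state=%u (still STALL_NEXT)\n", (unsigned)stall_state);

    q.cmd_ptr = q.fence;              // everything consumed; the wrap is legal
    fetch_step(q, 512, stall_state);  // read issued, stall can take effect
    std::printf("stall_state=%u (now STALLED)\n", (unsigned)stall_state);
    return 0;
}
```

The design point is simply that read_from_pcie now reports whether it actually issued a read, so the prefetcher never enters STALLED while the exec_buf command is still unfetched behind a queue wrap.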
tt-asaigal committed Jun 15, 2024
1 parent d44016b commit 8abc730
Showing 4 changed files with 29 additions and 14 deletions.
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_frequent_tests.sh
@@ -92,6 +92,7 @@ run_t3000_trace_stress_tests() {
   echo "LOG_METAL: Running run_t3000_trace_stress_tests"
 
   NUM_TRACE_LOOPS=30 pytest tests/ttnn/unit_tests/test_multi_device_trace.py
+  NUM_TRACE_LOOPS=30 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
 
   # Record the end time
   end_time=$(date +%s)
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -28,6 +28,7 @@ run_t3000_ttnn_tests() {
 
   echo "LOG_METAL: Running run_t3000_ttnn_tests"
   pytest tests/ttnn/unit_tests/test_multi_device_trace.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
   pytest tests/ttnn/unit_tests/test_multi_device.py
   pytest tests/ttnn/unit_tests/test_multi_device_async.py
   # Record the end time
7 changes: 5 additions & 2 deletions tests/ttnn/unit_tests/test_multi_device_trace.py
@@ -108,11 +108,11 @@ def run_op_chain(input_0, input_1):
 
 @pytest.mark.parametrize(
     "shape",
-    [(1, 3, 1024, 1024), (1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)],
+    [(1, 1, 256, 256), (1, 3, 1024, 1024), (1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)],
 )
 @pytest.mark.parametrize("use_all_gather", [True, False])
 @pytest.mark.parametrize("enable_async", [True])
-@pytest.mark.parametrize("device_params", [{"trace_region_size": 99328}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 104448}], indirect=True)
 def test_multi_device_multi_trace(t3k_device_mesh, shape, use_all_gather, enable_async):
     torch.manual_seed(0)
     if t3k_device_mesh.get_num_devices() <= 1:
@@ -245,16 +245,19 @@ def run_op_chain_2(input_0, input_1, weight):
             assert_with_pcc(device_tensor_torch, torch_output_golden_2, pcc=0.96)
     else:
         # Perform host All-Gather
+        logger.info("Read Back Trace 0 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
         assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.96)
 
+        logger.info("Read Back Trace 1 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor_1, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
         assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden_1, pcc=0.96)
 
+        logger.info("Read Back Trace 2 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor_2, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
Expand Down
34 changes: 22 additions & 12 deletions tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -126,19 +126,21 @@ void barrier_and_stall(uint32_t& pending_read_size, uint32_t& fence, uint32_t& c
 
 template<uint32_t preamble_size>
 FORCE_INLINE
-void read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_ptr,
-                    uint32_t& pending_read_size,
+uint32_t read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_ptr,
                     uint32_t& fence,
                     uint32_t& pcie_read_ptr,
                     uint32_t cmd_ptr,
                     uint32_t size) {
 
+    uint32_t pending_read_size = 0;
     // Wrap cmddat_q
     if (fence + size + preamble_size > cmddat_q_base + cmddat_q_size) {
         // only wrap if there are no commands ready, otherwise we'll leave some on the floor
         // TODO: does this matter for perf?
         if (cmd_ptr != fence) {
-            return;
+            // No pending reads, since the location of fence cannot be moved due to unread commands
+            // in the cmddat_q -> reads cannot be issued to fill the queue.
+            return pending_read_size;
         }
         fence = cmddat_q_base;
     }
@@ -165,6 +167,7 @@ void read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_pt
     if ((uint32_t)prefetch_q_rd_ptr == prefetch_q_end) {
         prefetch_q_rd_ptr = (volatile tt_l1_ptr prefetch_q_entry_type*)prefetch_q_base;
     }
+    return pending_read_size;
 }
 
 // This routine can be called in 8 states based on the boolean values cmd_ready, prefetch_q_ready, read_pending:
@@ -213,14 +216,15 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
         stall_state = static_cast<StallState>(stall_flag << 1); // NOT_STALLED -> STALL_NEXT if stall_flag is set
 
         if (fetch_size != 0 && pending_read_size == 0) {
-            read_from_pcie<preamble_size>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
-            if (stall_state == STALL_NEXT) {
-                // No pending reads. exec_buf is the first command being fetched and should be offset
+            pending_read_size = read_from_pcie<preamble_size>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            if (stall_state == STALL_NEXT && pending_read_size != 0) {
+                // No pending reads -> stall_state can be set to STALLED, since the read to the cmd
+                // that initiated the stall has been issued.
+                // exec_buf is the first command being fetched and should be offset
                 // by preamble size. After ensuring that the exec_buf command has been read (barrier),
                 // exit.
                 barrier_and_stall(pending_read_size, fence, cmd_ptr); // STALL_NEXT -> STALLED
                 return;
             }
         }
         if (!cmd_ready) {
@@ -248,12 +252,18 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
             // If the prefetcher state reached here, it is issuing a read to the same "slot", since for exec_buf commands
             // we will insert a read barrier. Hence, the exec_buf command will be concatenated to a previous command, and
             // should not be offset by preamble size.
-            read_from_pcie<0>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            pending_read_size = read_from_pcie<0>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            if (pending_read_size == 0) {
+                // read_from_pcie exited early due to a wrap, i.e. the exec_buf cmd is at a wrapped location and a read
+                // to it could not be issued, since there are existing commands in the cmddat_q.
+                // Only move the stall_state to STALLED once the read to the cmd that initiated the stall has been issued.
+                return;
+            }
             barrier_and_stall(pending_read_size, fence, cmd_ptr); // STALL_NEXT -> STALLED
         } else {
-            read_from_pcie<preamble_size>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            pending_read_size = read_from_pcie<preamble_size>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
         }
     }
 } else {