#0: Resolve trace issues exposed by using eth dispatch
 - stall_state was set to STALLED even when the exec_buf command wrapped around in
   the cmddat_q, so read_from_pcie was not called until exec_buf was run ...
   except exec_buf was never run, because it was never fetched
 - Resolve this by only setting the STALLED flag when there are no pending
   reads over PCIe -> the fetch_q entry for the exec_buf command has been read
   in full (see the sketch below)
 - Add multi_device trace tests using ethernet dispatch
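
For context, here is a minimal, host-compilable sketch of the stall-gating pattern this change introduces. It is not the kernel code: CmddatQ, the 4 KB queue size, and fetch_step() are illustrative stand-ins (fetch_step plays the role of the fetch_q_get_cmds / barrier_and_stall path), while read_from_pcie returning pending_read_size and the STALL_NEXT -> STALLED gate mirror the diff to cq_prefetch.cpp below.

```cpp
// Hedged sketch: a simplified, host-compilable model of the stall-gating fix,
// NOT the kernel code. CmddatQ, the queue size, and fetch_step() are
// illustrative; only the control flow mirrors cq_prefetch.cpp.
#include <cstdint>
#include <cstdio>

enum StallState : uint32_t { NOT_STALLED = 0, STALL_NEXT = 1, STALLED = 2 };

struct CmddatQ {
    uint32_t base = 0;
    uint32_t size = 4096;
    uint32_t fence = 0;    // end of the data already read into the queue
    uint32_t cmd_ptr = 0;  // next command the prefetcher has yet to process
};

// Models read_from_pcie after the change: returns the number of bytes whose
// PCIe read was actually issued (0 when a queue wrap forces an early exit).
uint32_t read_from_pcie(CmddatQ& q, uint32_t fetch_size) {
    uint32_t pending_read_size = 0;
    if (q.fence + fetch_size > q.base + q.size) {
        if (q.cmd_ptr != q.fence) {
            // Unread commands still sit between cmd_ptr and fence, so the
            // fence cannot wrap and no read can be issued yet.
            return pending_read_size;
        }
        q.fence = q.base;
    }
    pending_read_size = fetch_size;  // the kernel issues the PCIe read here
    return pending_read_size;
}

// Models the fixed caller: only move STALL_NEXT -> STALLED once the read for
// the command that requested the stall (e.g. exec_buf) has really been issued.
void fetch_step(CmddatQ& q, uint32_t fetch_size, StallState& stall_state) {
    uint32_t pending_read_size = read_from_pcie(q, fetch_size);
    if (stall_state == STALL_NEXT && pending_read_size != 0) {
        stall_state = STALLED;  // safe: the stalling command is in flight
    }
    // Before the fix, STALLED could be entered even when pending_read_size was
    // 0, leaving the prefetcher waiting on an exec_buf that was never fetched.
}

int main() {
    CmddatQ q;
    q.fence = 4000;    // queue is nearly full
    q.cmd_ptr = 3500;  // unread commands remain, so a wrap is not allowed yet
    StallState stall_state = STALL_NEXT;

    fetch_step(q, 512, stall_state);  // wrap needed but blocked: no read issued
    std::printf("stall_state=%u (still STALL_NEXT)\n", (unsigned)stall_state);

    q.cmd_ptr = q.fence;              // everything consumed; the wrap is legal
    fetch_step(q, 512, stall_state);  // read issued, stall can take effect
    std::printf("stall_state=%u (now STALLED)\n", (unsigned)stall_state);
    return 0;
}
```

The design point is simply that read_from_pcie now reports whether it actually issued a read, so the prefetcher never enters STALLED while the exec_buf command is still unfetched behind a queue wrap.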
tt-asaigal committed Jun 15, 2024
1 parent d44016b commit 8abc730
Showing 4 changed files with 29 additions and 14 deletions.
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_frequent_tests.sh
@@ -92,6 +92,7 @@ run_t3000_trace_stress_tests() {
   echo "LOG_METAL: Running run_t3000_trace_stress_tests"
 
   NUM_TRACE_LOOPS=30 pytest tests/ttnn/unit_tests/test_multi_device_trace.py
+  NUM_TRACE_LOOPS=30 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
 
   # Record the end time
   end_time=$(date +%s)
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -28,6 +28,7 @@ run_t3000_ttnn_tests() {
 
   echo "LOG_METAL: Running run_t3000_ttnn_tests"
   pytest tests/ttnn/unit_tests/test_multi_device_trace.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
   pytest tests/ttnn/unit_tests/test_multi_device.py
   pytest tests/ttnn/unit_tests/test_multi_device_async.py
   # Record the end time
7 changes: 5 additions & 2 deletions tests/ttnn/unit_tests/test_multi_device_trace.py
@@ -108,11 +108,11 @@ def run_op_chain(input_0, input_1):
 
 @pytest.mark.parametrize(
     "shape",
-    [(1, 3, 1024, 1024), (1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)],
+    [(1, 1, 256, 256), (1, 3, 1024, 1024), (1, 1, 512, 512), (1, 1, 32, 32), (1, 3, 512, 512), (1, 3, 32, 32)],
 )
 @pytest.mark.parametrize("use_all_gather", [True, False])
 @pytest.mark.parametrize("enable_async", [True])
-@pytest.mark.parametrize("device_params", [{"trace_region_size": 99328}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 104448}], indirect=True)
 def test_multi_device_multi_trace(t3k_device_mesh, shape, use_all_gather, enable_async):
     torch.manual_seed(0)
     if t3k_device_mesh.get_num_devices() <= 1:
@@ -245,16 +245,19 @@ def run_op_chain_2(input_0, input_1, weight):
             assert_with_pcc(device_tensor_torch, torch_output_golden_2, pcc=0.96)
     else:
         # Perform host All-Gather
+        logger.info("Read Back Trace 0 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
         assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden, pcc=0.96)
 
+        logger.info("Read Back Trace 1 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor_1, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
         assert_with_pcc(ttnn_torch_output_tensor, torch_output_golden_1, pcc=0.96)
 
+        logger.info("Read Back Trace 2 Outputs")
         ttnn_torch_output_tensor = ttnn.to_torch(
             output_tensor_2, mesh_composer=ConcatMeshToTensor(t3k_device_mesh, dim=0), device=t3k_device_mesh
         )
Expand Down
34 changes: 22 additions & 12 deletions tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -126,19 +126,21 @@ void barrier_and_stall(uint32_t& pending_read_size, uint32_t& fence, uint32_t& c
 
 template<uint32_t preamble_size>
 FORCE_INLINE
-void read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_ptr,
-                    uint32_t& pending_read_size,
+uint32_t read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_ptr,
                     uint32_t& fence,
                     uint32_t& pcie_read_ptr,
                     uint32_t cmd_ptr,
                     uint32_t size) {
 
+    uint32_t pending_read_size = 0;
     // Wrap cmddat_q
     if (fence + size + preamble_size > cmddat_q_base + cmddat_q_size) {
         // only wrap if there are no commands ready, otherwise we'll leave some on the floor
         // TODO: does this matter for perf?
         if (cmd_ptr != fence) {
-            return;
+            // No pending reads, since the location of fence cannot be moved due to unread commands
+            // in the cmddat_q -> reads cannot be issued to fill the queue.
+            return pending_read_size;
         }
         fence = cmddat_q_base;
     }
@@ -165,6 +167,7 @@ void read_from_pcie(volatile tt_l1_ptr prefetch_q_entry_type *& prefetch_q_rd_pt
     if ((uint32_t)prefetch_q_rd_ptr == prefetch_q_end) {
         prefetch_q_rd_ptr = (volatile tt_l1_ptr prefetch_q_entry_type*)prefetch_q_base;
     }
+    return pending_read_size;
 }
 
 // This routine can be called in 8 states based on the boolean values cmd_ready, prefetch_q_ready, read_pending:
@@ -213,14 +216,15 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
         stall_state = static_cast<StallState>(stall_flag << 1); // NOT_STALLED -> STALL_NEXT if stall_flag is set
 
         if (fetch_size != 0 && pending_read_size == 0) {
-            read_from_pcie<preamble_size>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
-            if (stall_state == STALL_NEXT) {
-                // No pending reads. exec_buf is the first command being fetched and should be offset
+            pending_read_size = read_from_pcie<preamble_size>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            if (stall_state == STALL_NEXT && pending_read_size != 0) {
+                // No pending reads -> stall_state can be set to STALLED, since the read to the cmd
+                // that initiated the stall has been issued.
+                // exec_buf is the first command being fetched and should be offset
                 // by preamble size. After ensuring that the exec_buf command has been read (barrier),
                 // exit.
                 barrier_and_stall(pending_read_size, fence, cmd_ptr); // STALL_NEXT -> STALLED
                 return;
             }
         }
         if (!cmd_ready) {
@@ -248,12 +252,18 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
             // If the prefetcher state reached here, it is issuing a read to the same "slot", since for exec_buf commands
             // we will insert a read barrier. Hence, the exec_buf command will be concatenated to a previous command, and
             // should not be offset by preamble size.
-            read_from_pcie<0>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            pending_read_size = read_from_pcie<0>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            if (pending_read_size == 0) {
+                // read_from_pcie exited early due to a wrap, i.e. the exec_buf cmd is at a wrapped location and a read
+                // to it could not be issued, since there are existing commands in the cmddat_q.
+                // Only move the stall_state to STALLED once the read to the cmd that initiated the stall has been issued.
+                return;
+            }
             barrier_and_stall(pending_read_size, fence, cmd_ptr); // STALL_NEXT -> STALLED
         } else {
-            read_from_pcie<preamble_size>
-                (prefetch_q_rd_ptr, pending_read_size, fence, pcie_read_ptr, cmd_ptr, fetch_size);
+            pending_read_size = read_from_pcie<preamble_size>
+                (prefetch_q_rd_ptr, fence, pcie_read_ptr, cmd_ptr, fetch_size);
         }
     }
 } else {