From 8402e526c5d5049204ff740e875c4dfc17e6c391 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Thu, 19 Oct 2023 19:15:21 +0200
Subject: [PATCH] Fixed error when an accessor from an RTL tasklet is a stream
 (#1403)

* Copyright bump
* Ensured all RTL samples' comments are of a consistent style and mention which target mode they're intended for.
* Added a comment about the temporal vectorization hardware test stalling in 2022.1.
---
 dace/codegen/targets/rtl.py               |  17 ++-
 samples/fpga/rtl/add_fortytwo.py          |  39 +++---
 samples/fpga/rtl/axpy.py                  |  13 +-
 samples/fpga/rtl/axpy_double_pump.py      | 143 +++++++++++-----------
 samples/fpga/rtl/fladd.py                 |  17 +--
 samples/fpga/rtl/pipeline.py              |  41 ++++---
 samples/fpga/rtl/rtl_multi_tasklet.py     |  44 +++----
 samples/fpga/rtl/rtl_tasklet_parameter.py |  36 +++---
 samples/fpga/rtl/rtl_tasklet_pipeline.py  |  36 +++---
 samples/fpga/rtl/rtl_tasklet_scalar.py    |  30 ++---
 samples/fpga/rtl/rtl_tasklet_vector.py    |  40 +++---
 tests/rtl/hardware_test.py                |  22 ++--
 tests/rtl/simulation_test.py              |   6 +-
 13 files changed, 256 insertions(+), 228 deletions(-)

diff --git a/dace/codegen/targets/rtl.py b/dace/codegen/targets/rtl.py
index dcb752e215..935615fad6 100644
--- a/dace/codegen/targets/rtl.py
+++ b/dace/codegen/targets/rtl.py
@@ -1,8 +1,8 @@
 # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
 
 import itertools
-
 from typing import List, Tuple, Dict
+import warnings
 
 from dace import dtypes, config, registry, symbolic, nodes, sdfg, data
 from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode
@@ -102,6 +102,21 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i
                 elif isinstance(arr, data.Scalar):
                     line: str = "{} {} = {};".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                                                      edge.src.data)
+                elif isinstance(arr, data.Stream):
+                    # TODO Streams are currently unsupported, as the proper
+                    # behaviour has to be implemented to avoid deadlocking. It
+                    # is only a warning, as the RTL backend is partially used
+                    # by the Xilinx backend, which may hit this case, but will
+                    # discard the erroneous code.
+                    warnings.warn(
+                        'Streams are currently unsupported by the RTL backend. '
+                        'This may produce errors or deadlocks in the generated code.'
+                    )
+                    line: str = "// WARNING: Unsupported read from ({}) variable '{}' from stream '{}'." \
+                        " This may lead to a deadlock if used in code.\n".format(
+                            dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data)
+                    line += "{} {} = {}.pop();".format(
+                            dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data)
         elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet):
             rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
             self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants)
diff --git a/samples/fpga/rtl/add_fortytwo.py b/samples/fpga/rtl/add_fortytwo.py
index 9c14ad098b..5abcd76a5b 100644
--- a/samples/fpga/rtl/add_fortytwo.py
+++ b/samples/fpga/rtl/add_fortytwo.py
@@ -1,8 +1,9 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows adding a constant integer value to a stream of integers.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows adding a constant integer value to a stream of integers.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -116,21 +117,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
+        # init data structures
+        N.set(8192)
+        a = np.random.randint(0, 100, N.get()).astype(np.int32)
+        b = np.zeros((N.get(), )).astype(np.int32)
 
-    # init data structures
-    N.set(8192)
-    a = np.random.randint(0, 100, N.get()).astype(np.int32)
-    b = np.zeros((N.get(), )).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b, N=N)
+        # call program
+        sdfg(A=a, B=b, N=N)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    for i in range(N.get()):
-        assert b[i] == a[i] + 42
+        # check result
+        for i in range(N.get()):
+            assert b[i] == a[i] + 42
diff --git a/samples/fpga/rtl/axpy.py b/samples/fpga/rtl/axpy.py
index 8b720aaa1e..4f386c82a4 100644
--- a/samples/fpga/rtl/axpy.py
+++ b/samples/fpga/rtl/axpy.py
@@ -1,7 +1,10 @@
-# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
-# operations. It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
+    operations.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -259,4 +262,4 @@ def make_sdfg(veclen=2):
         expected = a * x + y
         diff = np.linalg.norm(expected - result) / N.get()
         print("Difference:", diff)
-    exit(0 if diff <= 1e-5 else 1)
+        assert diff <= 1e-5
diff --git a/samples/fpga/rtl/axpy_double_pump.py b/samples/fpga/rtl/axpy_double_pump.py
index 2d44ab7689..c79948007b 100644
--- a/samples/fpga/rtl/axpy_double_pump.py
+++ b/samples/fpga/rtl/axpy_double_pump.py
@@ -1,73 +1,74 @@
-# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows the AXPY BLAS routine. It is implemented through Xilinx
-# IPs in order to utilize double pumping, which doubles the performance per
-# consumed FPGA resource. The double pumping operation is "inwards", which
-# means that the internal vectorization width of the core computation is half
-# that of the external vectorization width. This translates into utilizing half
-# the amount of internal computing resources, compared to a regular vectorized
-# implementetation. The block diagram of the design for a 32-bit floating-point
-# implementation using vectorization width 2 is:
-#
-#          ap_aclk          s_axis_y_in        s_axis_x_in     a
-#             │                  │                  │          │
-#             │                  │                  │          │
-#             │                  │                  │          │
-#     ┌───────┼─────────┬────────┼─────────┐        │          │
-#     │       │         │        │         │        │          │
-#     │       │         │        ▼         │        ▼          │
-#     │       │         │  ┌────────────┐  │  ┌────────────┐   │
-#     │       │         └─►│            │  └─►│            │   │
-#     │       │            │ Clock sync │     │ Clock sync │   │
-#     │       │         ┌─►│            │  ┌─►│            │   │
-#     │       ▼ 300 MHz │  └─────┬──────┘  │  └─────┬──────┘   │
-#     │ ┌────────────┐  │        │         │        │          │
-#     │ │ Clock      │  │        │         │        │          │
-#     │ │            │  ├────────┼─────────┤        │          │
-#     │ │ Multiplier │  │        │         │        │          │
-#     │ └─────┬──────┘  │        ▼ 64 bit  │        ▼ 64 bit   │
-#     │       │ 600 MHz │  ┌────────────┐  │  ┌────────────┐   │
-#     │       │         │  │            │  │  │            │   │
-#     │       └─────────┼─►│ Data issue │  └─►│ Data issue │   │
-#     │                 │  │            │     │            │   │
-#     │                 │  └─────┬──────┘     └─────┬──────┘   │
-#     │                 │        │ 32 bit           │ 32 bit   │
-#     │                 │        │                  │          │
-#     │                 │        │                  │          │
-#     │                 │        │                  ▼          ▼
-#     │                 │        │                 ┌────────────┐
-#     │                 │        │                 │            │
-#     │                 ├────────┼────────────────►│ Multiplier │
-#     │                 │        │                 │            │
-#     │                 │        │                 └─────┬──────┘
-#     │                 │        │                       │
-#     │                 │        │        ┌──────────────┘
-#     │                 │        │        │
-#     │                 │        ▼        ▼
-#     │                 │      ┌────────────┐
-#     │                 │      │            │
-#     │                 ├─────►│    Adder   │
-#     │                 │      │            │
-#     │                 │      └─────┬──────┘
-#     │                 │            │
-#     │                 │            ▼ 32 bit
-#     │                 │      ┌─────────────┐
-#     │                 │      │             │
-#     │                 ├─────►│ Data packer │
-#     │                 │      │             │
-#     │                 │      └─────┬───────┘
-#     │                 │            │ 64 bit
-#     │                 │            ▼
-#     │                 │      ┌────────────┐
-#     │                 └─────►│            │
-#     │                        │ Clock sync │
-#     └───────────────────────►│            │
-#                              └─────┬──────┘
-#                                    │
-#                                    ▼
-#                            m_axis_result_out
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows the AXPY BLAS routine. It is implemented through Xilinx
+    IPs in order to utilize double pumping, which doubles the performance per
+    consumed FPGA resource. The double pumping operation is "inwards", which
+    means that the internal vectorization width of the core computation is half
+    that of the external vectorization width. This translates into utilizing half
+    the amount of internal computing resources, compared to a regular vectorized
+    implementation. The block diagram of the design for a 32-bit floating-point
+    implementation using vectorization width 2 is:
+
+             ap_aclk          s_axis_y_in        s_axis_x_in     a
+                │                  │                  │          │
+                │                  │                  │          │
+                │                  │                  │          │
+        ┌───────┼─────────┬────────┼─────────┐        │          │
+        │       │         │        │         │        │          │
+        │       │         │        ▼         │        ▼          │
+        │       │         │  ┌────────────┐  │  ┌────────────┐   │
+        │       │         └─►│            │  └─►│            │   │
+        │       │            │ Clock sync │     │ Clock sync │   │
+        │       │         ┌─►│            │  ┌─►│            │   │
+        │       ▼ 300 MHz │  └─────┬──────┘  │  └─────┬──────┘   │
+        │ ┌────────────┐  │        │         │        │          │
+        │ │ Clock      │  │        │         │        │          │
+        │ │            │  ├────────┼─────────┤        │          │
+        │ │ Multiplier │  │        │         │        │          │
+        │ └─────┬──────┘  │        ▼ 64 bit  │        ▼ 64 bit   │
+        │       │ 600 MHz │  ┌────────────┐  │  ┌────────────┐   │
+        │       │         │  │            │  │  │            │   │
+        │       └─────────┼─►│ Data issue │  └─►│ Data issue │   │
+        │                 │  │            │     │            │   │
+        │                 │  └─────┬──────┘     └─────┬──────┘   │
+        │                 │        │ 32 bit           │ 32 bit   │
+        │                 │        │                  │          │
+        │                 │        │                  │          │
+        │                 │        │                  ▼          ▼
+        │                 │        │                 ┌────────────┐
+        │                 │        │                 │            │
+        │                 ├────────┼────────────────►│ Multiplier │
+        │                 │        │                 │            │
+        │                 │        │                 └─────┬──────┘
+        │                 │        │                       │
+        │                 │        │        ┌──────────────┘
+        │                 │        │        │
+        │                 │        ▼        ▼
+        │                 │      ┌────────────┐
+        │                 │      │            │
+        │                 ├─────►│    Adder   │
+        │                 │      │            │
+        │                 │      └─────┬──────┘
+        │                 │            │
+        │                 │            ▼ 32 bit
+        │                 │      ┌─────────────┐
+        │                 │      │             │
+        │                 ├─────►│ Data packer │
+        │                 │      │             │
+        │                 │      └─────┬───────┘
+        │                 │            │ 64 bit
+        │                 │            ▼
+        │                 │      ┌────────────┐
+        │                 └─────►│            │
+        │                        │ Clock sync │
+        └───────────────────────►│            │
+                                 └─────┬──────┘
+                                       │
+                                       ▼
+                               m_axis_result_out
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -452,4 +453,4 @@ def make_sdfg(veclen=2):
             diff = np.linalg.norm(expected - result) / N.get()
             print("Difference:", diff)
 
-    exit(0 if diff <= 1e-5 else 1)
+            assert diff <= 1e-5
diff --git a/samples/fpga/rtl/fladd.py b/samples/fpga/rtl/fladd.py
index f22d419cbc..daf1ed269b 100644
--- a/samples/fpga/rtl/fladd.py
+++ b/samples/fpga/rtl/fladd.py
@@ -1,10 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows how to utilize an IP core in an RTL tasklet. This is done
-# through the vector add problem, which adds two floating point vectors
-# together.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows how to utilize an IP core in an RTL tasklet. This is done
+    through the vector add problem, which adds two floating point vectors
+    together.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -190,4 +191,4 @@
         expected = a + b
         diff = np.linalg.norm(expected - c) / N.get()
         print("Difference:", diff)
-    exit(0 if diff <= 1e-5 else 1)
+        assert diff <= 1e-5
diff --git a/samples/fpga/rtl/pipeline.py b/samples/fpga/rtl/pipeline.py
index b487da91ce..dbd0460fb0 100644
--- a/samples/fpga/rtl/pipeline.py
+++ b/samples/fpga/rtl/pipeline.py
@@ -1,9 +1,10 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
-# integer input stream.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
+    integer input stream.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -151,21 +152,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
+        # init data structures
+        N.set(8192)
+        a = np.random.randint(0, 100, N.get()).astype(np.int32)
+        b = np.zeros((N.get(), )).astype(np.int32)
 
-    # init data structures
-    N.set(8192)
-    a = np.random.randint(0, 100, N.get()).astype(np.int32)
-    b = np.zeros((N.get(), )).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b, N=N)
+        # call program
+        sdfg(A=a, B=b, N=N)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    for i in range(N.get()):
-        assert b[i] == a[i] + depth
+        # check result
+        for i in range(N.get()):
+            assert b[i] == a[i] + depth
diff --git a/samples/fpga/rtl/rtl_multi_tasklet.py b/samples/fpga/rtl/rtl_multi_tasklet.py
index a646eb6be9..4a4a09deec 100644
--- a/samples/fpga/rtl/rtl_multi_tasklet.py
+++ b/samples/fpga/rtl/rtl_multi_tasklet.py
@@ -1,11 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 """
     Two sequential RTL tasklets connected through a memlet.
+
+    It is intended for running simulation xilinx targets.
 """
 
 import dace
-import argparse
-
 import numpy as np
 
 # add sdfg
@@ -32,7 +32,7 @@
         m_axis_b_tdata <= 0;
         s_axis_a_tready <= 1'b1;
         state <= READY;
-    end else if (s_axis_a_tvalid && state == READY) begin // case: load a 
+    end else if (s_axis_a_tvalid && state == READY) begin // case: load a
         m_axis_b_tdata <= s_axis_a_tdata;
         s_axis_a_tready <= 1'b0;
         state <= BUSY;
@@ -41,7 +41,7 @@
     else
         m_axis_b_tdata <= m_axis_b_tdata;
         state <= DONE;
-end    
+end
 
 assign m_axis_b_tvalid = (m_axis_b_tdata >= 80) ? 1'b1:1'b0;
 """,
@@ -59,7 +59,7 @@
         m_axis_c_tdata <= 0;
         s_axis_b_tready <= 1'b1;
         state <= READY;
-    end else if (s_axis_b_tvalid && state == READY) begin // case: load a 
+    end else if (s_axis_b_tvalid && state == READY) begin // case: load a
         m_axis_c_tdata <= s_axis_b_tdata;
         s_axis_b_tready <= 1'b0;
         state <= BUSY;
@@ -68,9 +68,9 @@
     else
         m_axis_c_tdata <= m_axis_c_tdata;
         state <= DONE;
-end    
+end
 
-assign m_axis_c_tvalid = (m_axis_c_tdata >= 100) ? 1'b1:1'b0;   
+assign m_axis_c_tvalid = (m_axis_c_tdata >= 100) ? 1'b1:1'b0;
 """,
                              language=dace.Language.SystemVerilog)
 
@@ -92,21 +92,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'):
+        # init data structures
+        a = np.random.randint(0, 80, 1).astype(np.int32)
+        b = np.array([0]).astype(np.int32)
+        c = np.array([0]).astype(np.int32)
 
-    # init data structures
-    a = np.random.randint(0, 80, 1).astype(np.int32)
-    b = np.array([0]).astype(np.int32)
-    c = np.array([0]).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}, c={}".format(a, b, c))
+        # show initial values
+        print("a={}, b={}, c={}".format(a, b, c))
 
-    # call program
-    sdfg(A=a, B=b, C=c)
+        # call program
+        sdfg(A=a, B=b, C=c)
 
-    # show result
-    print("a={}, b={}, c={}".format(a, b, c))
+        # show result
+        print("a={}, b={}, c={}".format(a, b, c))
 
-    # check result
-    assert b == 80
-    assert c == 100
+        # check result
+        assert b == 80
+        assert c == 100
diff --git a/samples/fpga/rtl/rtl_tasklet_parameter.py b/samples/fpga/rtl/rtl_tasklet_parameter.py
index d20688b385..112e88a6bf 100644
--- a/samples/fpga/rtl/rtl_tasklet_parameter.py
+++ b/samples/fpga/rtl/rtl_tasklet_parameter.py
@@ -1,11 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 """
     Simple RTL tasklet with a single scalar input and a single scalar output. It increments b from a up to 100.
+
+    It is intended for running simulation xilinx targets.
 """
 
 import dace
-import argparse
-
 import numpy as np
 
 # add sdfg
@@ -47,7 +47,7 @@
             m_axis_b_tdata <= 0;
             s_axis_a_tready <= 1'b1;
             state <= READY;
-        end else if (s_axis_a_tvalid && state == READY) begin // case: load a 
+        end else if (s_axis_a_tvalid && state == READY) begin // case: load a
             m_axis_b_tdata <= s_axis_a_tdata;
             s_axis_a_tready <= 1'b0;
             state <= BUSY;
@@ -56,9 +56,9 @@
         else
             m_axis_b_tdata <= m_axis_b_tdata;
             state <= DONE;
-    end    
+    end
 
-    assign m_axis_b_tvalid  = (m_axis_b_tdata >= MAX_VAL) ? 1'b1:1'b0;  
+    assign m_axis_b_tvalid  = (m_axis_b_tdata >= MAX_VAL) ? 1'b1:1'b0;
     ''',
                             language=dace.Language.SystemVerilog)
 
@@ -76,19 +76,19 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'):
+        # init data structures
+        a = np.random.randint(0, 100, 1).astype(np.int32)
+        b = np.array([0]).astype(np.int32)
 
-    # init data structures
-    a = np.random.randint(0, 100, 1).astype(np.int32)
-    b = np.array([0]).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b)
+        # call program
+        sdfg(A=a, B=b)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    assert b == sdfg.constants["MAX_VAL"]
+        # check result
+        assert b == sdfg.constants["MAX_VAL"]
diff --git a/samples/fpga/rtl/rtl_tasklet_pipeline.py b/samples/fpga/rtl/rtl_tasklet_pipeline.py
index 9166806c63..3ef20cd03f 100644
--- a/samples/fpga/rtl/rtl_tasklet_pipeline.py
+++ b/samples/fpga/rtl/rtl_tasklet_pipeline.py
@@ -1,11 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 """
     Pipelined, AXI-handshake compliant example that increments b from a up to 100.
+
+    It is intended for running simulation xilinx targets.
 """
 
 import dace
-import argparse
-
 import numpy as np
 
 # add symbol
@@ -59,7 +59,7 @@
             state <= state_next;
     end
 
-    always_comb 
+    always_comb
     begin
         state_next = state;
         case(state)
@@ -132,21 +132,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'):
+        # init data structures
+        num_elements = dace.symbolic.evaluate(N, sdfg.constants)
+        a = np.random.randint(0, 100, num_elements).astype(np.int32)
+        b = np.array([0] * num_elements).astype(np.int32)
 
-    # init data structures
-    num_elements = dace.symbolic.evaluate(N, sdfg.constants)
-    a = np.random.randint(0, 100, num_elements).astype(np.int32)
-    b = np.array([0] * num_elements).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b)
+        # call program
+        sdfg(A=a, B=b)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    assert b[
-        0] == 100  # TODO: implement detection of #elements to process, s.t. we can extend the assertion to the whole array
-    assert np.all(map((lambda x: x == 0), b[1:-1]))  # should still be at the init value (for the moment)
+        assert b[
+            0] == 100  # TODO: implement detection of #elements to process, s.t. we can extend the assertion to the whole array
+        assert np.all(map((lambda x: x == 0), b[1:-1]))  # should still be at the init value (for the moment)
diff --git a/samples/fpga/rtl/rtl_tasklet_scalar.py b/samples/fpga/rtl/rtl_tasklet_scalar.py
index c9f6380a2b..cf8d53ec91 100644
--- a/samples/fpga/rtl/rtl_tasklet_scalar.py
+++ b/samples/fpga/rtl/rtl_tasklet_scalar.py
@@ -1,11 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 """
     Simple RTL tasklet with a single scalar input and a single scalar output. It increments b from a up to 100.
+
+    It is intended for running simulation xilinx targets.
 """
 
 import dace
-import argparse
-
 import numpy as np
 
 # add sdfg
@@ -79,19 +79,19 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'):
+        # init data structures
+        a = np.random.randint(0, 100, 1).astype(np.int32)
+        b = np.array([0]).astype(np.int32)
 
-    # init data structures
-    a = np.random.randint(0, 100, 1).astype(np.int32)
-    b = np.array([0]).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b)
+        # call program
+        sdfg(A=a, B=b)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    assert b == 100
+        # check result
+        assert b == 100
diff --git a/samples/fpga/rtl/rtl_tasklet_vector.py b/samples/fpga/rtl/rtl_tasklet_vector.py
index c099a6a38d..9015b4f35e 100644
--- a/samples/fpga/rtl/rtl_tasklet_vector.py
+++ b/samples/fpga/rtl/rtl_tasklet_vector.py
@@ -1,11 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
 """
     RTL tasklet with a vector input of 4 int32 (width=128bits) and a single scalar output. It increments b from a[31:0] up to 100.
+
+    It is intended for running simulation xilinx targets.
 """
 
 import dace
-import argparse
-
 import numpy as np
 
 # add symbol
@@ -44,13 +44,13 @@
 
         typedef enum [1:0] {READY, BUSY, DONE} state_e;
         state_e state;
-    
+
         always@(posedge ap_aclk) begin
             if (ap_areset) begin // case: reset
                 m_axis_b_tdata <= 0;
                 s_axis_a_tready <= 1'b1;
                 state <= READY;
-            end else if (s_axis_a_tvalid && state == READY) begin // case: load a 
+            end else if (s_axis_a_tvalid && state == READY) begin // case: load a
                 m_axis_b_tdata <= s_axis_a_tdata[0];
                 s_axis_a_tready <= 1'b0;
                 state <= BUSY;
@@ -60,9 +60,9 @@
                 m_axis_b_tdata <= m_axis_b_tdata;
                 state <= DONE;
             end
-        end    
-    
-        assign m_axis_b_tvalid = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0; 
+        end
+
+        assign m_axis_b_tvalid = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0;
     ''',
                             language=dace.Language.SystemVerilog)
 
@@ -80,19 +80,19 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'):
+        # init data structures
+        a = np.random.randint(0, 100, dace.symbolic.evaluate(WIDTH, sdfg.constants)).astype(np.int32)
+        b = np.array([0]).astype(np.int32)
 
-    # init data structures
-    a = np.random.randint(0, 100, dace.symbolic.evaluate(WIDTH, sdfg.constants)).astype(np.int32)
-    b = np.array([0]).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b)
+        # call program
+        sdfg(A=a, B=b)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    assert b == a[0] + a[1]
+        # check result
+        assert b == a[0] + a[1]
diff --git a/tests/rtl/hardware_test.py b/tests/rtl/hardware_test.py
index 821688f481..727dc7362b 100644
--- a/tests/rtl/hardware_test.py
+++ b/tests/rtl/hardware_test.py
@@ -1,4 +1,7 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    Test suite for testing RTL integration with DaCe targeting Xilinx FPGAs.
+"""
 import dace
 from dace.fpga_testing import rtl_test
 import numpy as np
@@ -13,7 +16,7 @@
 def make_vadd_sdfg(N: dace.symbol, veclen: int = 8):
     '''
     Function for generating a simple vector addition SDFG that adds a vector `A` of `N` elements to a scalar `B` into a vector `C` of `N` elements, all using SystemVerilog.
-    The tasklet creates `veclen` instances of a floating point adder that operates on `N` elements. 
+    The tasklet creates `veclen` instances of a floating point adder that operates on `N` elements.
 
     :param N: The number of elements the SDFG takes as input and output.
     :param veclen: The number of floating point adders to instantiate.
@@ -197,7 +200,7 @@ def make_vadd_multi_sdfg(N, M):
 
     :param N: The number of elements to compute on.
     :param M: The number of compute PEs to initialize.
-    :return: An SDFG that has arguments `A` and `B`. 
+    :return: An SDFG that has arguments `A` and `B`.
     '''
     # add sdfg
     sdfg = dace.SDFG(f'integer_vector_plus_42_multiple_kernels_{N.get() // M.get()}')
@@ -321,7 +324,7 @@ def make_vadd_multi_sdfg(N, M):
 @rtl_test()
 def test_hardware_vadd():
     '''
-    Test for the simple vector addition. 
+    Test for the simple vector addition.
     '''
 
     # add symbol
@@ -346,7 +349,7 @@ def test_hardware_vadd():
 @rtl_test()
 def test_hardware_add42_single():
     '''
-    Test for adding a constant using a single PE. 
+    Test for adding a constant using a single PE.
     '''
     N = dace.symbol('N')
     M = dace.symbol('M')
@@ -428,10 +431,11 @@ def test_hardware_vadd_temporal_vectorization():
     '''
     Tests whether the multi-pumping optimization can be applied automatically by applying the temporal vectorization transformation. It starts from a numpy vector addition for generating the SDFG. This SDFG is then optimized by applying the vectorization, streaming memory, fpga and temporal vectorization transformations in that order.
     '''
-    # TODO !!!!! THIS TEST STALLS IN HARDWARE EMULATION WITH VITIS 2021.2 !!!!!
-    # But it works fine for 2020.2 and 2022.2. It seems like everything but the
-    # last transaction correctly goes through just fine. The last transaction
-    # is never output by the floating point adder, but the inputs are consumed. 
+    # TODO !!!!! THIS TEST STALLS IN HARDWARE EMULATION WITH VITIS 2021.2 and 2022.1 !!!!!
+    # But it works fine for 2020.2, 2022.2, and 2023.1. It seems like
+    # everything but the last transaction correctly goes through just fine. The
+    # last transaction is never output by the floating point adder, but the
+    # inputs are consumed.
     with dace.config.set_temporary('compiler', 'xilinx', 'frequency', value='"0:300\\|1:600"'):
         # Generate the test data and expected results
         size_n = 1024
diff --git a/tests/rtl/simulation_test.py b/tests/rtl/simulation_test.py
index f20ff6133a..6b7ac2cd15 100644
--- a/tests/rtl/simulation_test.py
+++ b/tests/rtl/simulation_test.py
@@ -1,5 +1,7 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    Test suite for testing RTL tasklets in DaCe with Verilator as a backend for simulation.
+"""
 import dace
 import numpy as np
 import pytest