Fixed error when an accessor from an RTL tasklet is a stream (#1403)

* Copyright bump * Ensured all RTL samples' comments are of a consistent style, and mention which target mode they're intended for. * Added a comment about the temporal vectorization hardware test stalling in 2022.1.
spcl · Oct 19, 2023 · 8402e52 · 8402e52
1 parent 6f471cf
commit 8402e52
Show file tree

Hide file tree

Showing 13 changed files with 256 additions and 228 deletions.
diff --git a/dace/codegen/targets/rtl.py b/dace/codegen/targets/rtl.py
@@ -1,8 +1,8 @@
 # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
 
 import itertools
-
 from typing import List, Tuple, Dict
+import warnings
 
 from dace import dtypes, config, registry, symbolic, nodes, sdfg, data
 from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode
@@ -102,6 +102,21 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i
                 elif isinstance(arr, data.Scalar):
                     line: str = "{} {} = {};".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                                                      edge.src.data)
+                elif isinstance(arr, data.Stream):
+                    # TODO Streams are currently unsupported, as the proper
+                    # behaviour has to be implemented to avoid deadlocking. It
+                    # is only a warning, as the RTL backend is partially used
+                    # by the Xilinx backend, which may hit this case, but will
+                    # discard the erroneous code.
+                    warnings.warn(
+                        'Streams are currently unsupported by the RTL backend.' \
+                        'This may produce errors or deadlocks in the generated code.'
+                    )
+                    line: str = "// WARNING: Unsupported read from ({}) variable '{}' from stream '{}'." \
+                        " This may lead to a deadlock if used in code.\n".format(
+                            dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src_conn)
+                    line += "{} {} = {}.pop();".format(
+                            dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data)
         elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet):
             rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
             self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants)

diff --git a/samples/fpga/rtl/add_fortytwo.py b/samples/fpga/rtl/add_fortytwo.py
@@ -1,8 +1,9 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows adding a constant integer value to a stream of integers.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows adding a constant integer value to a stream of integers.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -116,21 +117,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
+        # init data structures
+        N.set(8192)
+        a = np.random.randint(0, 100, N.get()).astype(np.int32)
+        b = np.zeros((N.get(), )).astype(np.int32)
 
-    # init data structures
-    N.set(8192)
-    a = np.random.randint(0, 100, N.get()).astype(np.int32)
-    b = np.zeros((N.get(), )).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b, N=N)
+        # call program
+        sdfg(A=a, B=b, N=N)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    for i in range(N.get()):
-        assert b[i] == a[i] + 42
+        # check result
+        for i in range(N.get()):
+            assert b[i] == a[i] + 42
diff --git a/samples/fpga/rtl/axpy.py b/samples/fpga/rtl/axpy.py
@@ -1,7 +1,10 @@
-# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
-# operations. It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point
+    operations.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -259,4 +262,4 @@ def make_sdfg(veclen=2):
         expected = a * x + y
         diff = np.linalg.norm(expected - result) / N.get()
         print("Difference:", diff)
-    exit(0 if diff <= 1e-5 else 1)
+        assert diff <= 1e-5
diff --git a/samples/fpga/rtl/axpy_double_pump.py b/samples/fpga/rtl/axpy_double_pump.py
@@ -1,73 +1,74 @@
-# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows the AXPY BLAS routine. It is implemented through Xilinx
-# IPs in order to utilize double pumping, which doubles the performance per
-# consumed FPGA resource. The double pumping operation is "inwards", which
-# means that the internal vectorization width of the core computation is half
-# that of the external vectorization width. This translates into utilizing half
-# the amount of internal computing resources, compared to a regular vectorized
-# implementetation. The block diagram of the design for a 32-bit floating-point
-# implementation using vectorization width 2 is:
-#
-#          ap_aclk          s_axis_y_in        s_axis_x_in     a
-#             │                  │                  │          │
-#             │                  │                  │          │
-#             │                  │                  │          │
-#     ┌───────┼─────────┬────────┼─────────┐        │          │
-#     │       │         │        │         │        │          │
-#     │       │         │        ▼         │        ▼          │
-#     │       │         │  ┌────────────┐  │  ┌────────────┐   │
-#     │       │         └─►│            │  └─►│            │   │
-#     │       │            │ Clock sync │     │ Clock sync │   │
-#     │       │         ┌─►│            │  ┌─►│            │   │
-#     │       ▼ 300 MHz │  └─────┬──────┘  │  └─────┬──────┘   │
-#     │ ┌────────────┐  │        │         │        │          │
-#     │ │ Clock      │  │        │         │        │          │
-#     │ │            │  ├────────┼─────────┤        │          │
-#     │ │ Multiplier │  │        │         │        │          │
-#     │ └─────┬──────┘  │        ▼ 64 bit  │        ▼ 64 bit   │
-#     │       │ 600 MHz │  ┌────────────┐  │  ┌────────────┐   │
-#     │       │         │  │            │  │  │            │   │
-#     │       └─────────┼─►│ Data issue │  └─►│ Data issue │   │
-#     │                 │  │            │     │            │   │
-#     │                 │  └─────┬──────┘     └─────┬──────┘   │
-#     │                 │        │ 32 bit           │ 32 bit   │
-#     │                 │        │                  │          │
-#     │                 │        │                  │          │
-#     │                 │        │                  ▼          ▼
-#     │                 │        │                 ┌────────────┐
-#     │                 │        │                 │            │
-#     │                 ├────────┼────────────────►│ Multiplier │
-#     │                 │        │                 │            │
-#     │                 │        │                 └─────┬──────┘
-#     │                 │        │                       │
-#     │                 │        │        ┌──────────────┘
-#     │                 │        │        │
-#     │                 │        ▼        ▼
-#     │                 │      ┌────────────┐
-#     │                 │      │            │
-#     │                 ├─────►│    Adder   │
-#     │                 │      │            │
-#     │                 │      └─────┬──────┘
-#     │                 │            │
-#     │                 │            ▼ 32 bit
-#     │                 │      ┌─────────────┐
-#     │                 │      │             │
-#     │                 ├─────►│ Data packer │
-#     │                 │      │             │
-#     │                 │      └─────┬───────┘
-#     │                 │            │ 64 bit
-#     │                 │            ▼
-#     │                 │      ┌────────────┐
-#     │                 └─────►│            │
-#     │                        │ Clock sync │
-#     └───────────────────────►│            │
-#                              └─────┬──────┘
-#                                    │
-#                                    ▼
-#                            m_axis_result_out
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows the AXPY BLAS routine. It is implemented through Xilinx
+    IPs in order to utilize double pumping, which doubles the performance per
+    consumed FPGA resource. The double pumping operation is "inwards", which
+    means that the internal vectorization width of the core computation is half
+    that of the external vectorization width. This translates into utilizing half
+    the amount of internal computing resources, compared to a regular vectorized
+    implementation. The block diagram of the design for a 32-bit floating-point
+    implementation using vectorization width 2 is:
+
+             ap_aclk          s_axis_y_in        s_axis_x_in     a
+                │                  │                  │          │
+                │                  │                  │          │
+                │                  │                  │          │
+        ┌───────┼─────────┬────────┼─────────┐        │          │
+        │       │         │        │         │        │          │
+        │       │         │        ▼         │        ▼          │
+        │       │         │  ┌────────────┐  │  ┌────────────┐   │
+        │       │         └─►│            │  └─►│            │   │
+        │       │            │ Clock sync │     │ Clock sync │   │
+        │       │         ┌─►│            │  ┌─►│            │   │
+        │       ▼ 300 MHz │  └─────┬──────┘  │  └─────┬──────┘   │
+        │ ┌────────────┐  │        │         │        │          │
+        │ │ Clock      │  │        │         │        │          │
+        │ │            │  ├────────┼─────────┤        │          │
+        │ │ Multiplier │  │        │         │        │          │
+        │ └─────┬──────┘  │        ▼ 64 bit  │        ▼ 64 bit   │
+        │       │ 600 MHz │  ┌────────────┐  │  ┌────────────┐   │
+        │       │         │  │            │  │  │            │   │
+        │       └─────────┼─►│ Data issue │  └─►│ Data issue │   │
+        │                 │  │            │     │            │   │
+        │                 │  └─────┬──────┘     └─────┬──────┘   │
+        │                 │        │ 32 bit           │ 32 bit   │
+        │                 │        │                  │          │
+        │                 │        │                  │          │
+        │                 │        │                  ▼          ▼
+        │                 │        │                 ┌────────────┐
+        │                 │        │                 │            │
+        │                 ├────────┼────────────────►│ Multiplier │
+        │                 │        │                 │            │
+        │                 │        │                 └─────┬──────┘
+        │                 │        │                       │
+        │                 │        │        ┌──────────────┘
+        │                 │        │        │
+        │                 │        ▼        ▼
+        │                 │      ┌────────────┐
+        │                 │      │            │
+        │                 ├─────►│    Adder   │
+        │                 │      │            │
+        │                 │      └─────┬──────┘
+        │                 │            │
+        │                 │            ▼ 32 bit
+        │                 │      ┌─────────────┐
+        │                 │      │             │
+        │                 ├─────►│ Data packer │
+        │                 │      │             │
+        │                 │      └─────┬───────┘
+        │                 │            │ 64 bit
+        │                 │            ▼
+        │                 │      ┌────────────┐
+        │                 └─────►│            │
+        │                        │ Clock sync │
+        └───────────────────────►│            │
+                                 └─────┬──────┘
+                                       │
+                                       ▼
+                               m_axis_result_out
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -452,4 +453,4 @@ def make_sdfg(veclen=2):
             diff = np.linalg.norm(expected - result) / N.get()
             print("Difference:", diff)
 
-    exit(0 if diff <= 1e-5 else 1)
+            assert diff <= 1e-5
diff --git a/samples/fpga/rtl/fladd.py b/samples/fpga/rtl/fladd.py
@@ -1,10 +1,11 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows how to utilize an IP core in an RTL tasklet. This is done
-# through the vector add problem, which adds two floating point vectors
-# together.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows how to utilize an IP core in an RTL tasklet. This is done
+    through the vector add problem, which adds two floating point vectors
+    together.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -190,4 +191,4 @@
         expected = a + b
         diff = np.linalg.norm(expected - c) / N.get()
         print("Difference:", diff)
-    exit(0 if diff <= 1e-5 else 1)
+        assert diff <= 1e-5
diff --git a/samples/fpga/rtl/pipeline.py b/samples/fpga/rtl/pipeline.py
@@ -1,9 +1,10 @@
-# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-#
-# This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
-# integer input stream.
-#
-# It is intended for running hardware_emulation or hardware xilinx targets.
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+"""
+    This sample shows a DEPTH deep pipeline, where each stage adds 1 to the
+    integer input stream.
+
+    It is intended for running hardware_emulation or hardware xilinx targets.
+"""
 
 import dace
 import numpy as np
@@ -151,21 +152,21 @@
 ######################################################################
 
 if __name__ == '__main__':
+    with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'):
+        # init data structures
+        N.set(8192)
+        a = np.random.randint(0, 100, N.get()).astype(np.int32)
+        b = np.zeros((N.get(), )).astype(np.int32)
 
-    # init data structures
-    N.set(8192)
-    a = np.random.randint(0, 100, N.get()).astype(np.int32)
-    b = np.zeros((N.get(), )).astype(np.int32)
-
-    # show initial values
-    print("a={}, b={}".format(a, b))
+        # show initial values
+        print("a={}, b={}".format(a, b))
 
-    # call program
-    sdfg(A=a, B=b, N=N)
+        # call program
+        sdfg(A=a, B=b, N=N)
 
-    # show result
-    print("a={}, b={}".format(a, b))
+        # show result
+        print("a={}, b={}".format(a, b))
 
-    # check result
-    for i in range(N.get()):
-        assert b[i] == a[i] + depth
+        # check result
+        for i in range(N.get()):
+            assert b[i] == a[i] + depth