From 8402e526c5d5049204ff740e875c4dfc17e6c391 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Thu, 19 Oct 2023 19:15:21 +0200 Subject: [PATCH] Fixed error when an accessor from an RTL tasklet is a stream (#1403) * Copyright bump * Ensured all RTL samples' comments are of a consistent style, and mention which target mode they're intended for. * Added a comment about the temporal vectorization hardware test stalling in 2022.1. --- dace/codegen/targets/rtl.py | 17 ++- samples/fpga/rtl/add_fortytwo.py | 39 +++--- samples/fpga/rtl/axpy.py | 13 +- samples/fpga/rtl/axpy_double_pump.py | 143 +++++++++++----------- samples/fpga/rtl/fladd.py | 17 +-- samples/fpga/rtl/pipeline.py | 41 ++++--- samples/fpga/rtl/rtl_multi_tasklet.py | 44 +++---- samples/fpga/rtl/rtl_tasklet_parameter.py | 36 +++--- samples/fpga/rtl/rtl_tasklet_pipeline.py | 36 +++--- samples/fpga/rtl/rtl_tasklet_scalar.py | 30 ++--- samples/fpga/rtl/rtl_tasklet_vector.py | 40 +++--- tests/rtl/hardware_test.py | 22 ++-- tests/rtl/simulation_test.py | 6 +- 13 files changed, 256 insertions(+), 228 deletions(-) diff --git a/dace/codegen/targets/rtl.py b/dace/codegen/targets/rtl.py index dcb752e215..935615fad6 100644 --- a/dace/codegen/targets/rtl.py +++ b/dace/codegen/targets/rtl.py @@ -1,8 +1,8 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import itertools - from typing import List, Tuple, Dict +import warnings from dace import dtypes, config, registry, symbolic, nodes, sdfg, data from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode @@ -102,6 +102,21 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i elif isinstance(arr, data.Scalar): line: str = "{} {} = {};".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data) + elif isinstance(arr, data.Stream): + # TODO Streams are currently unsupported, as the proper + # behaviour has to be implemented to avoid deadlocking. 
It + # is only a warning, as the RTL backend is partially used + # by the Xilinx backend, which may hit this case, but will + # discard the erroneous code. + warnings.warn( + 'Streams are currently unsupported by the RTL backend.' \ + 'This may produce errors or deadlocks in the generated code.' + ) + line: str = "// WARNING: Unsupported read from ({}) variable '{}' from stream '{}'." \ + " This may lead to a deadlock if used in code.\n".format( + dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src_conn) + line += "{} {} = {}.pop();".format( + dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data) elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet): rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg) self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants) diff --git a/samples/fpga/rtl/add_fortytwo.py b/samples/fpga/rtl/add_fortytwo.py index 9c14ad098b..5abcd76a5b 100644 --- a/samples/fpga/rtl/add_fortytwo.py +++ b/samples/fpga/rtl/add_fortytwo.py @@ -1,8 +1,9 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -# -# This sample shows adding a constant integer value to a stream of integers. -# -# It is intended for running hardware_emulation or hardware xilinx targets. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + This sample shows adding a constant integer value to a stream of integers. + + It is intended for running hardware_emulation or hardware xilinx targets. 
+""" import dace import numpy as np @@ -116,21 +117,21 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'): + # init data structures + N.set(8192) + a = np.random.randint(0, 100, N.get()).astype(np.int32) + b = np.zeros((N.get(), )).astype(np.int32) - # init data structures - N.set(8192) - a = np.random.randint(0, 100, N.get()).astype(np.int32) - b = np.zeros((N.get(), )).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b, N=N) + # call program + sdfg(A=a, B=b, N=N) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - # check result - for i in range(N.get()): - assert b[i] == a[i] + 42 + # check result + for i in range(N.get()): + assert b[i] == a[i] + 42 diff --git a/samples/fpga/rtl/axpy.py b/samples/fpga/rtl/axpy.py index 8b720aaa1e..4f386c82a4 100644 --- a/samples/fpga/rtl/axpy.py +++ b/samples/fpga/rtl/axpy.py @@ -1,7 +1,10 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. -# -# This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point -# operations. It is intended for running hardware_emulation or hardware xilinx targets. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + This sample shows the AXPY BLAS routine. It is implemented through Xilinx IPs in order to utilize floating point + operations. + + It is intended for running hardware_emulation or hardware xilinx targets. 
+""" import dace import numpy as np @@ -259,4 +262,4 @@ def make_sdfg(veclen=2): expected = a * x + y diff = np.linalg.norm(expected - result) / N.get() print("Difference:", diff) - exit(0 if diff <= 1e-5 else 1) + assert diff <= 1e-5 diff --git a/samples/fpga/rtl/axpy_double_pump.py b/samples/fpga/rtl/axpy_double_pump.py index 2d44ab7689..c79948007b 100644 --- a/samples/fpga/rtl/axpy_double_pump.py +++ b/samples/fpga/rtl/axpy_double_pump.py @@ -1,73 +1,74 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. -# -# This sample shows the AXPY BLAS routine. It is implemented through Xilinx -# IPs in order to utilize double pumping, which doubles the performance per -# consumed FPGA resource. The double pumping operation is "inwards", which -# means that the internal vectorization width of the core computation is half -# that of the external vectorization width. This translates into utilizing half -# the amount of internal computing resources, compared to a regular vectorized -# implementetation. 
The block diagram of the design for a 32-bit floating-point -# implementation using vectorization width 2 is: -# -# ap_aclk s_axis_y_in s_axis_x_in a -# │ │ │ │ -# │ │ │ │ -# │ │ │ │ -# ┌───────┼─────────┬────────┼─────────┐ │ │ -# │ │ │ │ │ │ │ -# │ │ │ ▼ │ ▼ │ -# │ │ │ ┌────────────┐ │ ┌────────────┐ │ -# │ │ └─►│ │ └─►│ │ │ -# │ │ │ Clock sync │ │ Clock sync │ │ -# │ │ ┌─►│ │ ┌─►│ │ │ -# │ ▼ 300 MHz │ └─────┬──────┘ │ └─────┬──────┘ │ -# │ ┌────────────┐ │ │ │ │ │ -# │ │ Clock │ │ │ │ │ │ -# │ │ │ ├────────┼─────────┤ │ │ -# │ │ Multiplier │ │ │ │ │ │ -# │ └─────┬──────┘ │ ▼ 64 bit │ ▼ 64 bit │ -# │ │ 600 MHz │ ┌────────────┐ │ ┌────────────┐ │ -# │ │ │ │ │ │ │ │ │ -# │ └─────────┼─►│ Data issue │ └─►│ Data issue │ │ -# │ │ │ │ │ │ │ -# │ │ └─────┬──────┘ └─────┬──────┘ │ -# │ │ │ 32 bit │ 32 bit │ -# │ │ │ │ │ -# │ │ │ │ │ -# │ │ │ ▼ ▼ -# │ │ │ ┌────────────┐ -# │ │ │ │ │ -# │ ├────────┼────────────────►│ Multiplier │ -# │ │ │ │ │ -# │ │ │ └─────┬──────┘ -# │ │ │ │ -# │ │ │ ┌──────────────┘ -# │ │ │ │ -# │ │ ▼ ▼ -# │ │ ┌────────────┐ -# │ │ │ │ -# │ ├─────►│ Adder │ -# │ │ │ │ -# │ │ └─────┬──────┘ -# │ │ │ -# │ │ ▼ 32 bit -# │ │ ┌─────────────┐ -# │ │ │ │ -# │ ├─────►│ Data packer │ -# │ │ │ │ -# │ │ └─────┬───────┘ -# │ │ │ 64 bit -# │ │ ▼ -# │ │ ┌────────────┐ -# │ └─────►│ │ -# │ │ Clock sync │ -# └───────────────────────►│ │ -# └─────┬──────┘ -# │ -# ▼ -# m_axis_result_out -# -# It is intended for running hardware_emulation or hardware xilinx targets. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + This sample shows the AXPY BLAS routine. It is implemented through Xilinx + IPs in order to utilize double pumping, which doubles the performance per + consumed FPGA resource. The double pumping operation is "inwards", which + means that the internal vectorization width of the core computation is half + that of the external vectorization width. 
This translates into utilizing half + the amount of internal computing resources, compared to a regular vectorized + implementetation. The block diagram of the design for a 32-bit floating-point + implementation using vectorization width 2 is: + + ap_aclk s_axis_y_in s_axis_x_in a + │ │ │ │ + │ │ │ │ + │ │ │ │ + ┌───────┼─────────┬────────┼─────────┐ │ │ + │ │ │ │ │ │ │ + │ │ │ ▼ │ ▼ │ + │ │ │ ┌────────────┐ │ ┌────────────┐ │ + │ │ └─►│ │ └─►│ │ │ + │ │ │ Clock sync │ │ Clock sync │ │ + │ │ ┌─►│ │ ┌─►│ │ │ + │ ▼ 300 MHz │ └─────┬──────┘ │ └─────┬──────┘ │ + │ ┌────────────┐ │ │ │ │ │ + │ │ Clock │ │ │ │ │ │ + │ │ │ ├────────┼─────────┤ │ │ + │ │ Multiplier │ │ │ │ │ │ + │ └─────┬──────┘ │ ▼ 64 bit │ ▼ 64 bit │ + │ │ 600 MHz │ ┌────────────┐ │ ┌────────────┐ │ + │ │ │ │ │ │ │ │ │ + │ └─────────┼─►│ Data issue │ └─►│ Data issue │ │ + │ │ │ │ │ │ │ + │ │ └─────┬──────┘ └─────┬──────┘ │ + │ │ │ 32 bit │ 32 bit │ + │ │ │ │ │ + │ │ │ │ │ + │ │ │ ▼ ▼ + │ │ │ ┌────────────┐ + │ │ │ │ │ + │ ├────────┼────────────────►│ Multiplier │ + │ │ │ │ │ + │ │ │ └─────┬──────┘ + │ │ │ │ + │ │ │ ┌──────────────┘ + │ │ │ │ + │ │ ▼ ▼ + │ │ ┌────────────┐ + │ │ │ │ + │ ├─────►│ Adder │ + │ │ │ │ + │ │ └─────┬──────┘ + │ │ │ + │ │ ▼ 32 bit + │ │ ┌─────────────┐ + │ │ │ │ + │ ├─────►│ Data packer │ + │ │ │ │ + │ │ └─────┬───────┘ + │ │ │ 64 bit + │ │ ▼ + │ │ ┌────────────┐ + │ └─────►│ │ + │ │ Clock sync │ + └───────────────────────►│ │ + └─────┬──────┘ + │ + ▼ + m_axis_result_out + + It is intended for running hardware_emulation or hardware xilinx targets. 
+""" import dace import numpy as np @@ -452,4 +453,4 @@ def make_sdfg(veclen=2): diff = np.linalg.norm(expected - result) / N.get() print("Difference:", diff) - exit(0 if diff <= 1e-5 else 1) + assert diff <= 1e-5 diff --git a/samples/fpga/rtl/fladd.py b/samples/fpga/rtl/fladd.py index f22d419cbc..daf1ed269b 100644 --- a/samples/fpga/rtl/fladd.py +++ b/samples/fpga/rtl/fladd.py @@ -1,10 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -# -# This sample shows how to utilize an IP core in an RTL tasklet. This is done -# through the vector add problem, which adds two floating point vectors -# together. -# -# It is intended for running hardware_emulation or hardware xilinx targets. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + This sample shows how to utilize an IP core in an RTL tasklet. This is done + through the vector add problem, which adds two floating point vectors + together. + + It is intended for running hardware_emulation or hardware xilinx targets. +""" import dace import numpy as np @@ -190,4 +191,4 @@ expected = a + b diff = np.linalg.norm(expected - c) / N.get() print("Difference:", diff) - exit(0 if diff <= 1e-5 else 1) + assert diff <= 1e-5 diff --git a/samples/fpga/rtl/pipeline.py b/samples/fpga/rtl/pipeline.py index b487da91ce..dbd0460fb0 100644 --- a/samples/fpga/rtl/pipeline.py +++ b/samples/fpga/rtl/pipeline.py @@ -1,9 +1,10 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -# -# This sample shows a DEPTH deep pipeline, where each stage adds 1 to the -# integer input stream. -# -# It is intended for running hardware_emulation or hardware xilinx targets. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + This sample shows a DEPTH deep pipeline, where each stage adds 1 to the + integer input stream. + + It is intended for running hardware_emulation or hardware xilinx targets. 
+""" import dace import numpy as np @@ -151,21 +152,21 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='hardware_emulation'): + # init data structures + N.set(8192) + a = np.random.randint(0, 100, N.get()).astype(np.int32) + b = np.zeros((N.get(), )).astype(np.int32) - # init data structures - N.set(8192) - a = np.random.randint(0, 100, N.get()).astype(np.int32) - b = np.zeros((N.get(), )).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b, N=N) + # call program + sdfg(A=a, B=b, N=N) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - # check result - for i in range(N.get()): - assert b[i] == a[i] + depth + # check result + for i in range(N.get()): + assert b[i] == a[i] + depth diff --git a/samples/fpga/rtl/rtl_multi_tasklet.py b/samples/fpga/rtl/rtl_multi_tasklet.py index a646eb6be9..4a4a09deec 100644 --- a/samples/fpga/rtl/rtl_multi_tasklet.py +++ b/samples/fpga/rtl/rtl_multi_tasklet.py @@ -1,11 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ Two sequential RTL tasklets connected through a memlet. + + It is intended for running simulation xilinx targets. """ import dace -import argparse - import numpy as np # add sdfg @@ -32,7 +32,7 @@ m_axis_b_tdata <= 0; s_axis_a_tready <= 1'b1; state <= READY; - end else if (s_axis_a_tvalid && state == READY) begin // case: load a + end else if (s_axis_a_tvalid && state == READY) begin // case: load a m_axis_b_tdata <= s_axis_a_tdata; s_axis_a_tready <= 1'b0; state <= BUSY; @@ -41,7 +41,7 @@ else m_axis_b_tdata <= m_axis_b_tdata; state <= DONE; -end +end assign m_axis_b_tvalid = (m_axis_b_tdata >= 80) ? 
1'b1:1'b0; """, @@ -59,7 +59,7 @@ m_axis_c_tdata <= 0; s_axis_b_tready <= 1'b1; state <= READY; - end else if (s_axis_b_tvalid && state == READY) begin // case: load a + end else if (s_axis_b_tvalid && state == READY) begin // case: load a m_axis_c_tdata <= s_axis_b_tdata; s_axis_b_tready <= 1'b0; state <= BUSY; @@ -68,9 +68,9 @@ else m_axis_c_tdata <= m_axis_c_tdata; state <= DONE; -end +end -assign m_axis_c_tvalid = (m_axis_c_tdata >= 100) ? 1'b1:1'b0; +assign m_axis_c_tvalid = (m_axis_c_tdata >= 100) ? 1'b1:1'b0; """, language=dace.Language.SystemVerilog) @@ -92,21 +92,21 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'): + # init data structures + a = np.random.randint(0, 80, 1).astype(np.int32) + b = np.array([0]).astype(np.int32) + c = np.array([0]).astype(np.int32) - # init data structures - a = np.random.randint(0, 80, 1).astype(np.int32) - b = np.array([0]).astype(np.int32) - c = np.array([0]).astype(np.int32) - - # show initial values - print("a={}, b={}, c={}".format(a, b, c)) + # show initial values + print("a={}, b={}, c={}".format(a, b, c)) - # call program - sdfg(A=a, B=b, C=c) + # call program + sdfg(A=a, B=b, C=c) - # show result - print("a={}, b={}, c={}".format(a, b, c)) + # show result + print("a={}, b={}, c={}".format(a, b, c)) - # check result - assert b == 80 - assert c == 100 + # check result + assert b == 80 + assert c == 100 diff --git a/samples/fpga/rtl/rtl_tasklet_parameter.py b/samples/fpga/rtl/rtl_tasklet_parameter.py index d20688b385..112e88a6bf 100644 --- a/samples/fpga/rtl/rtl_tasklet_parameter.py +++ b/samples/fpga/rtl/rtl_tasklet_parameter.py @@ -1,11 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ Simple RTL tasklet with a single scalar input and a single scalar output. 
It increments b from a up to 100. + + It is intended for running simulation xilinx targets. """ import dace -import argparse - import numpy as np # add sdfg @@ -47,7 +47,7 @@ m_axis_b_tdata <= 0; s_axis_a_tready <= 1'b1; state <= READY; - end else if (s_axis_a_tvalid && state == READY) begin // case: load a + end else if (s_axis_a_tvalid && state == READY) begin // case: load a m_axis_b_tdata <= s_axis_a_tdata; s_axis_a_tready <= 1'b0; state <= BUSY; @@ -56,9 +56,9 @@ else m_axis_b_tdata <= m_axis_b_tdata; state <= DONE; - end + end - assign m_axis_b_tvalid = (m_axis_b_tdata >= MAX_VAL) ? 1'b1:1'b0; + assign m_axis_b_tvalid = (m_axis_b_tdata >= MAX_VAL) ? 1'b1:1'b0; ''', language=dace.Language.SystemVerilog) @@ -76,19 +76,19 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'): + # init data structures + a = np.random.randint(0, 100, 1).astype(np.int32) + b = np.array([0]).astype(np.int32) - # init data structures - a = np.random.randint(0, 100, 1).astype(np.int32) - b = np.array([0]).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b) + # call program + sdfg(A=a, B=b) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - # check result - assert b == sdfg.constants["MAX_VAL"] + # check result + assert b == sdfg.constants["MAX_VAL"] diff --git a/samples/fpga/rtl/rtl_tasklet_pipeline.py b/samples/fpga/rtl/rtl_tasklet_pipeline.py index 9166806c63..3ef20cd03f 100644 --- a/samples/fpga/rtl/rtl_tasklet_pipeline.py +++ b/samples/fpga/rtl/rtl_tasklet_pipeline.py @@ -1,11 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
""" Pipelined, AXI-handshake compliant example that increments b from a up to 100. + + It is intended for running simulation xilinx targets. """ import dace -import argparse - import numpy as np # add symbol @@ -59,7 +59,7 @@ state <= state_next; end - always_comb + always_comb begin state_next = state; case(state) @@ -132,21 +132,21 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'): + # init data structures + num_elements = dace.symbolic.evaluate(N, sdfg.constants) + a = np.random.randint(0, 100, num_elements).astype(np.int32) + b = np.array([0] * num_elements).astype(np.int32) - # init data structures - num_elements = dace.symbolic.evaluate(N, sdfg.constants) - a = np.random.randint(0, 100, num_elements).astype(np.int32) - b = np.array([0] * num_elements).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b) + # call program + sdfg(A=a, B=b) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - assert b[ - 0] == 100 # TODO: implement detection of #elements to process, s.t. we can extend the assertion to the whole array - assert np.all(map((lambda x: x == 0), b[1:-1])) # should still be at the init value (for the moment) + assert b[ + 0] == 100 # TODO: implement detection of #elements to process, s.t. we can extend the assertion to the whole array + assert np.all(map((lambda x: x == 0), b[1:-1])) # should still be at the init value (for the moment) diff --git a/samples/fpga/rtl/rtl_tasklet_scalar.py b/samples/fpga/rtl/rtl_tasklet_scalar.py index c9f6380a2b..cf8d53ec91 100644 --- a/samples/fpga/rtl/rtl_tasklet_scalar.py +++ b/samples/fpga/rtl/rtl_tasklet_scalar.py @@ -1,11 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ Simple RTL tasklet with a single scalar input and a single scalar output. It increments b from a up to 100. + + It is intended for running simulation xilinx targets. """ import dace -import argparse - import numpy as np # add sdfg @@ -79,19 +79,19 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'): + # init data structures + a = np.random.randint(0, 100, 1).astype(np.int32) + b = np.array([0]).astype(np.int32) - # init data structures - a = np.random.randint(0, 100, 1).astype(np.int32) - b = np.array([0]).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b) + # call program + sdfg(A=a, B=b) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - # check result - assert b == 100 + # check result + assert b == 100 diff --git a/samples/fpga/rtl/rtl_tasklet_vector.py b/samples/fpga/rtl/rtl_tasklet_vector.py index c099a6a38d..9015b4f35e 100644 --- a/samples/fpga/rtl/rtl_tasklet_vector.py +++ b/samples/fpga/rtl/rtl_tasklet_vector.py @@ -1,11 +1,11 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ RTL tasklet with a vector input of 4 int32 (width=128bits) and a single scalar output. It increments b from a[31:0] up to 100. + + It is intended for running simulation xilinx targets. 
""" import dace -import argparse - import numpy as np # add symbol @@ -44,13 +44,13 @@ typedef enum [1:0] {READY, BUSY, DONE} state_e; state_e state; - + always@(posedge ap_aclk) begin if (ap_areset) begin // case: reset m_axis_b_tdata <= 0; s_axis_a_tready <= 1'b1; state <= READY; - end else if (s_axis_a_tvalid && state == READY) begin // case: load a + end else if (s_axis_a_tvalid && state == READY) begin // case: load a m_axis_b_tdata <= s_axis_a_tdata[0]; s_axis_a_tready <= 1'b0; state <= BUSY; @@ -60,9 +60,9 @@ m_axis_b_tdata <= m_axis_b_tdata; state <= DONE; end - end - - assign m_axis_b_tvalid = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0; + end + + assign m_axis_b_tvalid = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0; ''', language=dace.Language.SystemVerilog) @@ -80,19 +80,19 @@ ###################################################################### if __name__ == '__main__': + with dace.config.set_temporary('compiler', 'xilinx', 'mode', value='simulation'): + # init data structures + a = np.random.randint(0, 100, dace.symbolic.evaluate(WIDTH, sdfg.constants)).astype(np.int32) + b = np.array([0]).astype(np.int32) - # init data structures - a = np.random.randint(0, 100, dace.symbolic.evaluate(WIDTH, sdfg.constants)).astype(np.int32) - b = np.array([0]).astype(np.int32) - - # show initial values - print("a={}, b={}".format(a, b)) + # show initial values + print("a={}, b={}".format(a, b)) - # call program - sdfg(A=a, B=b) + # call program + sdfg(A=a, B=b) - # show result - print("a={}, b={}".format(a, b)) + # show result + print("a={}, b={}".format(a, b)) - # check result - assert b == a[0] + a[1] + # check result + assert b == a[0] + a[1] diff --git a/tests/rtl/hardware_test.py b/tests/rtl/hardware_test.py index 821688f481..727dc7362b 100644 --- a/tests/rtl/hardware_test.py +++ b/tests/rtl/hardware_test.py @@ -1,4 +1,7 @@ -# Copyright 
2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + Test suite for testing RTL integration with DaCe targeting Xilinx FPGAs. +""" import dace from dace.fpga_testing import rtl_test import numpy as np @@ -13,7 +16,7 @@ def make_vadd_sdfg(N: dace.symbol, veclen: int = 8): ''' Function for generating a simple vector addition SDFG that adds a vector `A` of `N` elements to a scalar `B` into a vector `C` of `N` elements, all using SystemVerilog. - The tasklet creates `veclen` instances of a floating point adder that operates on `N` elements. + The tasklet creates `veclen` instances of a floating point adder that operates on `N` elements. :param N: The number of elements the SDFG takes as input and output. :param veclen: The number of floating point adders to instantiate. @@ -197,7 +200,7 @@ def make_vadd_multi_sdfg(N, M): :param N: The number of elements to compute on. :param M: The number of compute PEs to initialize. - :return: An SDFG that has arguments `A` and `B`. + :return: An SDFG that has arguments `A` and `B`. ''' # add sdfg sdfg = dace.SDFG(f'integer_vector_plus_42_multiple_kernels_{N.get() // M.get()}') @@ -321,7 +324,7 @@ def make_vadd_multi_sdfg(N, M): @rtl_test() def test_hardware_vadd(): ''' - Test for the simple vector addition. + Test for the simple vector addition. ''' # add symbol @@ -346,7 +349,7 @@ def test_hardware_vadd(): @rtl_test() def test_hardware_add42_single(): ''' - Test for adding a constant using a single PE. + Test for adding a constant using a single PE. ''' N = dace.symbol('N') M = dace.symbol('M') @@ -428,10 +431,11 @@ def test_hardware_vadd_temporal_vectorization(): ''' Tests whether the multi-pumping optimization can be applied automatically by applying the temporal vectorization transformation. It starts from a numpy vector addition for generating the SDFG. 
This SDFG is then optimized by applying the vectorization, streaming memory, fpga and temporal vectorization transformations in that order. ''' - # TODO !!!!! THIS TEST STALLS IN HARDWARE EMULATION WITH VITIS 2021.2 !!!!! - # But it works fine for 2020.2 and 2022.2. It seems like everything but the - # last transaction correctly goes through just fine. The last transaction - # is never output by the floating point adder, but the inputs are consumed. + # TODO !!!!! THIS TEST STALLS IN HARDWARE EMULATION WITH VITIS 2021.2 and 2022.1 !!!!! + # But it works fine for 2020.2, 2022.2, and 2023.1. It seems like + # everything but the last transaction correctly goes through just fine. The + # last transaction is never output by the floating point adder, but the + # inputs are consumed. with dace.config.set_temporary('compiler', 'xilinx', 'frequency', value='"0:300\\|1:600"'): # Generate the test data and expected results size_n = 1024 diff --git a/tests/rtl/simulation_test.py b/tests/rtl/simulation_test.py index f20ff6133a..6b7ac2cd15 100644 --- a/tests/rtl/simulation_test.py +++ b/tests/rtl/simulation_test.py @@ -1,5 +1,7 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. - +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" + Test suite for testing RTL tasklets in DaCe with Verilator as a backend for simulation. +""" import dace import numpy as np import pytest