From 1a118580c75f26b0c9c00a15090edff452a294a6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 3 Apr 2024 10:23:13 +0200 Subject: [PATCH 01/30] [Data packing] Disable unwanted sign extension in fast mode --- src/finn/util/data_packing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 7698850029..cad2b6ca23 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -296,8 +296,9 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru else: raise Exception("input_file must be ndarray or filename for .npy") if inp.shape[-1] == 1 and input_dtype.is_integer(): + mask = (1 << input_dtype.bitwidth()) - 1 packed_data = inp.flatten().astype(input_dtype.to_numpy_dt()) - packed_data = [int(x) for x in packed_data] + packed_data = [int(x) & mask for x in packed_data] else: packed_data = pack_innermost_dim_as_hex_string( inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner From ed46d83494e781fd24308f620360e636104eb539 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 17 May 2024 09:40:46 +0000 Subject: [PATCH 02/30] [Data packing] fix bipolar case, add test --- src/finn/util/data_packing.py | 6 ++++- tests/util/test_data_packing_hls.py | 41 ++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index cad2b6ca23..6a72d38058 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -295,7 +295,11 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru inp = np.load(input_file) else: raise Exception("input_file must be ndarray or filename for .npy") - if inp.shape[-1] == 1 and input_dtype.is_integer(): + if ( + inp.shape[-1] == 1 + and input_dtype.is_integer() + and input_dtype.get_canonical_name() != "BIPOLAR" + ): mask = (1 << input_dtype.bitwidth()) - 1 packed_data = 
inp.flatten().astype(input_dtype.to_numpy_dt()) packed_data = [int(x) & mask for x in packed_data] diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing_hls.py index b95bcd5d42..a718f171e2 100644 --- a/tests/util/test_data_packing_hls.py +++ b/tests/util/test_data_packing_hls.py @@ -36,7 +36,7 @@ from qonnx.util.basic import gen_finn_dt_tensor from finn.util.basic import make_build_dir -from finn.util.data_packing import numpy_to_hls_code +from finn.util.data_packing import npy_to_rtlsim_input, numpy_to_hls_code @pytest.mark.util @@ -141,3 +141,42 @@ def remove_all_whitespace(s): eB = """{{ap_uint<4>("0xf", 16), ap_uint<4>("0xf", 16)}, {ap_uint<4>("0x7", 16), ap_uint<4>("0xd", 16)}};""" assert remove_all_whitespace(ret) == remove_all_whitespace(eB) + + +@pytest.mark.util +@pytest.mark.parametrize( + "dtype", + [ + DataType["BINARY"], + DataType["BIPOLAR"], + DataType["TERNARY"], + DataType["INT2"], + DataType["INT7"], + DataType["INT8"], + DataType["INT22"], + DataType["INT32"], + DataType["UINT7"], + DataType["UINT8"], + DataType["UINT15"], + DataType["FIXED<9,6>"], + DataType["FLOAT32"], + ], +) +def test_npy_to_rtlsim_input(dtype): + # check if slow and fast data packing produce the same non-sign-extended input for rtlsim + # fast mode is triggered for certain data types if last (SIMD) dim = 1 + inp_fast = gen_finn_dt_tensor(dtype, (1, 8, 8, 8 // 1, 1)) # N H W FOLD SIMD + inp_slow = inp_fast.reshape((1, 8, 8, 8 // 2, 2)) # N H W FOLD SIMD + + output_fast = npy_to_rtlsim_input(inp_fast, dtype, 1 * dtype.bitwidth()) + output_slow = npy_to_rtlsim_input(inp_slow, dtype, 2 * dtype.bitwidth()) + + output_slow_split = [] + for x in output_slow: + # least significant bits = first element: + output_slow_split.append(x & ((1 << dtype.bitwidth()) - 1)) + # remaining bits = second element: + output_slow_split.append(x >> dtype.bitwidth()) + + assert all([(x >> dtype.bitwidth()) == 0 for x in output_fast]), "extraneous bits detected" + assert 
np.all(output_fast == output_slow_split), "different behavior of packing modes detected" From 964c8ca758615c5dc3c08bfe075a702079494fe4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 17 May 2024 09:41:53 +0000 Subject: [PATCH 03/30] Rename data packing test file --- tests/util/{test_data_packing_hls.py => test_data_packing.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/util/{test_data_packing_hls.py => test_data_packing.py} (100%) diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing.py similarity index 100% rename from tests/util/test_data_packing_hls.py rename to tests/util/test_data_packing.py From 6dc38ba2fceb2a86762e971a7ce7153955943bf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 21 May 2024 20:43:35 +0100 Subject: [PATCH 04/30] Fix lane partitioning in 4-bit DSP compute. --- finn-rtllib/mvu/mvu_4sx4u.sv | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 703bde665e..ab94825c4a 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -79,7 +79,10 @@ module mvu_4sx4u #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + localparam int unsigned D[4:0] = // Lane offsets + VERSION == 1? '{ ACCU_WIDTH+21, 21, 14, 7, 0 } : + VERSION == 2? '{ ACCU_WIDTH+23, 23, 16, 8, 0 } : + /* else */ '{ default: 0 }; localparam int unsigned PIPE_COUNT = (PE+3)/4; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -88,7 +91,7 @@ module mvu_4sx4u #( localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); localparam int unsigned PE_REM = 4*(c+1) - PE_END; - uwire [57:0] p3[SIMD]; + uwire [47:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD][3]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD From 6772e0344339c8e676fbe73806f23d89db6bb86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 22 May 2024 19:26:06 +0100 Subject: [PATCH 05/30] Harden 4-bit DSP MVU for promotion of device primitive. --- finn-rtllib/mvu/mvu_4sx4u.sv | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index ab94825c4a..7f3d6961e3 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -83,6 +83,7 @@ module mvu_4sx4u #( VERSION == 1? '{ ACCU_WIDTH+21, 21, 14, 7, 0 } : VERSION == 2? '{ ACCU_WIDTH+23, 23, 16, 8, 0 } : /* else */ '{ default: 0 }; + localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath localparam int unsigned PIPE_COUNT = (PE+3)/4; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -127,7 +128,14 @@ module mvu_4sx4u #( aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin dd[D[pe + PE_REM]+:3] = ww[pe]; - aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + + // The sign of the weights are generally put on the subtracted A port. + // However, when coinciding with the actual sign bit position of the + // multiplier input path, it also goes onto the D input. This prevents + // sign extensions that may happen when a DSP primitive is auto-promoted + // to a newer generation. + if(D[pe + PE_REM]+3 == A_WIDTH-1) dd[D[pe + PE_REM]+3] = ww[pe][3]; + else aa[D[pe + PE_REM]+3] = ww[pe][3]; end end end : blkVectorize @@ -138,6 +146,7 @@ module mvu_4sx4u #( // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin @@ -145,7 +154,7 @@ module mvu_4sx4u #( else if(en) B1 <= bb; end - logic signed [26:0] AD1 = 0; + logic signed [A_WIDTH-1:0] AD1 = 0; always_ff @(posedge clk) begin if(rst) AD1 <= 0; else if(en) AD1 <= dd - aa; From 9e2ba5ca6c692f84fb63398205051470581242ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 22 May 2024 19:26:43 +0100 Subject: [PATCH 06/30] Restrict to narrow-range weights for the moment. --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index fff69739bc..d3532bcfea 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -41,8 +41,8 @@ module mvu_axi_tb(); localparam int unsigned MH = 32; localparam int unsigned SIMD = 48; localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 2.0; - localparam bit FORCE_BEHAVIORAL = 1; + localparam int unsigned SEGMENTLEN = 2; + localparam bit FORCE_BEHAVIORAL = 0; localparam bit M_REG_LUT = 1; // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 4; @@ -112,6 +112,17 @@ module mvu_axi_tb(); function weight_matrix_t init_WEIGHTS; automatic weight_matrix_t res; std::randomize(res); + for(int unsigned nf = 0; nf < NF; nf++) begin + for(int unsigned sf = 0; sf < SF; sf++) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + for(int unsigned simd = 0; simd < SIMD; simd++) begin + if(res[nf][sf][pe][simd] == (1 << (WEIGHT_WIDTH-1))) begin + res[nf][sf][pe][simd]++; + end + end + end + end + end return res; endfunction : init_WEIGHTS; From 739d64468d0d754f6cd1b54045f8ad7af466202e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 24 May 2024 19:40:03 +0100 Subject: [PATCH 07/30] Enable non-narrow weights for DSP48E2. 
Expose version in core selection. --- finn-rtllib/mvu/mvu_4sx4u.sv | 148 ++++++++++------ finn-rtllib/mvu/mvu_vvu_axi.sv | 19 +- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 286 ++++++++++++++++--------------- 3 files changed, 266 insertions(+), 187 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 7f3d6961e3..2f2e1c0d23 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -36,8 +36,9 @@ module mvu_4sx4u #( int unsigned SIMD, int unsigned ACCU_WIDTH, - int unsigned VERSION = 1, + int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS bit SIGNED_ACTIVATIONS = 0, + bit NARROW_WEIGHTS = 0, // Weights from [-7:7] rather than [-8:7] bit FORCE_BEHAVIORAL = 0 )( // Global Control @@ -62,6 +63,54 @@ module mvu_4sx4u #( `endif FORCE_BEHAVIORAL; + //----------------------------------------------------------------------- + // Determine Lane Configuration + typedef struct { + int unsigned OFFSET[4:0]; + int unsigned LO_WIDTH[3:0]; + int unsigned HI_WIDTH[2:0]; + int unsigned LO_WIDTH_MAX; // excluding leftmost lane + int unsigned HI_WIDTH_MAX; // excluding leftmost lane + } slicing_t; + function slicing_t sliceLanes(); + automatic slicing_t slicing; + + // Determine Lane Offsets + unique case(VERSION) + 1: begin + if(!NARROW_WEIGHTS) begin + $error("%m: Need NARROW_WEIGHTS for DSP48E1."); + $finish; + end + slicing.OFFSET = '{ ACCU_WIDTH+21, 21, 14, 7, 0 }; + end + 2: begin + slicing.OFFSET = NARROW_WEIGHTS? 
+ '{ ACCU_WIDTH+23, 23, 16, 8, 0 } : + '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; + end + endcase + + // Derive other Lane Attributes + for(int unsigned i = 0; i < 4; i++) begin + automatic int unsigned lw = slicing.OFFSET[i+1] - slicing.OFFSET[i]; + slicing.LO_WIDTH[i] = lw; + + if(i < 3) begin + automatic int unsigned hw = 1 + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD); + slicing.HI_WIDTH[i] = hw; + + if(lw > slicing.LO_WIDTH_MAX) slicing.LO_WIDTH_MAX = lw; + if(hw > slicing.HI_WIDTH_MAX) slicing.HI_WIDTH_MAX = hw; + end + end + + return slicing; + endfunction : sliceLanes + localparam slicing_t SLICING = sliceLanes(); + localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath + + // Compute the count of descendants for all nodes in the reduction trees. typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); automatic leave_load_t res; @@ -79,12 +128,6 @@ module mvu_4sx4u #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[4:0] = // Lane offsets - VERSION == 1? '{ ACCU_WIDTH+21, 21, 14, 7, 0 } : - VERSION == 2? 
'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : - /* else */ '{ default: 0 }; - localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath - localparam int unsigned PIPE_COUNT = (PE+3)/4; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -102,7 +145,7 @@ module mvu_4sx4u #( logic [26:0] dd; logic [ 1:0] xx[3:1]; if(1) begin : blkVectorize - uwire [3:0] ww[PE_END - PE_BEG]; + uwire signed [3:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin @@ -127,15 +170,19 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_REM]+:3] = ww[pe]; + automatic int unsigned ofs = SLICING.OFFSET[pe + PE_REM]; + dd[ofs+:3] = ww[pe]; + assert(!NARROW_WEIGHTS || (ww[pe] != -8)) else begin + $warning("Weight of -8 violates NARROW_WEIGHTS commitment."); + end // The sign of the weights are generally put on the subtracted A port. // However, when coinciding with the actual sign bit position of the // multiplier input path, it also goes onto the D input. This prevents // sign extensions that may happen when a DSP primitive is auto-promoted // to a newer generation. - if(D[pe + PE_REM]+3 == A_WIDTH-1) dd[D[pe + PE_REM]+3] = ww[pe][3]; - else aa[D[pe + PE_REM]+3] = ww[pe][3]; + if(ofs+3 == A_WIDTH-1) dd[ofs+3] = ww[pe][3]; + else aa[ofs+3] = ww[pe][3]; end end end : blkVectorize @@ -441,14 +488,14 @@ module mvu_4sx4u #( X1 <= xx; X2 <= X1; foreach(X3[i]) begin - X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[SLICING.OFFSET[i]+:2]); end end end // Derive actual cross-lane overflows for(genvar i = 0; i < 3; i++) begin - assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + assign h3[s][i] = pp[SLICING.OFFSET[i+1]+:2] - X3[i+1]; end assign p3[s] = pp; @@ -457,51 +504,55 @@ module mvu_4sx4u #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [$clog2(2**(ACCU_WIDTH-8)+SIMD):0] hi4[3]; // min LO_WIDTH=7 - uwire [$clog2(SIMD)+7 :0] lo4[3]; // max LO_WIDTH=8 + uwire signed [ SLICING.HI_WIDTH_MAX-1:0] hi4[3]; + uwire [$clog2(SIMD)+SLICING.LO_WIDTH_MAX-1:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-LO_WIDTH-1)+SIMD); // Conclusive high part accumulation - if(i >= PE_REM && i < 3) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); - assign tree[n] = s; - end + if(i < 3) begin : genHi + if(i < PE_REM) assign hi4[i] = '0; + else begin + localparam int unsigned HI_WIDTH = SLICING.HI_WIDTH[i]; + + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) begin - automatic logic signed [HI_WIDTH:0] h = 
$signed(L[4]? 0 : Hi4) + $signed(tree[0]); - assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin - $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); - $stop; + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin + $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $stop; + end + Hi4 <= h; end - Hi4 <= h; end + assign hi4[i] = Hi4; + end - assign hi4[i] = Hi4; end : genHi - else if (i < 3) begin : genHiZero - assign hi4[i] = '0; - end : genHiZero // Conclusive low part accumulation (all unsigned arithmetic) - if(i >= PE_REM) begin : blkLo + if(i < PE_REM) assign lo4[i] = '0; + else begin : genLo + localparam int unsigned LO_WIDTH = SLICING.LO_WIDTH[i]; + // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][SLICING.OFFSET[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); @@ -517,10 +568,7 @@ module mvu_4sx4u #( if(i == 3) assign up4 = Lo4; else assign lo4[i] = Lo4; - end : blkLo - else begin : blkLoZero - assign lo4[i] = '0; - end : blkLoZero + end : genLo end @@ -530,9 +578,9 @@ module mvu_4sx4u #( if(rst) Res5 <= '{ default: 0 }; else if(en) begin Res5[3] <= up4 - hi4[2]; - Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; - Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; - Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + 
$signed({ 1'b0, lo4[0] }); + Res5[2] <= $signed({ hi4[2], {(SLICING.LO_WIDTH[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(SLICING.LO_WIDTH[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(SLICING.LO_WIDTH[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); end end diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 6498530113..35325abdf9 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -55,6 +55,7 @@ module mvu_vvu_axi #( int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, + bit NARROW_WEIGHTS = 0, bit SIGNED_ACTIVATIONS = 0, bit PUMPED_COMPUTE = 0, @@ -306,8 +307,22 @@ module mvu_vvu_axi #( .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + "mvu_4sx4u_dsp48e1": + mvu_4sx4u #( + .PE(PE), .SIMD(DSP_SIMD), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u_dsp48e2": + mvu_4sx4u #( + .PE(PE), .SIMD(DSP_SIMD), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( .clk(dsp_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index d3532bcfea..f16c40db34 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -70,7 +70,7 @@ module mvu_axi_tb(); uwire ap_clk = clk; - // Generate activations + // 
Generate shared Activations typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; typedef activation_t activation_vector_t[SF]; @@ -82,158 +82,174 @@ module mvu_axi_tb(); activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = 'X; - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); + // Run parallel instances across DSP versions and NARROW_WEIGHTS + bit [2:1][1:0] done = { 2: 2'b00, 1: 2'b01 }; // [ver][narrow] + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; end - - activations.vld <= 0; - activations.dat <= 'x; end - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; + for(genvar ver = 1; ver <= 2; ver++) begin : genVersion + for(genvar narrow = (ver == 1); narrow <= 1; narrow++) begin : genNarrowWide + + // Activations Feed + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for(int unsigned i = 0; i < SF; i++) begin + while($urandom()%7 == 0) @(posedge clk); + activations.dat <= ACTIVATIONS[i]; + activations.vld <= 1; + @(posedge clk iff activations.rdy); + activations.dat <= 'x; + activations.vld <= 0; + end + end - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - for(int unsigned nf = 0; nf < NF; nf++) begin - for(int unsigned sf = 0; sf < SF; sf++) begin - for(int unsigned pe = 0; pe < PE; pe++) begin - for(int unsigned simd = 0; simd < SIMD; simd++) begin - if(res[nf][sf][pe][simd] == (1 << (WEIGHT_WIDTH-1))) begin - res[nf][sf][pe][simd]++; + // Instance-specific Weights (may be narrow) + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef 
weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + if(narrow) begin // increment all weights of -8 + for(int unsigned nf = 0; nf < NF; nf++) begin + for(int unsigned sf = 0; sf < SF; sf++) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + for(int unsigned simd = 0; simd < SIMD; simd++) begin + if(res[nf][sf][pe][simd] == (1 << (WEIGHT_WIDTH-1))) begin + res[nf][sf][pe][simd]++; + end + end end end end end - end - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = 'X; - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - for (int i = 0; i < NF; i++) begin - for (int j = 0; j < SF; j++) begin - for (int k = 0; k < PE; k++) begin - for (int l = 0; l < SIMD; l++) begin - if (SIGNED_ACTIVATIONS) - res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); - else - res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + // Function to compute golden output + // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] + // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] + typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; + typedef output_t output_vector_t [NF]; + + struct { + output_t dat; + logic vld; + logic rdy; + } outputs; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + // 
The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + end end end end - end - return res; - endfunction : check_output; - - output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); - - int unsigned NF_CNT = 0; - initial begin - outputs.rdy = 0; - while (NF_CNT < NF) begin - // Loop until both rdy & vld are asserted - do begin - outputs.rdy <= $urandom()%7 >= 0; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + initial begin + outputs.rdy = 0; + @(posedge clk iff ap_rst_n); + + for(int unsigned nf = 0; nf < NF; nf++) begin + while($urandom()%13 == 0) @(posedge clk); + outputs.rdy <= 1; + @(posedge clk iff outputs.vld); + outputs.rdy <= 0; + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[nf][i])) begin + $display(">>> [t=%0t] Test succeeded (nf=%0d)! Computed / GOLDEN = %0d / %0d", $time, nf, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[nf][i])); + end + else begin + $error(">>> [t=%0t] TEST failed (nf=%0d)! Computed / GOLDEN = %0d / %0d", $time, nf, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[nf][i])); + $stop; + end end end - NF_CNT += 1; + done[ver][narrow] = 1; end - $finish; - end - - // Instantiate DUT - mvu_vvu_axi #( - .IS_MVU(IS_MVU), - .COMPUTE_CORE(COMPUTE_CORE), - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .M_REG_LUT(M_REG_LUT) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(ver == 1? 
"mvu_4sx4u_dsp48e1" : "mvu_4sx4u_dsp48e2"), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .NARROW_WEIGHTS(narrow), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + + end : genNarrowWide + end : genVersion endmodule : mvu_axi_tb From dbf8ed730a4a8483f2b576eff76b0630caefa18a Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 27 May 2024 16:28:18 +0100 Subject: [PATCH 08/30] [RTL MVU] Update code generation to take dsp variant into account --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 3 +- .../rtl/matrixvectoractivation_rtl.py | 38 +++--- .../rtl/vectorvectoractivation_rtl.py | 3 +- .../fpgadataflow/specialize_layers.py | 122 +++++++++--------- src/finn/util/basic.py | 17 +++ src/finn/util/fpgadataflow.py | 8 -- 6 files changed, 102 insertions(+), 89 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 50c15c1b02..4edf676008 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -42,6 +42,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter NARROW_WEIGHTS = $NARROW_WEIGHTS$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, @@ -77,7 
+78,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( mvu_vvu_axi #( .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) inst ( .ap_clk(ap_clk), diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index d48b3a918d..a6a8e72bdf 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -32,7 +32,7 @@ from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_dsp_block, get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy try: @@ -55,10 +55,7 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # Flag to indicate if Versal device is targeted - "is_versal": ("i", False, 0, {0, 1}), - } + my_attrs = {} my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -141,10 +138,11 @@ def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") - if self.get_nodeattr("is_versal"): - mult_dsp = P * np.ceil(Q / 3) - else: - mult_dsp = np.ceil(P / 4) * Q + # TODO: get dsp block type + # if dsp_block = "DSP58": + # mult_dsp = P * np.ceil(Q / 3) + # else: + mult_dsp = np.ceil(P / 4) * Q return int(mult_dsp) def 
instantiate_ip(self, cmd): @@ -186,7 +184,7 @@ def _resolve_segment_len(self, clk): dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len return dsp_chain_len - def _resolve_impl_style(self, fpgapart): + def _resolve_impl_style(self, dsp_block): # Based on target device and activation/weight-width, choose the # supported RTL compute core assert ( @@ -198,15 +196,15 @@ def _resolve_impl_style(self, fpgapart): act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal_family = self.get_nodeattr("is_versal") - if is_versal_family: + if dsp_block == "DSP58": return "mvu_vvu_8sx9_dsp58" else: - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - if (act_width == 4 and weight_width == 4) and not (is_versal_family): - return "mvu_4sx4u" + if act_width <= 4 and weight_width <= 4: + if dsp_block == "DSP48E1": + return "mvu_4sx4u_dsp48e1" + elif dsp_block == "DSP48E2": + return "mvu_4sx4u_dsp48e2" else: return "mvu_8sx8u_dsp48" @@ -216,6 +214,11 @@ def generate_hdl(self, model, fpgapart, clk): self.generate_params(model, code_gen_dir) template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # determine if weights are narrow range and add parameter to code gen dict + weights = model.get_initializer(self.onnx_node.input[1]) + wdt = self.get_weight_datatype() + narrow_weights = 0 if np.min(weights) == wdt.min() else 1 + code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights) # add general parameters to dictionary code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed @@ -248,9 +251,10 @@ def generate_hdl(self, model, fpgapart, clk): def prepare_codegen_default(self, fpgapart, clk): template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + dsp_block = get_dsp_block(fpgapart) 
code_gen_dict = {} code_gen_dict["$IS_MVU$"] = [str(1)] - code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(dsp_block)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index 27fc9f10a1..2d4240a7f3 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -33,9 +33,8 @@ from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_rtlsim_trace_depth, is_versal, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from finn.util.fpgadataflow import is_versal try: from pyverilator import PyVerilator diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index e71d6c23a4..9e660717f3 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -26,18 +26,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import numpy as np import warnings from onnx import helper -from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants -from finn.util.fpgadataflow import is_versal +from finn.util.basic import get_dsp_block, is_versal -def _determine_impl_style(node, fpgapart): +def _determine_impl_style(node, fpgapart, model): optype = node.op_type # check if there is an HLS or RTL variant or both @@ -45,8 +45,8 @@ def _determine_impl_style(node, fpgapart): rtl_variant = optype + "_rtl" in rtl_variants.keys() # check if user has specified a preferred_impl_style - inst = getCustomOp(node) - impl_style = inst.get_nodeattr("preferred_impl_style") + node_inst = getCustomOp(node) + impl_style = node_inst.get_nodeattr("preferred_impl_style") # if impl_style not set, for "simple" layers always try # to use rtl variant if available @@ -55,23 +55,19 @@ def _determine_impl_style(node, fpgapart): return _dwc_determine_impl_style(node) if rtl_variant: if optype == "MVAU": - inp_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 - ) - weight_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 - ) - if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node): + idt = node_inst.get_input_datatype() + wdt = node_inst.get_weight_datatype() + inp_width_fit = idt.bitwidth() >= 4 + weight_width_fit = wdt.bitwidth() >= 4 + if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node, fpgapart, model): return "rtl" else: return "hls" elif optype == "VVAU": - inp_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 - ) - weight_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 - ) + idt = node_inst.get_input_datatype() 
+ wdt = node_inst.get_weight_datatype() + inp_width_fit = idt.bitwidth() >= 4 + weight_width_fit = wdt.bitwidth() >= 4 if inp_width_fit and weight_width_fit and _vvu_rtl_possible(node, fpgapart): return "rtl" else: @@ -136,7 +132,7 @@ def _determine_impl_style(node, fpgapart): # user setting can be fulfilled return "rtl" elif optype == "MVAU": - if _mvu_rtl_possible(node): + if _mvu_rtl_possible(node, fpgapart, model): return "rtl" else: warn_str = """There is no RTL variant for %s. The node will automatically be @@ -232,31 +228,43 @@ def _swg_hls_possible(node): return False -def _mvu_rtl_possible(n): +def _mvu_rtl_possible(n, fpgapart, model): # Checks whether RTL-based MVU is supported # Currently, for DSP48 we only support computations up to # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) - # and for DSP58 we support up to 8sx9s. Next to that, - # embedded thresholding functionality is not supported and - # neither binaryxnormode computation. - inp_width_in_range = ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 - ) or ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 - and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 - ) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0 + # and for DSP58 we support up to 8sx9s. + # Please note, DSP48E1 does only support narrow range for weights + # Next to that, embedded thresholding functionality is not supported + # and neither binaryxnormode computation. 
+ node_inst = getCustomOp(n) + # first check if no Activation or binary xnor mode and return False + # immediately if one of them is True + no_activation = node_inst.get_nodeattr("noActivation") == 0 + not_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1 + if no_activation or not_binaryxnor_mode: + return False - return ( - inp_width_in_range - and weight_width_in_range - and signed_weights - and no_activation - and not_binaryxnor_mode - ) + # check if weights are signed, if not return False + wdt = node_inst.get_weight_datatype() + if not wdt.signed(): + return False + + # check which dsp block is available on fpga + dsp_block = get_dsp_block(fpgapart) + # check if weights are narrow + weights = model.get_initializer(n.input[1]) + narrow_weights = False if np.min(weights) == wdt.min() else True + # if non narrow weights and only DSP48E1 available return False + if not narrow_weights and dsp_block == "DSP48E1": + return False + + # if none of the above constraints have been triggered + # we now check if input and weight data types are in range + idt = node_inst.get_input_datatype() + inp_width_in_range = (idt.bitwidth() <= 8) or (idt.bitwidth() == 9 and idt.signed()) + weight_width_in_range = wdt.bitwidth() <= 8 + + return inp_width_in_range and weight_width_in_range def _vvu_rtl_possible(n, fpgapart): @@ -264,24 +272,19 @@ def _vvu_rtl_possible(n, fpgapart): # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs # (8-bit signed weights x (9-bit signed OR 8-bit (un)signed) activations). # Next to that, embedded thresholding functionality is not supported. 
- in_width_in_range = ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 - ) or ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 - and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 - ) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 - is_versal_family = is_versal(fpgapart) - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + node_inst = getCustomOp(n) + if not node_inst.get_nodeattr("noActivation"): + return False + if not is_versal(fpgapart): + return False + + idt = node_inst.get_input_datatype() + wdt = node_inst.get_weight_datatype() + in_width_in_range = (idt.bitwidth() <= 8) or (idt.bitwidth() == 9 and idt.min() < 0) + weight_width_in_range = wdt.bitwidth() <= 8 + signed_weights = wdt.min() < 0 - return ( - in_width_in_range - and weight_width_in_range - and signed_weights - and is_versal_family - and no_activation - ) + return in_width_in_range and weight_width_in_range and signed_weights class SpecializeLayers(Transformation): @@ -300,7 +303,7 @@ def apply(self, model): if not node.domain == "finn.custom_op.fpgadataflow": continue node_ind += 1 - impl_style = _determine_impl_style(node, self.fpgapart) + impl_style = _determine_impl_style(node, self.fpgapart, model) optype = node.op_type + "_" + impl_style new_node = helper.make_node( @@ -313,9 +316,6 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) - if new_node.op_type == "MVAU_rtl": - is_versal_family = is_versal(self.fpgapart) - getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 1995d9f06a..91c191962f 100644 --- a/src/finn/util/basic.py 
+++ b/src/finn/util/basic.py @@ -288,3 +288,20 @@ def memutil(req_mem_spec, primitive_spec): eff = (req_width * req_depth) / (count * prim_width * prim_depth) waste = (count * prim_width * prim_depth) - (req_width * req_depth) return (count, eff, waste) + + +def is_versal(fpgapart): + """Returns whether board is part of the Versal family""" + return ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + + +def get_dsp_block(fpgapart): + if is_versal(fpgapart): + return "DSP58" + elif fpgapart[2] == "7": + return "DSP48E1" + else: + return "DSP48E2" diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 3d3d343cd4..aae438fac2 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -69,11 +69,3 @@ def is_rtl_node(node): is_node = True return is_node - - -def is_versal(fpgapart): - """Returns whether board is part of the Versal family""" - return ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) From c4ce3e5fa238baf3881779d928358890039c2260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 27 May 2024 17:00:12 +0100 Subject: [PATCH 09/30] Defer use of struct in slicing derivation to accommodate Verilator limitations. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 2f2e1c0d23..aa76a230da 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -73,7 +73,11 @@ module mvu_4sx4u #( int unsigned HI_WIDTH_MAX; // exluding leftmost lane } slicing_t; function slicing_t sliceLanes(); - automatic slicing_t slicing; + automatic int unsigned offset[4:0]; + automatic int unsigned lo_width[3:0]; + automatic int unsigned hi_width[2:0]; + automatic int unsigned lw_max; // exluding leftmost lane + automatic int unsigned hw_max; // exluding leftmost lane // Determine Lane Offsets unique case(VERSION) @@ -82,10 +86,10 @@ module mvu_4sx4u #( $error("%m: Need NARROW_WEIGHTS for DSP48E1."); $finish; end - slicing.OFFSET = '{ ACCU_WIDTH+21, 21, 14, 7, 0 }; + offset = '{ ACCU_WIDTH+21, 21, 14, 7, 0 }; end 2: begin - slicing.OFFSET = NARROW_WEIGHTS? + offset = NARROW_WEIGHTS? 
'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; end @@ -93,19 +97,26 @@ module mvu_4sx4u #( // Derive other Lane Attributes for(int unsigned i = 0; i < 4; i++) begin - automatic int unsigned lw = slicing.OFFSET[i+1] - slicing.OFFSET[i]; - slicing.LO_WIDTH[i] = lw; + automatic int unsigned lw = offset[i+1] - offset[i]; + lo_width[i] = lw; if(i < 3) begin automatic int unsigned hw = 1 + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD); - slicing.HI_WIDTH[i] = hw; + hi_width[i] = hw; - if(lw > slicing.LO_WIDTH_MAX) slicing.LO_WIDTH_MAX = lw; - if(hw > slicing.HI_WIDTH_MAX) slicing.HI_WIDTH_MAX = hw; + if(lw > lw_max) lw_max = lw; + if(hw > hw_max) hw_max = hw; end end - return slicing; + return slicing_t'{ + OFFSET: offset, + LO_WIDTH: lo_width, + HI_WIDTH: hi_width, + LO_WIDTH_MAX: lw_max, + HI_WIDTH_MAX: hw_max + }; + endfunction : sliceLanes localparam slicing_t SLICING = sliceLanes(); localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath @@ -172,8 +183,8 @@ module mvu_4sx4u #( for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin automatic int unsigned ofs = SLICING.OFFSET[pe + PE_REM]; dd[ofs+:3] = ww[pe]; - assert(!NARROW_WEIGHTS || (ww[pe] != -8)) else begin - $warning("Weight of -8 violates NARROW_WEIGHTS commitment."); + assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin + $warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment."); end // The sign of the weights are generally put on the subtracted A port. From 3f87a9d94d058a21732bff7e9f61f3154072eee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 28 May 2024 06:19:46 +0100 Subject: [PATCH 10/30] Decompose computed struct of geometric configuration to accommodate Verilator limitations. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 109 +++++++++++++++++------------------ 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index aa76a230da..0f8f643206 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -65,60 +65,59 @@ module mvu_4sx4u #( //----------------------------------------------------------------------- // Determine Lane Configuration - typedef struct { - int unsigned OFFSET[4:0]; - int unsigned LO_WIDTH[3:0]; - int unsigned HI_WIDTH[2:0]; - int unsigned LO_WIDTH_MAX; // exluding leftmost lane - int unsigned HI_WIDTH_MAX; // exluding leftmost lane - } slicing_t; - function slicing_t sliceLanes(); - automatic int unsigned offset[4:0]; - automatic int unsigned lo_width[3:0]; - automatic int unsigned hi_width[2:0]; - automatic int unsigned lw_max; // exluding leftmost lane - automatic int unsigned hw_max; // exluding leftmost lane - - // Determine Lane Offsets + initial begin + if(!NARROW_WEIGHTS && (VERSION == 1)) begin + $error("%m: Need NARROW_WEIGHTS for DSP48E1."); + $finish; + end + end + + typedef int unsigned lane_offset_v[4:0]; + typedef int unsigned lo_width_v[3:-1]; // Index -1: maximum across all but leftmost lane + typedef int unsigned hi_width_v[2:-1]; + + function lane_offset_v sliceLanes(); unique case(VERSION) 1: begin - if(!NARROW_WEIGHTS) begin - $error("%m: Need NARROW_WEIGHTS for DSP48E1."); - $finish; - end - offset = '{ ACCU_WIDTH+21, 21, 14, 7, 0 }; + return NARROW_WEIGHTS? + lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } : + lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported end 2: begin - offset = NARROW_WEIGHTS? - '{ ACCU_WIDTH+23, 23, 16, 8, 0 } : - '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; + return NARROW_WEIGHTS? 
+ lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : + lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 }; end endcase + endfunction : sliceLanes + localparam lane_offset_v OFFSETS = sliceLanes(); - // Derive other Lane Attributes + function lo_width_v calcLoWidths(); + automatic lo_width_v lo_width; + automatic int unsigned lw_max = 0; for(int unsigned i = 0; i < 4; i++) begin - automatic int unsigned lw = offset[i+1] - offset[i]; + automatic int unsigned lw = OFFSETS[i+1] - OFFSETS[i]; lo_width[i] = lw; - - if(i < 3) begin - automatic int unsigned hw = 1 + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD); - hi_width[i] = hw; - - if(lw > lw_max) lw_max = lw; - if(hw > hw_max) hw_max = hw; - end + if((i < 3) && (lw > lw_max)) lw_max = lw; end + lo_width[-1] = lw_max; + return lo_width; + endfunction : calcLoWidths + localparam lo_width_v LO_WIDTHS = calcLoWidths(); + + function hi_width_v calcHiWidths(); + automatic hi_width_v hi_width; + automatic int unsigned hw_max = 0; + for(int unsigned i = 0; i < 3; i++) begin + automatic int unsigned hw = 1 + $clog2(2**(ACCU_WIDTH-LO_WIDTHS[i]-1)+SIMD); + hi_width[i] = hw; + if(hw > hw_max) hw_max = hw; + end + hi_width[-1] = hw_max; + return hi_width; + endfunction : calcHiWidths + localparam hi_width_v HI_WIDTHS = calcHiWidths(); - return slicing_t'{ - OFFSET: offset, - LO_WIDTH: lo_width, - HI_WIDTH: hi_width, - LO_WIDTH_MAX: lw_max, - HI_WIDTH_MAX: hw_max - }; - - endfunction : sliceLanes - localparam slicing_t SLICING = sliceLanes(); localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath // Compute the count of decendents for all nodes in the reduction trees. @@ -159,7 +158,7 @@ module mvu_4sx4u #( uwire signed [3:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; - if(pe) begin + if(pe > 0) begin if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 
0 : ww[pe] * a[s]; `ifndef VERILATOR else begin @@ -181,7 +180,7 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - automatic int unsigned ofs = SLICING.OFFSET[pe + PE_REM]; + automatic int unsigned ofs = OFFSETS[pe + PE_REM]; dd[ofs+:3] = ww[pe]; assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin $warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment."); @@ -499,14 +498,14 @@ module mvu_4sx4u #( X1 <= xx; X2 <= X1; foreach(X3[i]) begin - X3[i] <= X2[i] + (L[3]? 2'h0 : pp[SLICING.OFFSET[i]+:2]); + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[OFFSETS[i]+:2]); end end end // Derive actual cross-lane overflows for(genvar i = 0; i < 3; i++) begin - assign h3[s][i] = pp[SLICING.OFFSET[i+1]+:2] - X3[i+1]; + assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1]; end assign p3[s] = pp; @@ -518,15 +517,15 @@ module mvu_4sx4u #( localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [ SLICING.HI_WIDTH_MAX-1:0] hi4[3]; - uwire [$clog2(SIMD)+SLICING.LO_WIDTH_MAX-1:0] lo4[3]; + uwire signed [ HI_WIDTHS[-1]-1:0] hi4[3]; + uwire [$clog2(SIMD)+LO_WIDTHS[-1]-1:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin // Conclusive high part accumulation if(i < 3) begin : genHi if(i < PE_REM) assign hi4[i] = '0; else begin - localparam int unsigned HI_WIDTH = SLICING.HI_WIDTH[i]; + localparam int unsigned HI_WIDTH = HI_WIDTHS[i]; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; @@ -558,12 +557,12 @@ module mvu_4sx4u #( // Conclusive low part accumulation (all unsigned arithmetic) if(i < PE_REM) assign lo4[i] = '0; else begin : genLo - localparam int unsigned LO_WIDTH = SLICING.LO_WIDTH[i]; + localparam int unsigned LO_WIDTH = LO_WIDTHS[i]; // Adder Tree across all SIMD low 
contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][SLICING.OFFSET[i]+:LO_WIDTH]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); @@ -589,9 +588,9 @@ module mvu_4sx4u #( if(rst) Res5 <= '{ default: 0 }; else if(en) begin Res5[3] <= up4 - hi4[2]; - Res5[2] <= $signed({ hi4[2], {(SLICING.LO_WIDTH[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; - Res5[1] <= $signed({ hi4[1], {(SLICING.LO_WIDTH[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; - Res5[0] <= $signed({ hi4[0], {(SLICING.LO_WIDTH[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + Res5[2] <= $signed({ hi4[2], {(LO_WIDTHS[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(LO_WIDTHS[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(LO_WIDTHS[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); end end From b0852c880c19b33f5b461fbb6f2f29bbcf6e00ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 28 May 2024 11:07:11 +0100 Subject: [PATCH 11/30] Even more simplification. --- finn-rtllib/mvu/mvu_4sx4u.sv | 61 +++++++++++++++--------------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 0f8f643206..889fba63a9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -72,10 +72,18 @@ module mvu_4sx4u #( end end + /** + * Lane Slicing + * Assumptions: + * - Internal lane widths differ, at most, by a single bit. + * - The rightmost lane (#0) has the maximum internal width. 
+ * - The leftmost lane (#3) extends into the wide DSP accumulation path and + * is constrained by ACCU_WIDTH rather than the next lane. It doesn't have + * an external high extension. + * - The one but leftmost lane (#2) has the minimum internal width and, hence, + * the macimum external high extension. + */ typedef int unsigned lane_offset_v[4:0]; - typedef int unsigned lo_width_v[3:-1]; // Index -1: maximum across all but leftmost lane - typedef int unsigned hi_width_v[2:-1]; - function lane_offset_v sliceLanes(); unique case(VERSION) 1: begin @@ -92,31 +100,14 @@ module mvu_4sx4u #( endfunction : sliceLanes localparam lane_offset_v OFFSETS = sliceLanes(); - function lo_width_v calcLoWidths(); - automatic lo_width_v lo_width; - automatic int unsigned lw_max = 0; - for(int unsigned i = 0; i < 4; i++) begin - automatic int unsigned lw = OFFSETS[i+1] - OFFSETS[i]; - lo_width[i] = lw; - if((i < 3) && (lw > lw_max)) lw_max = lw; - end - lo_width[-1] = lw_max; - return lo_width; - endfunction : calcLoWidths - localparam lo_width_v LO_WIDTHS = calcLoWidths(); - - function hi_width_v calcHiWidths(); - automatic hi_width_v hi_width; - automatic int unsigned hw_max = 0; - for(int unsigned i = 0; i < 3; i++) begin - automatic int unsigned hw = 1 + $clog2(2**(ACCU_WIDTH-LO_WIDTHS[i]-1)+SIMD); - hi_width[i] = hw; - if(hw > hw_max) hw_max = hw; - end - hi_width[-1] = hw_max; - return hi_width; - endfunction : calcHiWidths - localparam hi_width_v HI_WIDTHS = calcHiWidths(); + function int unsigned lo_width(input int unsigned i); + return OFFSETS[i+1] - OFFSETS[i]; + endfunction : lo_width + function int unsigned hi_width(input int unsigned i); + return 1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD); + endfunction : hi_width + localparam int unsigned LO_WIDTH_MAX = lo_width(0); + localparam int unsigned HI_WIDTH_MAX = hi_width(2); localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath @@ -517,15 +508,15 @@ module mvu_4sx4u #( localparam leave_load_t LEAVE_LOAD 
= SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [ HI_WIDTHS[-1]-1:0] hi4[3]; - uwire [$clog2(SIMD)+LO_WIDTHS[-1]-1:0] lo4[3]; + uwire signed [ HI_WIDTH_MAX-1:0] hi4[3]; + uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin // Conclusive high part accumulation if(i < 3) begin : genHi if(i < PE_REM) assign hi4[i] = '0; else begin - localparam int unsigned HI_WIDTH = HI_WIDTHS[i]; + localparam int unsigned HI_WIDTH = hi_width(i); // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; @@ -557,7 +548,7 @@ module mvu_4sx4u #( // Conclusive low part accumulation (all unsigned arithmetic) if(i < PE_REM) assign lo4[i] = '0; else begin : genLo - localparam int unsigned LO_WIDTH = LO_WIDTHS[i]; + localparam int unsigned LO_WIDTH = lo_width(i); // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); @@ -588,9 +579,9 @@ module mvu_4sx4u #( if(rst) Res5 <= '{ default: 0 }; else if(en) begin Res5[3] <= up4 - hi4[2]; - Res5[2] <= $signed({ hi4[2], {(LO_WIDTHS[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; - Res5[1] <= $signed({ hi4[1], {(LO_WIDTHS[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; - Res5[0] <= $signed({ hi4[0], {(LO_WIDTHS[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] }); end end From b36c5b190a0a01503b9f521a320c4a5820ac2b78 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 28 May 2024 13:48:25 +0100 Subject: [PATCH 12/30] [RTL MVU] Setting lo width max explicitly and updating tests --- 
finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- tests/fpgadataflow/test_fpgadataflow_mvau.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 889fba63a9..ccb25380c8 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -106,7 +106,7 @@ module mvu_4sx4u #( function int unsigned hi_width(input int unsigned i); return 1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD); endfunction : hi_width - localparam int unsigned LO_WIDTH_MAX = lo_width(0); + localparam int unsigned LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0]; localparam int unsigned HI_WIDTH_MAX = hi_width(2); localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 2a22f3fc41..4eb0b22d46 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -635,17 +635,19 @@ def test_mvau_fifocharacterize_rtlsim( @pytest.mark.parametrize("mh", [18]) @pytest.mark.parametrize("mw", [128]) -@pytest.mark.parametrize("pe", [1, 6, 9, 18]) -@pytest.mark.parametrize("simd", [1, 4, 16, 64, 128]) +@pytest.mark.parametrize("pe", [1, 9, 18]) +@pytest.mark.parametrize("simd", [1, 64, 128]) @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) @pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) -@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize( + "part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e", "xc7z020clg400-1"] +) @pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): - if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + if part != "xcvc1902-vsva2197-2MP-e-S" and clk_ns != 1.66: pytest.skip( """Skip test for varying clk 
for devices other than Versal, since this variable only affects DSP58s""" @@ -657,6 +659,9 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) + # if 7 series, force weights to narrow range + if part == "xc7z020clg400-1": + W = np.clip(W, wdt.min() + 1, wdt.max()) model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) From 4012378c899bb7cac5eeb8bc6c058c1c89f5ee57 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 May 2024 12:29:00 +0100 Subject: [PATCH 13/30] [Transform] Make fpga part required argument for SpecializeLayers --- notebooks/advanced/3_folding.ipynb | 2 +- .../bnn-pynq/cnv_end2end_example.ipynb | 13 ++-- .../bnn-pynq/tfc_end2end_example.ipynb | 59 ++++++++++--------- .../bnn-pynq/tfc_end2end_verification.ipynb | 2 +- src/finn/builder/build_dataflow_steps.py | 6 +- .../fpgadataflow/make_zynq_proj.py | 4 +- .../fpgadataflow/specialize_layers.py | 2 +- .../fpgadataflow/vitis_build.py | 4 +- tests/end2end/test_end2end_bnn_pynq.py | 5 +- tests/end2end/test_end2end_mobilenet_v1.py | 2 +- .../test_convert_to_hw_1d_conv_layer.py | 4 +- .../test_convert_to_hw_channelwise_layer.py | 2 +- .../test_convert_to_hw_conv_fc_transition.py | 2 +- .../test_convert_to_hw_conv_layer.py | 4 +- .../test_convert_to_hw_layers_cnv.py | 2 +- .../test_convert_to_hw_layers_fc.py | 4 +- .../test_convert_to_hw_layers_synthetic.py | 2 +- .../test_convert_to_hw_pool_batch.py | 2 +- .../test_depthwise_convolution.py | 4 +- .../test_fpgadataflow_addstreams.py | 2 +- .../test_fpgadataflow_channelwise_ops.py | 2 +- .../test_fpgadataflow_checksum.py | 2 +- .../fpgadataflow/test_fpgadataflow_concat.py | 6 +- .../test_fpgadataflow_convinputgenerator.py | 2 
+- ...dataflow_convinputgenerator_rtl_dynamic.py | 8 +-- .../fpgadataflow/test_fpgadataflow_deconv.py | 2 +- .../test_fpgadataflow_downsampler.py | 2 +- .../test_fpgadataflow_duplicatestreams.py | 2 +- tests/fpgadataflow/test_fpgadataflow_dwc.py | 6 +- .../fpgadataflow/test_fpgadataflow_eltwise.py | 2 +- tests/fpgadataflow/test_fpgadataflow_fifo.py | 2 +- .../test_fpgadataflow_fmpadding.py | 2 +- .../test_fpgadataflow_globalaccpool.py | 2 +- .../test_fpgadataflow_labelselect.py | 2 +- .../fpgadataflow/test_fpgadataflow_lookup.py | 4 +- .../test_fpgadataflow_res_estimate.py | 4 +- .../test_fpgadataflow_streamingmaxpool.py | 2 +- .../test_fpgadataflow_thresholding.py | 2 +- .../test_fpgadataflow_thresholding_runtime.py | 8 +-- .../test_fpgadataflow_upsampler.py | 2 +- tests/fpgadataflow/test_runtime_weights.py | 4 +- 41 files changed, 103 insertions(+), 92 deletions(-) diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb index 8c7b97d6c6..fc9f0080ec 100644 --- a/notebooks/advanced/3_folding.ipynb +++ b/notebooks/advanced/3_folding.ipynb @@ -567,7 +567,7 @@ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "\n", "model_updated = model_updated.transform(InsertDWC())\n", - "model_updated = model_updated.transform(SpecializeLayers())\n", + "model_updated = model_updated.transform(SpecializeLayers(\"xc7z020clg400-1\"))\n", "model_updated = model_updated.transform(GiveUniqueNodeNames())" ] }, diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 3141d54ddf..8b8cff8ee9 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -282,6 +282,12 @@ "metadata": {}, "outputs": [], "source": [ + "from finn.util.basic import pynq_part_map\n", + "# change this if you have a different PYNQ board, see list above\n", + "pynq_board = \"Pynq-Z1\"\n", 
+ "fpga_part = pynq_part_map[pynq_board]\n", + "target_clk_ns = 10\n", + "\n", "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", @@ -314,7 +320,7 @@ "# save the dataflow partition with a different name for easier access\n", "# and specialize the layers to HLS variants\n", "dataflow_model = ModelWrapper(dataflow_model_filename)\n", - "dataflow_model = dataflow_model.transform(SpecializeLayers())\n", + "dataflow_model = dataflow_model.transform(SpecializeLayers(fpga_part))\n", "dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] }, @@ -432,12 +438,9 @@ "metadata": {}, "outputs": [], "source": [ - "test_pynq_board = \"Pynq-Z1\"\n", - "target_clk_ns = 10\n", - "\n", "from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild\n", "model = ModelWrapper(build_dir+\"/end2end_cnv_w1a1_folded.onnx\")\n", - "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))" + "model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index bbaa74dbff..675ba23d2d 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -547,6 +547,36 @@ "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print the names of the supported PYNQ boards\n", + "from finn.util.basic import pynq_part_map\n", + "print(pynq_part_map.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# change this if you have a different PYNQ board, see list above\n", + "pynq_board = \"Pynq-Z1\"\n", + "fpga_part = pynq_part_map[pynq_board]\n", + "target_clk_ns = 10" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -561,7 +591,7 @@ "outputs": [], "source": [ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", - "model = model.transform(SpecializeLayers())\n", + "model = model.transform(SpecializeLayers(fpga_part))\n", "\n", "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")" @@ -687,32 +717,7 @@ "source": [ "## 3. Hardware Build \n", "\n", - "We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate into an appropriate shell and build a bitfile. These are `ZynqBuild` and `VitisBuild` for Zynq and Alveo, respectively. In this notebook we'll demonstrate the `ZynqBuild` as these boards are more common and it's much faster to complete bitfile generation for the smaller FPGAs found on them.\n", - "\n", - "As we will be dealing with FPGA synthesis tools in these tasks, we'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print the names of the supported PYNQ boards\n", - "from finn.util.basic import pynq_part_map\n", - "print(pynq_part_map.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# change this if you have a different PYNQ board, see list above\n", - "pynq_board = \"Pynq-Z1\"\n", - "fpga_part = pynq_part_map[pynq_board]\n", - "target_clk_ns = 10" + "We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate into an appropriate shell and build a bitfile. These are `ZynqBuild` and `VitisBuild` for Zynq and Alveo, respectively. In this notebook we'll demonstrate the `ZynqBuild` as these boards are more common and it's much faster to complete bitfile generation for the smaller FPGAs found on them." 
] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index a07a8d2254..aacd12ef05 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -396,7 +396,7 @@ "child_model = child_model.transform(InsertDWC()) \n", "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n", - "child_model = child_model.transform(SpecializeLayers())\n", + "child_model = child_model.transform(SpecializeLayers(test_fpga_part))\n", "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a842a3ce4e..44d54f8aa2 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -541,7 +541,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform( PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) @@ -559,7 +559,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": @@ -591,7 +591,7 @@ def 
step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # need to make sure all FIFOs are created so that their depth can be # set by ApplyConfig, so create_shallow_fifos=True model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index fc2047b08e..63ce2d3cbf 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -322,7 +322,7 @@ def apply(self, model): prep_transforms = [ InsertIODMA(self.axi_port_width), InsertDWC(), - SpecializeLayers(), + SpecializeLayers(self.fpga_part), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] @@ -338,7 +338,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) - kernel_model = kernel_model.transform(SpecializeLayers()) + kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 9e660717f3..dbcadd1df5 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -290,7 +290,7 @@ def _vvu_rtl_possible(n, fpgapart): class SpecializeLayers(Transformation): """Specialize all layers to 
either HLS or RTL variants""" - def __init__(self, fpgapart=""): + def __init__(self, fpgapart): super().__init__() self.fpgapart = fpgapart diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index da7624b8ff..157d81cf35 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -383,7 +383,7 @@ def __init__( def apply(self, model): _check_vitis_envvars() # prepare at global level, then break up into kernels - prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers()] + prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers(self.fpga_part)] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -405,7 +405,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) - kernel_model = kernel_model.transform(SpecializeLayers()) + kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 94134967fa..7fb0f5ff1d 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -596,6 +596,7 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board): assert len(model.get_nodes_by_op_type(op_type)) == exp_count def test_specialize_layers(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) # set preferred impl style to hls for all layers @@ 
-605,7 +606,7 @@ def test_specialize_layers(self, topology, wbits, abits, board): if is_fpgadataflow_node(node): inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(build_data["part"])) model = model.transform(GiveUniqueNodeNames()) model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) exp_layer_counts = { @@ -739,7 +740,7 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 4645689206..01d995c147 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -246,7 +246,7 @@ def test_end2end_mobilenet_convert_to_hw_layers(): @pytest.mark.end2end def test_end2end_mobilenet_specialize_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") - model = model.transform(SpecializeLayers(fpgapart=fpga_part)) + model = model.transform(SpecializeLayers(fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx") diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index c5d0281203..6d3929109f 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -143,10 +143,10 @@ def 
test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: new_model = new_model.transform(to_hw.InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set folding parameters for MVAU if new_model.get_nodes_by_op_type("MVAU_hls"): fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] diff --git a/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py index 4b063f8505..ac02008ff2 100644 --- a/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py @@ -121,7 +121,7 @@ def test_convert_to_hw_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, e assert (y_produced == y_expected).all() assert model.graph.node[1].op_type == "ChannelwiseOp" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index f7b3c55c2a..f9b5dff56c 100755 --- a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -204,7 +204,7 @@ def test_convert_to_hw_conv_fc_transition(conv_config, depthwise, use_reshape): if is_fpgadataflow_node(node): inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", "hls") - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) 
new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 61f8af7806..122997e412 100644 --- a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -131,10 +131,10 @@ def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: new_model = new_model.transform(to_hw.InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set folding parameters for MVAU if new_model.get_nodes_by_op_type("MVAU_hls"): fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 71f383ca23..4b8668c7b3 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -111,7 +111,7 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): if is_fpgadataflow_node(node): inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) for node in model.graph.node: if node.op_type == "MVAU_hls": inst = getCustomOp(node) diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py index 746ded9074..94fafae6b7 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py +++ 
b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -82,7 +82,7 @@ def test_convert_to_hw_layers_tfc_w1a1(): model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) model = model.transform(RoundAndClipThresholds()) model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) fc0 = model.graph.node[2] assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] @@ -154,7 +154,7 @@ def test_convert_to_hw_layers_tfc_w1a2(): model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) fc0 = model.graph.node[2] assert fc0.op_type.startswith("MVAU") diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index 6c83f10617..6a22f39cdc 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -210,7 +210,7 @@ def test_convert_to_hw_layers_synthetic(ch, ifmdim, idt): output_hw = oxe.execute_onnx(model, input_dict, True) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # check topology status diff --git a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py index d532cf345e..e155053b8b 100644 --- a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -186,7 +186,7 @@ def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mod inst.set_nodeattr("preferred_impl_style", "hls") y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] assert (y_produced == y_expected).all() - 
new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # Folding for n in new_model.graph.node: diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index b8242df933..f684931478 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -182,7 +182,7 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: @@ -226,7 +226,7 @@ def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 530d94e13b..484cbbe04a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -116,7 +116,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all(), "Execution of hw layer failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py 
b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index d5fa7c779f..2ad49ae58b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -139,7 +139,7 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m assert (y_produced == y_expected).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 34a48996c9..817d13e13d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -176,7 +176,7 @@ def test_fpgadataflow_checksum(): # rtlsim model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index b52b14fca3..25c738d049 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -98,7 +98,7 @@ def test_fpgadataflow_concat(exec_mode, idt): assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) assert model.graph.node[0].op_type == "StreamingConcat_hls" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" if exec_mode == "cppsim": @@ -141,11 +141,11 @@ def test_fpgadataflow_concat_stitchedip(): model = 
model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) assert model.graph.node[0].op_type == "StreamingConcat_hls" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 45ca74fbea..dc5dc0c02a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -189,7 +189,7 @@ def test_fpgadataflow_slidingwindow( # set impl_style inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) inst.set_nodeattr("preferred_impl_style", impl_style) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # set simd inst = getCustomOp(model.graph.node[0]) inst.set_nodeattr("SIMD", simd) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 6c0712b7b0..9c45b06f4a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -253,7 +253,7 @@ def test_fpgadataflow_conv_dynamic(cfg): model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) - 
model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) parent_model = model.transform(CreateDataflowPartition()) sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) model = ModelWrapper(sdp_inst.get_nodeattr("model")) @@ -281,7 +281,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(comp_node).set_nodeattr("PE", 4) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -523,11 +523,11 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( dw=dw, ) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # Simulate using stitched-ip-rtlsim so we can use existing infrastructure # that supports hook functions to re-program configuration before rtlsim model = model.transform(InsertFIFO(True)) # required for proper simulation - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index f1fc989066..16cf7481f2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -169,7 +169,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) 
model = model.transform(MinimizeAccumulatorWidth()) for n in model.graph.node: diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index 25717a4152..fb9d52eb51 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -131,7 +131,7 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): assert len(model.get_nodes_by_op_type("DownSampler")) == 1 y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 62b9265466..7ac9cbe3fb 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -128,7 +128,7 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, y = output_dict["outp%d" % i] assert (y == expected_y).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 7152d32a7b..1454433d87 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -113,7 +113,7 @@ def test_fpgadataflow_dwc(config, exec_mode): input values anymore.""" assert y.shape == tuple(shape), """The output shape is incorrect.""" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = 
model.transform(GiveUniqueNodeNames()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -158,9 +158,9 @@ def test_fpgadataflow_dwc_stitched_rtlsim(config): input_dict = prepare_inputs(x, finn_dtype) model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py index fbfcc8e28b..996477f28f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_eltwise.py +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -114,7 +114,7 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all(), exec_mode + " failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) assert len(model.graph.node) == 1 assert model.graph.node[0].op_type == "StreamingEltwise_hls" diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 1719da1454..f628a0e7af 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -96,7 +96,7 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): input_dict = prepare_inputs(x, finn_dtype) model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(SetExecMode("rtlsim")) model = 
model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 45cc265ac7..87e3267186 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -135,7 +135,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): assert y_produced.shape == expected_oshape assert (y_produced == y_expected).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index 9c2802aade..cca4bb7e8e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -112,7 +112,7 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style) assert (y == expected_y).all(), "HW layer verification failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 98ded66ca7..83ab2ddcaf 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -118,7 +118,7 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): assert soft_verify_topk(x, y, k), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py 
b/tests/fpgadataflow/test_fpgadataflow_lookup.py index cb15fa3ae5..d5aadc33d4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -131,7 +131,7 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): ret_hw = execute_onnx(model, {iname: itensor}) assert (exp_out == ret_hw[oname]).all() # call transformation to convert abstraction layer into HLS layer - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xczu3eg-sbva484-1-e")) assert model.graph.node[0].op_type == "Lookup_hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) @@ -174,7 +174,7 @@ def test_fpgadataflow_lookup_external(): assert (model.get_initializer(ename) == embeddings).all() model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) assert model.graph.node[0].op_type == "Lookup_hls" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 1bc2d9d59e..7ef4659205 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -40,6 +40,8 @@ ) from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +test_fpga_part = "xczu3eg-sbva484-1-e" + def check_two_dict_for_equality(dict1, dict2): for key in dict1: @@ -96,7 +98,7 @@ def test_res_estimate(): model.set_tensor_datatype("outp", odt) model.set_tensor_datatype("weights", wdt) - model.transform(SpecializeLayers()) + model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { diff --git 
a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 0df7181a60..c520fb50fc 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -146,7 +146,7 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xczu3eg-sbva484-1-e")) # Ensure PE value is set streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 6501dba33e..e4dd49fc7f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -227,7 +227,7 @@ def test_fpgadataflow_thresholding( node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", impl_style) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(InferShapes()) assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py index a9a2c79551..9948701157 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py @@ -148,7 +148,7 @@ def test_runtime_thresholds_read(impl_style, cfg): actval = odt.min() model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) # Make sure that 
specialize layer did not default to HLS implementation assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) @@ -169,7 +169,7 @@ def test_runtime_thresholds_read(impl_style, cfg): old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -252,7 +252,7 @@ def test_runtime_thresholds_write(impl_style, cfg): actval = odt.min() model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) # Validate that specialize layer did not default to HLS implementation assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) @@ -280,7 +280,7 @@ def test_runtime_thresholds_write(impl_style, cfg): # need to create stitched IP for runtime weight testing model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index b0da767eaa..4539917878 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -174,7 +174,7 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d test_result = output_dict[model.graph.output[0].name] output_matches = np.isclose(golden_result, test_result, atol=atol).all() - model = model.transform(SpecializeLayers()) + 
model = model.transform(SpecializeLayers("xc7z020clg400-1")) # Prep sim if exec_mode == "cppsim": diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 3e7822a077..4ca61578c3 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -70,7 +70,7 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) op_inst.set_nodeattr("mem_mode", "internal_decoupled") @@ -83,7 +83,7 @@ def test_runtime_weights_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) From 1dd118b146b3310daea3835c67cfa7c102631992 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 May 2024 13:53:25 +0100 Subject: [PATCH 14/30] [RTL MVAU] Bring back is_versal node attribute for resource estimations --- .../fpgadataflow/rtl/matrixvectoractivation_rtl.py | 14 ++++++++------ .../fpgadataflow/specialize_layers.py | 3 +++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index a6a8e72bdf..d307efe988 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -55,7 +55,10 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, 
**kwargs) def get_nodeattr_types(self): - my_attrs = {} + my_attrs = { + # Flag to indicate if Versal device is targeted + "is_versal": ("i", False, 0, {0, 1}), + } my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -138,11 +141,10 @@ def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") - # TODO: get dsp block type - # if dsp_block = "DSP58": - # mult_dsp = P * np.ceil(Q / 3) - # else: - mult_dsp = np.ceil(P / 4) * Q + if self.get_nodeattr("is_versal"): + mult_dsp = P * np.ceil(Q / 3) + else: + mult_dsp = np.ceil(P / 4) * Q return int(mult_dsp) def instantiate_ip(self, cmd): diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index dbcadd1df5..9a88d34787 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -316,6 +316,9 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) + if new_node.op_type == "MVAU_rtl": + is_versal_family = is_versal(self.fpgapart) + getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) From 0a2b4364e09a41995e2b2d18bbc165a3f3b152c1 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 May 2024 17:14:09 +0100 Subject: [PATCH 15/30] [Analysis] Pass fpgapart to resource estimation analysis pass --- notebooks/advanced/3_folding.ipynb | 7 ++++--- .../analysis/fpgadataflow/res_estimation.py | 18 +++++++++--------- src/finn/builder/build_dataflow_steps.py | 9 +++++++-- .../hls/matrixvectoractivation_hls.py | 2 +- .../hls/vectorvectoractivation_hls.py | 2 +- src/finn/custom_op/fpgadataflow/hwcustomop.py | 6 +++--- .../rtl/matrixvectoractivation_rtl.py | 10 ++++------ 
.../rtl/vectorvectoractivation_rtl.py | 7 ++++++- .../fpgadataflow/annotate_resources.py | 7 ++++--- .../fpgadataflow/specialize_layers.py | 3 --- tests/end2end/test_end2end_bnn_pynq.py | 2 +- .../test_fpgadataflow_res_estimate.py | 7 +++++-- 12 files changed, 45 insertions(+), 35 deletions(-) diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb index fc9f0080ec..e9527a2ef7 100644 --- a/notebooks/advanced/3_folding.ipynb +++ b/notebooks/advanced/3_folding.ipynb @@ -159,6 +159,7 @@ "metadata": {}, "outputs": [], "source": [ + "from functools import partial\n", "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n", "from finn.analysis.fpgadataflow.res_estimation import res_estimation" ] @@ -216,7 +217,7 @@ "metadata": {}, "outputs": [], "source": [ - "res_dict = model.analysis(res_estimation)\n", + "res_dict = model.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict" ] }, @@ -363,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "res_dict_updated = model.analysis(res_estimation)\n", + "res_dict_updated = model.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict_updated" ] }, @@ -596,7 +597,7 @@ "outputs": [], "source": [ "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n", - "res_dict_dwc = model_dwc.analysis(res_estimation)\n", + "res_dict_dwc = model_dwc.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict_dwc" ] }, diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index a6be1f1f53..fb12eed837 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -31,7 +31,7 @@ from finn.util.fpgadataflow import is_hls_node, is_rtl_node -def res_estimation(model): +def res_estimation(model, fpgapart): """Estimates the resources needed for the given model. 
Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames transformation) prior to calling this analysis pass to ensure all nodes are @@ -43,12 +43,12 @@ def res_estimation(model): for node in model.graph.node: if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) - res_dict[node.name] = inst.node_res_estimation() + res_dict[node.name] = inst.node_res_estimation(fpgapart) return res_dict -def res_estimation_complete(model): +def res_estimation_complete(model, fpgapart): """Estimates the resources needed for the given model and all values for resource-related switches. Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames @@ -66,21 +66,21 @@ def res_estimation_complete(model): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("resType", "lut") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("resType", orig_restype) elif op_type.startswith("ConvolutionInputGenerator"): orig_ramstyle = inst.get_nodeattr("ram_style") res_dict[node.name] = [] inst.set_nodeattr("ram_style", "block") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", "distributed") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", "ultra") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", orig_ramstyle) else: - res_dict[node.name] = [inst.node_res_estimation()] + res_dict[node.name] = [inst.node_res_estimation(fpgapart)] return res_dict diff --git 
a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 44d54f8aa2..ecc1d28c53 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -34,6 +34,7 @@ import warnings from copy import deepcopy from distutils.dir_util import copy_tree +from functools import partial from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount @@ -470,11 +471,15 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig estimate_layer_cycles = model.analysis(exp_cycles_per_layer) with open(report_dir + "/estimate_layer_cycles.json", "w") as f: json.dump(estimate_layer_cycles, f, indent=2) - estimate_layer_resources = model.analysis(res_estimation) + estimate_layer_resources = model.analysis( + partial(res_estimation, fpgapart=cfg._resolve_fpga_part()) + ) estimate_layer_resources["total"] = aggregate_dict_keys(estimate_layer_resources) with open(report_dir + "/estimate_layer_resources.json", "w") as f: json.dump(estimate_layer_resources, f, indent=2) - estimate_layer_resources_complete = model.analysis(res_estimation_complete) + estimate_layer_resources_complete = model.analysis( + partial(res_estimation_complete, fpgapart=cfg._resolve_fpga_part()) + ) with open(report_dir + "/estimate_layer_config_alternatives.json", "w") as f: json.dump(estimate_layer_resources_complete, f, indent=2) # need to call AnnotateCycles before dataflow_performance diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 94f8cc0845..cae1c30eb6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -118,7 +118,7 @@ def lut_estimation(self): c0 + c1 * (P * (mult_luts + addertree_luts + 
acc_luts + thr_luts + comp_luts)) + c2 ) - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index 3e10b640c5..f9ba68e6b6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -112,7 +112,7 @@ def lut_estimation(self): c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 ) - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index 57c0fec067..b40b8f3074 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -136,7 +136,7 @@ def get_rtlsim(self): sim = PyVerilator(rtlsim_so) return sim - def node_res_estimation(self): + def node_res_estimation(self, fpgapart): """Returns summarized resource estimation of BRAMs and LUTs of the node as a dictionary.""" ret = dict() @@ -145,7 +145,7 @@ def node_res_estimation(self): ret["LUT"] = self.lut_estimation() ret["URAM"] = self.uram_estimation() ret["URAM_efficiency"] = self.uram_efficiency_estimation() - ret["DSP"] = self.dsp_estimation() + ret["DSP"] = self.dsp_estimation(fpgapart) return ret def bram_efficiency_estimation(self): @@ -173,7 +173,7 @@ def lut_estimation(self): HWCustomOp class but has to be filled by every node""" return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): """Function for DSP resource estimation, is member function of HWCustomOp class but has to be filled by every node""" return 0 diff --git 
a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index d307efe988..93a3f0c3b0 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -55,10 +55,7 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # Flag to indicate if Versal device is targeted - "is_versal": ("i", False, 0, {0, 1}), - } + my_attrs = {} my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -137,11 +134,12 @@ def execute_node(self, context, graph): def lut_estimation(self): return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") - if self.get_nodeattr("is_versal"): + dsp_block = get_dsp_block(fpgapart) + if dsp_block == "DSP58": mult_dsp = P * np.ceil(Q / 3) else: mult_dsp = np.ceil(P / 4) * Q diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index 2d4240a7f3..41c3e90038 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -142,7 +142,7 @@ def execute_node(self, context, graph): def lut_estimation(self): return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") return int(P * np.ceil(Q / 3)) @@ -176,6 +176,11 @@ def generate_hdl(self, model, fpgapart, clk): self.generate_params(model, code_gen_dir) template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # determine if weights are narrow range and add parameter to code gen dict + weights = model.get_initializer(self.onnx_node.input[1]) + wdt = 
self.get_weight_datatype() + narrow_weights = 0 if np.min(weights) == wdt.min() else 1 + code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights) # add general parameters to dictionary code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index f07a5186d5..7b0219d8a8 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -26,8 +26,8 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import qonnx.custom_op.registry as registry +from functools import partial from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -49,15 +49,16 @@ class AnnotateResources(Transformation): chosen mode (e.g. HLSSynthIP for hls) was previously run. 
""" - def __init__(self, mode, override_res_dict=None): + def __init__(self, mode, fpgapart, override_res_dict=None): super().__init__() self.mode = mode + self.fpgapart = fpgapart self.res_dict = override_res_dict def apply(self, model): graph = model.graph if self.mode == "estimate": - res_fxn = res_estimation + res_fxn = partial(res_estimation, fpgapart=self.fpgapart) elif self.mode == "hls": res_fxn = hls_synth_res_estimation elif self.mode == "synth": diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 9a88d34787..dbcadd1df5 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -316,9 +316,6 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) - if new_node.op_type == "MVAU_rtl": - is_versal_family = is_versal(self.fpgapart) - getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 7fb0f5ff1d..387bf16c95 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -807,7 +807,7 @@ def test_build(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(build_data["build_fxn"]) - model = model.transform(AnnotateResources("synth")) + model = model.transform(AnnotateResources("synth", build_data["part"])) model.save(get_checkpoint_name(topology, wbits, abits, "build_" + board)) @pytest.mark.slow diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 7ef4659205..d81936f7e5 
100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -28,6 +28,7 @@ import pytest +from functools import partial from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -100,7 +101,7 @@ def test_res_estimate(): model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) - prod_resource_estimation = model.analysis(res_estimation) + prod_resource_estimation = model.analysis(partial(res_estimation, fpgapart=test_fpga_part)) expect_resource_estimation = { "MVAU_hls_0": { "BRAM_18K": 0, @@ -117,7 +118,9 @@ def test_res_estimate(): ), """The produced output of the res_estimation analysis pass is not equal to the expected one""" - prod_resource_estimation = model.analysis(res_estimation_complete) + prod_resource_estimation = model.analysis( + partial(res_estimation_complete, fpgapart=test_fpga_part) + ) expect_resource_estimation = { "MVAU_hls_0": [ { From 97f59d532b361be710001bca1a3a74cd706e7c32 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 May 2024 09:35:49 +0100 Subject: [PATCH 16/30] [End2end] Fix bnn end2end test --- src/finn/transformation/fpgadataflow/annotate_resources.py | 4 +++- tests/end2end/test_end2end_bnn_pynq.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 7b0219d8a8..ee2da2094c 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -78,7 +78,9 @@ def apply(self, model): # recurse into model to manually annotate per-layer resources sdp_model_filename = getCustomOp(node).get_nodeattr("model") sdp_model = ModelWrapper(sdp_model_filename) - sdp_model = sdp_model.transform(AnnotateResources(self.mode, self.res_dict)) + sdp_model = 
sdp_model.transform( + AnnotateResources(self.mode, self.fpgapart, self.res_dict) + ) sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode) sdp_dict = eval(sdp_dict) # save transformed model diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 387bf16c95..6fd7cb5e66 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -706,7 +706,7 @@ def test_ipgen(self, topology, wbits, abits, board): build_data = get_build_env(board, target_clk_ns) if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(build_data["part"], target_clk_ns)) From 7c3b03abbe36c75ef012b3f4f0e2ef17f6d9d8f6 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 May 2024 17:17:41 +0100 Subject: [PATCH 17/30] [RTL MVAU-extw] Update ip stitching for external weights --- .../rtl/matrixvectoractivation_rtl.py | 24 ++++++++++----- .../rtl/vectorvectoractivation_rtl.py | 25 +++++++++++----- .../specialize_layers_config.json | 30 +++++++++++++++++++ ...{tfc-w1a1-extw.json => tfc-w2a2-extw.json} | 17 ++++++++--- tests/end2end/test_ext_weights.py | 9 ++++-- 5 files changed, 84 insertions(+), 21 deletions(-) create mode 100644 src/finn/qnn-data/test_ext_weights/specialize_layers_config.json rename src/finn/qnn-data/test_ext_weights/{tfc-w1a1-extw.json => tfc-w2a2-extw.json} (66%) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 93a3f0c3b0..3e81aa93e0 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -159,14 +159,24 @@ def instantiate_ip(self, cmd): ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled": + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + cmd.append( + "create_bd_cell -type hier -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) ) - ) def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index 41c3e90038..32943d86cf 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -161,14 +161,25 @@ def instantiate_ip(self, cmd): ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled": + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + cmd.append( + "create_bd_cell -type hier -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) ) - ) def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation diff --git 
a/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json b/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json new file mode 100644 index 0000000000..3218c2d89a --- /dev/null +++ b/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json @@ -0,0 +1,30 @@ +{ + "Defaults": {}, + "Thresholding_0": { + "preferred_impl_style": "rtl" + }, + "MVAU_0": { + "preferred_impl_style": "rtl" + }, + "Thresholding_1": { + "preferred_impl_style": "rtl" + }, + "MVAU_1": { + "preferred_impl_style": "hls" + }, + "Thresholding_2": { + "preferred_impl_style": "rtl" + }, + "MVAU_2": { + "preferred_impl_style": "rtl" + }, + "Thresholding_3": { + "preferred_impl_style": "rtl" + }, + "MVAU_3": { + "preferred_impl_style": "rtl" + }, + "LabelSelect_0": { + "preferred_impl_style": "hls" + } +} diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json similarity index 66% rename from src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json rename to src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json index 9fe22443dc..29484e2940 100644 --- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json +++ b/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json @@ -3,22 +3,31 @@ "Thresholding_rtl_0": { "PE": 49 }, - "MVAU_hls_0": { + "MVAU_rtl_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MVAU_hls_1": { + "Thresholding_rtl_1": { + "PE": 16 + }, + "MVAU_hls_0": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MVAU_hls_2": { + "Thresholding_rtl_2": { + "PE": 8 + }, + "MVAU_rtl_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MVAU_hls_3": { + "Thresholding_rtl_3": { + "PE": 8 + }, + "MVAU_rtl_2": { "PE": 10, "SIMD": 8, "ram_style": "distributed" diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py index bac343bedf..29d2f58e66 100644 --- a/tests/end2end/test_ext_weights.py +++ b/tests/end2end/test_ext_weights.py @@ -60,7 +60,7 @@ def get_checkpoint_name(step): # 
checkpoint for build step is an entire dir return build_dir + "/end2end_ext_weights_build" elif step == "download": - return onnx_dir_local + "/tfc-w1a1.onnx" + return onnx_dir_local + "/tfc-w2a2.onnx" else: # other checkpoints are onnx files return build_dir + "/end2end_ext_weights_%s.onnx" % (step) @@ -82,14 +82,17 @@ def test_end2end_ext_weights_build(): model_file = get_checkpoint_name("download") load_test_checkpoint_or_skip(model_file) test_data = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/test_ext_weights" - folding_config_file = test_data + "/tfc-w1a1-extw.json" + folding_config_file = test_data + "/tfc-w2a2-extw.json" + specialize_layers_config_file = test_data + "/specialize_layers_config.json" output_dir = make_build_dir("test_end2end_ext_weights_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, verbose=True, + standalone_thresholds=True, folding_config_file=folding_config_file, + specialize_layers_config_file=specialize_layers_config_file, synth_clk_period_ns=target_clk_ns, - board="Pynq-Z1", + board="ZCU104", shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, From 120838934dd6ef7b2b89b4eefa242848e2422746 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 31 May 2024 13:31:16 +0100 Subject: [PATCH 18/30] [Docker] Add string to download xrt --- docker/Dockerfile.finn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 2ceb1f4195..438c534943 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -78,7 +78,7 @@ RUN cd verilator && \ make install # install XRT -RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb +RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb 
RUN apt install -y /tmp/$XRT_DEB_VERSION.deb RUN rm /tmp/$XRT_DEB_VERSION.deb From ca913ab389bf2b18bef22f5d36918ff771ae4e85 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 31 May 2024 13:57:05 +0100 Subject: [PATCH 19/30] [GHA] Add quotes to xrt link --- docker/Dockerfile.finn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 438c534943..d2b64da5a1 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -78,7 +78,7 @@ RUN cd verilator && \ make install # install XRT -RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb +RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb RUN apt install -y /tmp/$XRT_DEB_VERSION.deb RUN rm /tmp/$XRT_DEB_VERSION.deb From 8d6543c944f5505285053e8555ab2ff4ca644faa Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 31 May 2024 15:38:39 +0100 Subject: [PATCH 20/30] [GHA] Add debug flag for GHA --- docker/Dockerfile.finn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index d2b64da5a1..38bdb7ce58 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -78,7 +78,7 @@ RUN cd verilator && \ make install # install XRT -RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb +RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O 
/tmp/$XRT_DEB_VERSION.deb --debug RUN apt install -y /tmp/$XRT_DEB_VERSION.deb RUN rm /tmp/$XRT_DEB_VERSION.deb From 356528f32d82e5aafb0f1996b64aadc09a40eb7e Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 31 May 2024 16:39:10 +0100 Subject: [PATCH 21/30] [Deps] Introduce env var to skip xrt download --- .github/workflows/quicktest-dev-pr.yml | 1 + docker/Dockerfile.finn | 8 +++++--- docker/finn_entrypoint.sh | 2 +- run-docker.sh | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index e2ba47ec29..91104653f6 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -22,4 +22,5 @@ jobs: export FINN_ROOT=$(pwd) export FINN_BUILD_DIR=/tmp/finn_gha export FINN_INST_NAME=finn_gha + export FINN_SKIP_XRT_DOWNLOAD=1 ./run-docker.sh quicktest diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 38bdb7ce58..29ec00414b 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -31,6 +31,7 @@ FROM ubuntu:jammy-20230126 LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu " ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" +ARG SKIP_XRT WORKDIR /workspace @@ -78,9 +79,10 @@ RUN cd verilator && \ make install # install XRT -RUN wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb --debug -RUN apt install -y /tmp/$XRT_DEB_VERSION.deb -RUN rm /tmp/$XRT_DEB_VERSION.deb +RUN if [ -z "$SKIP_XRT" ];then \ + wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb --debug && \ + apt install -y /tmp/$XRT_DEB_VERSION.deb && \ + rm /tmp/$XRT_DEB_VERSION.deb; fi # versioned Python package 
requirements for FINN compiler # these are given in requirements.txt diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 61c8f78665..c7500bcaa6 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -86,7 +86,7 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then source $XILINX_XRT/setup.sh gecho "Found XRT at $XILINX_XRT" else - recho "XRT not found on $XILINX_XRT, did the installation fail?" + recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?" exit -1 fi else diff --git a/run-docker.sh b/run-docker.sh index e732492728..57f420143d 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -100,6 +100,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${NVIDIA_VISIBLE_DEVICES=""} : ${DOCKER_BUILDKIT="1"} : ${FINN_SINGULARITY=""} +: ${FINN_SKIP_XRT_DOWNLOAD=""} DOCKER_INTERACTIVE="" @@ -186,7 +187,7 @@ if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . 
cd $OLD_PWD fi # Launch container with current directory mounted From 79c46bbac92cf62e8415c909725c60443e861d35 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 5 Jun 2024 13:29:58 +0100 Subject: [PATCH 22/30] [RTL Thresh] Enable narrow and per tensor mode with runtime writeable params --- .../fpgadataflow/rtl/thresholding_rtl.py | 29 ++++-- .../test_fpgadataflow_thresholding_runtime.py | 98 ++++++++++--------- 2 files changed, 72 insertions(+), 55 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 9584c3ae5f..9ab1fb9112 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -167,13 +167,17 @@ def prepare_codegen_rtl_values(self, model): their key value(s) in the RTL template files""" code_gen_dict = {} - # TODO check for sortedness and size here? thresholds = model.get_initializer(self.onnx_node.input[1]) bias = self.get_nodeattr("ActVal") # activation bias value output_data_type = self.get_nodeattr("outputDataType") # output precision input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision o_bitwidth = DataType[output_data_type].bitwidth() + t_path = self.get_nodeattr("code_gen_dir_ipgen") + if self.get_nodeattr("runtime_writeable_weights") == 1: + thresh_file_name = f"{t_path}/memblock.dat" + self.make_weight_file(thresholds, "decoupled", thresh_file_name) + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in # one less threshold, prepending a dummy threshold (minimal possible value determined by # input data type) and decrease the bias by 1. 
@@ -197,7 +201,6 @@ def prepare_codegen_rtl_values(self, model): prefix="", ) - t_path = self.get_nodeattr("code_gen_dir_ipgen") pe = self.get_nodeattr("PE") num_channels = self.get_nodeattr("NumChannels") # number of channels @@ -227,10 +230,6 @@ def prepare_codegen_rtl_values(self, model): f.write(val + "\n") code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] - if self.get_nodeattr("runtime_writeable_weights") == 1: - thresh_file_name = f"{t_path}/memblock.dat" - self.make_weight_file(thresholds, "decoupled", thresh_file_name) - # Identify the module name code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ self.get_verilog_top_module_name() + "_axi_wrapper" @@ -521,7 +520,23 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): ch = self.get_nodeattr("NumChannels") output_data_type = self.get_nodeattr("outputDataType") # output precision o_bitwidth = DataType[output_data_type].bitwidth() - n_thres_steps = 2**o_bitwidth - 1 + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in + # one less threshold, prepending a dummy threshold (minimal possible value determined by + # input data type) and decrease the bias by 1. 
+ # Additionally, increase number of threshold steps to reflect new shape + expected_thresholds = 2**o_bitwidth - 1 + n_thres_steps = self.get_nodeattr("numSteps") + wdt = self.get_weight_datatype() + if expected_thresholds != n_thres_steps: + min_val = wdt.min() + thresholds = np.insert(thresholds, 0, min_val, axis=1) + n_thres_steps += 1 + expected_shape = (ch, expected_thresholds) + + # If a single threshold value is found, broadcast the value + if thresholds.shape != expected_shape: + thresholds = np.broadcast_to(thresholds, expected_shape) + width_padded = roundup_to_integer_multiple(thresholds.shape[1], 2**o_bitwidth) thresh_padded = np.zeros((thresholds.shape[0], width_padded)) thresh_padded[: thresholds.shape[0], :n_thres_steps] = thresholds diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py index 9948701157..1ad695bb94 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py @@ -51,10 +51,17 @@ target_clk_ns = 5 -def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + return np.random.randint( - input_data_type.min(), - input_data_type.max() + 1, + data_type.min(), + data_type.max() + 1, (num_input_channels, num_steps), ).astype(np.float32) @@ -75,11 +82,9 @@ def layout_NCHW2FINN(data): return np.transpose(data, (0, 2, 3, 1)) -def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs): - NumChannels = T.shape[0] - - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) +def 
make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, num_ch): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [num_ch]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, n_inp_vecs + [num_ch]) node_inp_list = ["inp", "thresh"] @@ -89,7 +94,7 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp ["outp"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=NumChannels, + NumChannels=num_ch, numSteps=T.shape[1], inputDataType=idt.name, weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth @@ -118,10 +123,12 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("narrow", [True, False]) +@pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_read(impl_style, cfg): +def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): """Read back threshold weights during runtime 1. 
Create random initial weights T @@ -137,17 +144,17 @@ def test_runtime_thresholds_read(impl_style, cfg): idt = DataType["INT16"] odt = act n_steps = act.get_num_possible_values() - 1 - np.random.seed(2) - T = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T = np.sort(T, axis=1) + # Generate random thresholds and sort in ascending order + T = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + + # provide non-decreasing/ascending thresholds + T = sort_thresholds_increasing(T) - if odt == DataType["BIPOLAR"]: - actval = 0 - else: - actval = odt.min() + actval = act.min() + if narrow: + actval += 1 - model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) + model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, ch) model = model.transform(SpecializeLayers(test_fpga_part)) # Make sure that specialize layer did not default to HLS implementation @@ -204,23 +211,21 @@ def read_weights(sim): # convert back to NHWC for comparison to hw outputs expected = np.transpose(expected, (0, 2, 3, 1))[1] - if act == DataType["BIPOLAR"]: - # binary to bipolarW - expected = 2 * expected - 1 - else: - # signed offset - expected += act.min() + # signed offset + expected += actval # Validate the output is as expected assert (y == expected).all() -@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("narrow", [True, False]) +@pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style, cfg): +def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): """Write threshold weights during runtime 1. 
Create random initial weights T_init @@ -241,17 +246,19 @@ def test_runtime_thresholds_write(impl_style, cfg): odt = act n_steps = act.get_num_possible_values() - 1 - np.random.seed(2) - T_init = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T_init = np.sort(T_init, axis=1) + # Generate random thresholds and sort in ascending order + T_init = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + + # provide non-decreasing/ascending thresholds + T_init = sort_thresholds_increasing(T_init) - if odt == DataType["BIPOLAR"]: - actval = 0 - else: - actval = odt.min() + actval = act.min() + if narrow: + actval += 1 - model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs) + model = make_single_thresholding_modelwrapper( + impl_style, T_init, idt, odt, actval, n_inp_vecs, ch + ) model = model.transform(SpecializeLayers(test_fpga_part)) # Validate that specialize layer did not default to HLS implementation @@ -264,10 +271,9 @@ def test_runtime_thresholds_write(impl_style, cfg): op_inst.set_nodeattr("runtime_writeable_weights", 1) # Make new weights for runtime write - np.random.seed(4) - T_write = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T_write = np.sort(T_write, axis=1) + T_write = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + # provide non-decreasing/ascending thresholds + T_write = sort_thresholds_increasing(T_write) dat_fname = f"T_write_{cfg}.dat" # distinguish fname per paramter for distributed testing op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname) @@ -321,12 +327,8 @@ def read_weights(sim): # convert back to NHWC for comparison to hw outputs expected = np.transpose(expected, (0, 2, 3, 1))[1] - if act == DataType["BIPOLAR"]: - # binary to bipolarW - expected = 2 * expected - 1 - else: - # signed offset - expected += 
act.min() + # signed off-set + expected += actval # Validate the output is as expected assert (y == expected).all() From 304337bb69428b526efe2fbdacf412db169dfd91 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 7 Jun 2024 11:16:05 +0100 Subject: [PATCH 23/30] [Tests] Change target board for subset of mvau tests --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 4eb0b22d46..1ec77f4eec 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -312,7 +312,7 @@ def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): inst.set_nodeattr("mem_mode", mem_mode) # Note: only HLS-based MVAU layers execute CPPsim inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -423,10 +423,10 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
- model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -531,12 +531,12 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -611,12 +611,12 @@ def test_mvau_fifocharacterize_rtlsim( inst.set_nodeattr("preferred_impl_style", preferred_impl_style) total_fold = nf * sf exp_total_cycles = total_fold + 10 - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) 
model = model.transform(DeriveCharacteristic(exp_total_cycles)) From 1c46131bbd47edb70e0d1c156c123a52d5d5da11 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 7 Jun 2024 11:25:41 +0100 Subject: [PATCH 24/30] [Docker] Enable optional xrt installation from local deb --- docker/Dockerfile.finn | 7 ++++++- run-docker.sh | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 29ec00414b..0cfe0f4339 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -32,6 +32,7 @@ LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuro ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" ARG SKIP_XRT +ARG LOCAL_XRT WORKDIR /workspace @@ -79,8 +80,12 @@ RUN cd verilator && \ make install # install XRT +RUN if [ -z "$LOCAL_XRT" ];then \ + wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi + +COPY $XRT_DEB_VERSION.deb /tmp/$XRT_DEB_VERSION.deb + RUN if [ -z "$SKIP_XRT" ];then \ - wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.amd.com/bin/public/amdOpenDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb --debug && \ apt install -y /tmp/$XRT_DEB_VERSION.deb && \ rm /tmp/$XRT_DEB_VERSION.deb; fi diff --git a/run-docker.sh b/run-docker.sh index 57f420143d..88fabff2fa 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -101,6 +101,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${DOCKER_BUILDKIT="1"} : ${FINN_SINGULARITY=""} : ${FINN_SKIP_XRT_DOWNLOAD=""} +: ${FINN_XRT_PATH=""} DOCKER_INTERACTIVE="" @@ -182,14 +183,27 @@ if [ "$FINN_SKIP_DEP_REPOS" = "0" ]; then ./fetch-repos.sh fi +# If xrt path given, copy .deb file to this repo +# Be aware that we assume a certain name of the xrt deb version +if [ -d "$FINN_XRT_PATH" ];then + cp 
$FINN_XRT_PATH/$XRT_DEB_VERSION.deb . + export LOCAL_XRT=1 +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . cd $OLD_PWD fi + +# Remove local xrt.deb file from repo +if [ ! -z "$LOCAL_XRT" ];then + rm $XRT_DEB_VERSION.deb +fi + # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins From 3f428a0ea0ba32975c5393d45930cd3ff6f1ea79 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 7 Jun 2024 11:43:56 +0100 Subject: [PATCH 25/30] [GHA] Add path for skipping xrt download --- docker/Dockerfile.finn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 0cfe0f4339..9a7aa52e44 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -80,7 +80,7 @@ RUN cd verilator && \ make install # install XRT -RUN if [ -z "$LOCAL_XRT" ];then \ +RUN if [ -z "$LOCAL_XRT" ] && [ -z "$SKIP_XRT" ];then \ wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi COPY $XRT_DEB_VERSION.deb /tmp/$XRT_DEB_VERSION.deb From 048557f77244672eb0cec5386d624c035c99cd2a Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 7 Jun 2024 11:59:39 +0100 Subject: 
[PATCH 26/30] [Docker] Workaround to allow for optional COPY command --- docker/Dockerfile.finn | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9a7aa52e44..823d1232d5 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -83,7 +83,7 @@ RUN cd verilator && \ RUN if [ -z "$LOCAL_XRT" ] && [ -z "$SKIP_XRT" ];then \ wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi -COPY $XRT_DEB_VERSION.deb /tmp/$XRT_DEB_VERSION.deb +COPY requirements.txt $XRT_DEB_VERSION.* /tmp/ RUN if [ -z "$SKIP_XRT" ];then \ apt install -y /tmp/$XRT_DEB_VERSION.deb && \ @@ -91,9 +91,8 @@ RUN if [ -z "$SKIP_XRT" ];then \ # versioned Python package requirements for FINN compiler # these are given in requirements.txt -COPY requirements.txt . -RUN pip install -r requirements.txt -RUN rm requirements.txt +RUN pip install -r /tmp/requirements.txt +RUN rm /tmp/requirements.txt # install PyTorch RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 From abb6daf4c72d938c9bf918ebb263dad491b84ed0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 7 Jun 2024 17:58:48 +0100 Subject: [PATCH 27/30] [Tests] Add res type and depth triggers --- tests/end2end/test_end2end_bnn_pynq.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 6fd7cb5e66..d697a192d4 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -131,6 +131,7 @@ def fold_tfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", 
"lut") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) @@ -155,6 +156,7 @@ def fold_lfc(model): fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) @@ -181,12 +183,14 @@ def fold_cnv_large(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("ram_style", "distributed") return model @@ -194,8 +198,8 @@ def fold_cnv_small(model): fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ - (8, 3, "distributed"), - (16, 16, "distributed"), + (8, 3, "auto"), + (16, 16, "auto"), (8, 16, "auto"), (8, 16, "block"), (4, 8, "auto"), @@ -210,12 +214,18 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("ram_style", "distributed") + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] + inp_qnt = getCustomOp(inp_qnt_node) + inp_qnt.set_nodeattr("depth_trigger_uram", 32000) + 
inp_qnt.set_nodeattr("depth_trigger_bram", 32000) return model @@ -719,8 +729,8 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] - if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1": - # Enabling swg_exception for this single test case. Disabling the exception results in + if topology == "cnv" and abits == 2 and board == "Pynq-Z1": + # Enabling swg_exception for these test cases. Disabling the exception results in # a design that exceeds the resources of the Pynq-Z1 board. In future this should be # revisited and handled correctly as the swg_exception is poorly justified. model = model.transform( From b9894c0793780288f916e796754f9217ae9f95d4 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 11 Jun 2024 17:34:35 +0100 Subject: [PATCH 28/30] [Deps] Move setuptools installation to Dockerfile --- docker/Dockerfile.finn | 3 +++ requirements.txt | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 823d1232d5..5126ed3ff4 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -132,6 +132,9 @@ RUN pip install tokenize-rt==4.2.1 # pyverilator RUN pip install tclwrapper==0.0.1 +# assure that we have the right setuptools version +RUN pip install setuptools==68.2.2 + # extra environment variables for FINN compiler ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache" diff --git a/requirements.txt b/requirements.txt index c2973f9432..d4ca45cb37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ psutil==5.9.4 pyscaffold==4.4 scipy==1.10.1 setupext-janitor>=1.1.2 -setuptools==68.2.2 sigtools==4.0.1 toposort==1.7.0 vcdvcd==1.0.5 From aacdaeef7b835bb4aef2617706d5b19c294bd721 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 11 Jun 2024 18:02:46 +0100 Subject: 
[PATCH 29/30] [Tests] Fix bnn pynq to use default hw variants --- tests/end2end/test_end2end_bnn_pynq.py | 32 ++++++-------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index d697a192d4..81c6316ec1 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -95,7 +95,6 @@ MoveScalarLinearPastInvariants, ) from finn.util.basic import get_finn_root, make_build_dir, test_board_map -from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -185,7 +184,7 @@ def fold_cnv_large(model): fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") fcl_inst.set_nodeattr("resType", "lut") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -198,14 +197,14 @@ def fold_cnv_small(model): fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ - (8, 3, "auto"), - (16, 16, "auto"), + (8, 3, "distributed"), + (16, 16, "distributed"), (8, 16, "auto"), - (8, 16, "block"), + (8, 16, "distributed"), (4, 8, "auto"), (1, 8, "auto"), - (1, 2, "distributed"), - (2, 2, "block"), + (1, 2, "block"), + (2, 2, "auto"), (5, 1, "distributed"), ] for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding): @@ -216,7 +215,7 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") fcl_inst.set_nodeattr("resType", "lut") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -609,13 +608,6 @@ def test_specialize_layers(self, 
topology, wbits, abits, board): build_data = get_build_env(board, target_clk_ns) prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) - # set preferred impl style to hls for all layers - force_hls_boards = ["Pynq-Z1", "U250"] - if topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: - for node in model.graph.node: - if is_fpgadataflow_node(node): - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers(build_data["part"])) model = model.transform(GiveUniqueNodeNames()) model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) @@ -646,19 +638,9 @@ def test_specialize_layers(self, topology, wbits, abits, board): ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), ], - "cnv-2-2": [ - ("Transpose", 1), - ("Thresholding_hls", 1), - ("ConvolutionInputGenerator_hls", 6), - ("MVAU_hls", 9), - ("StreamingMaxPool_hls", 2), - ("LabelSelect_hls", 1), - ], } if topology == "tfc" and wbits == 1 and abits == 1: exp_key = "tfc-1-1" - elif topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: - exp_key = "cnv-2-2" else: exp_key = topology exp_layer_counts = exp_layer_counts[exp_key] From 0ef0ca4867bd0e82aba4a863f14c01109af93488 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 13 Jun 2024 09:59:45 +0100 Subject: [PATCH 30/30] [Deps] Update finn-experimental commit hash --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 2b1613abe4..2033973f2a 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -28,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" -FINN_EXP_COMMIT="7a587b2ccc8fbd4daaec946f3bc66c273f85451b" +FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"