From 34343e9ed9c996361c6fa2b1477992d800b40f43 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 13 May 2024 00:10:18 +0100 Subject: [PATCH 01/11] [mvu rtl]: minor change to width of signal --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 0ac2628ee5..2956700ea2 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -452,7 +452,7 @@ module mvu_4sx4u #( uwire [$clog2(SIMD)+7:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + localparam int unsigned HI_WIDTH = (ACCU_WIDTH - LO_WIDTH) < ($clog2(1+SIMD) + 1) ? $clog2(1+SIMD) : (ACCU_WIDTH - LO_WIDTH); // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index fbf48784f0..08f978e6b5 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -449,11 +449,11 @@ module mvu_8sx8u_dsp48 #( uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; - uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH:0] lo4; // Conclusive high part accumulation if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + localparam int unsigned HI_WIDTH = (ACCU_WIDTH - SINGLE_PROD_WIDTH) < ($clog2(1+SIMD)+1) ? $clog2(1+SIMD)+1 : ACCU_WIDTH - SINGLE_PROD_WIDTH; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; From 4122ee77a95aac0c4f439e9093b562cbf1bd1464 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 13 May 2024 16:53:15 +0100 Subject: [PATCH 02/11] test case --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 4 ++-- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 17 +++++++++-------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 2956700ea2..0ac2628ee5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -452,7 +452,7 @@ module mvu_4sx4u #( uwire [$clog2(SIMD)+7:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = (ACCU_WIDTH - LO_WIDTH) < ($clog2(1+SIMD) + 1) ? $clog2(1+SIMD) : (ACCU_WIDTH - LO_WIDTH); + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 08f978e6b5..c76d2680d8 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -453,7 +453,7 @@ module mvu_8sx8u_dsp48 #( // Conclusive high part accumulation if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = (ACCU_WIDTH - SINGLE_PROD_WIDTH) < ($clog2(1+SIMD)+1) ? $clog2(1+SIMD)+1 : ACCU_WIDTH - SINGLE_PROD_WIDTH; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - SINGLE_PROD_WIDTH; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; @@ -510,7 +510,7 @@ module mvu_8sx8u_dsp48 #( always_ff @(posedge clk) begin if(rst) Res5 <= '{ default: 0 }; else if(en) begin - Res5[1] <= up4 - hi4; + Res5[1] <= up4 - hi4; // -809 - 1 (_01) = -810. -809 - -3 (101) = -806 Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); end end diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 4ed7b4bf5f..59714c8e59 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -36,19 +36,19 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config localparam bit IS_MVU = 1; - localparam string COMPUTE_CORE = "mvu_4sx4u"; - localparam int unsigned MW = 120; - localparam int unsigned MH = 40; - localparam int unsigned SIMD = 20; - localparam int unsigned PE = 10; + localparam string COMPUTE_CORE = "mvu_8sx8u_dsp48"; + localparam int unsigned MW = 6; + localparam int unsigned MH = 32; + localparam int unsigned SIMD = 6; + localparam int unsigned PE = 16; localparam int unsigned SEGMENTLEN = 2.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned ACTIVATION_WIDTH = 8; localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 0; + localparam int unsigned ACCU_WIDTH = 14; //ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; @@ -132,6 +132,7 @@ module mvu_axi_tb(); for (int i=0; i Date: Tue, 14 May 2024 14:35:50 +0100 Subject: [PATCH 03/11] Get 8-bit DSP MVU ready for optimized accumulators. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index c76d2680d8..e48757496b 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -91,7 +91,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); localparam int unsigned PE_REM = 2*(c+1) - PE_END; - uwire [57:0] p3[SIMD]; + uwire [47:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD @@ -447,13 +447,13 @@ module mvu_8sx8u_dsp48 #( // Count leaves reachable from each node localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; - uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH:0] lo4; + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -D[1] :0] hi4; // secure true sign bit for optimized accumulators + uwire [$clog2(SIMD)+D[1]-1:0] lo4; // Conclusive high part accumulation if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = ACCU_WIDTH - SINGLE_PROD_WIDTH; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; @@ -464,10 +464,10 @@ module mvu_8sx8u_dsp48 #( end // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; + logic signed [HI_WIDTH:0] Hi4 = 0; // secure true sign bit for optimized accumulators always_ff @(posedge clk) begin if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); + else if(en) Hi4 <= $signed(L[4]? 0 : Hi4) + $signed(tree[0]); end assign hi4 = Hi4; end : genHi @@ -479,14 +479,14 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation if(i >= PE_REM) begin : blkLo - // Adder Tree across all SIMD low contributions + // Adder Tree across all SIMD low contributions (all unsigned arithmetic) localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end @@ -510,7 +510,7 @@ module mvu_8sx8u_dsp48 #( always_ff @(posedge clk) begin if(rst) Res5 <= '{ default: 0 }; else if(en) begin - Res5[1] <= up4 - hi4; // -809 - 1 (_01) = -810. -809 - -3 (101) = -806 + Res5[1] <= up4 - hi4; Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); end end From ca439ff43ba27686535836df88ab44144339e572 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 14 May 2024 19:32:25 +0100 Subject: [PATCH 04/11] updated weights file --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 59714c8e59..ea2f087721 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -109,9 +109,25 @@ module mvu_axi_tb(); typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; typedef weight_t weight_matrix_t[NF][SF]; - function weight_matrix_t init_WEIGHTS; + // function weight_matrix_t init_WEIGHTS; + // automatic weight_matrix_t res; + // std::randomize(res); + // return res; + // endfunction : init_WEIGHTS; + // weight_matrix_t WEIGHTS = init_WEIGHTS(); + + function weight_matrix_t init_WEIGHTS(); automatic weight_matrix_t res; - std::randomize(res); + logic [383:0] WEIGHT_MATRIX [2] = {384'h6e507f99bdcd011437f919f9f74f77ad9716aefe9661717f717f021797c77900976277550a09199c00744b797da29d49, 384'h75e37a070f09a290903159f9bb999cf9d91c7691951727009190909276ea097b491ae70d71707f1ced99794c3e0717e7}; + for (int i=0; i Date: Tue, 14 May 2024 21:53:28 +0100 Subject: [PATCH 05/11] fix to lane offsets --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index e48757496b..5c4d04dfd3 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -82,7 +82,7 @@ module mvu_8sx8u_dsp48 #( // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH-1, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes From 282af2cbdcfb737f368ffb203fc39e3f8a42e21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 15 May 2024 08:05:06 +0100 Subject: [PATCH 06/11] Formal derivation of HI_WIDTH computation. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 5c4d04dfd3..78cd64be10 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -82,7 +82,7 @@ module mvu_8sx8u_dsp48 #( // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH-1, 0 }; // Lane offsets + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -447,13 +447,30 @@ module mvu_8sx8u_dsp48 #( // Count leaves reachable from each node localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + // Range of Cross-lane Contribution Tracked in Hi4 + /* + * - Assumption: ACCU_WIDTH bounds right lane value at any point in time. + * - The value x beyond the lane boundary is hence bounded by: + * -2^(w-1) <= x <= 2^(w-1)-1 with w = ACCU_WIDTH - D[1] + * - This value decomposes into the tracked overflow h and the overflow l + * from the low SIMD lane reduction with: + * 0 <= l <= SIMD + * - From x = l + h follows: + * h = x - l + * -2^(w-1) - SIMD <= h <= 2^(w-1)-1 + * - This required bit width of the two's complement representation of this + * signed value is determined by its lower bound to be at least: + * 1 + $clog2(2^(w-1)+SIMD) + */ + localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD); + uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -D[1] :0] hi4; // secure true sign bit for optimized accumulators + uwire signed [HI_WIDTH -1:0] hi4; uwire [$clog2(SIMD)+D[1]-1:0] lo4; // Conclusive high part accumulation if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; @@ -464,7 +481,7 @@ module mvu_8sx8u_dsp48 #( end // High Sideband Accumulation - logic signed [HI_WIDTH:0] Hi4 = 0; // secure true sign bit for optimized accumulators + logic signed [HI_WIDTH-1:0] Hi4 = 0; always_ff @(posedge clk) begin if(rst) Hi4 <= 0; else if(en) Hi4 <= $signed(L[4]? 0 : Hi4) + $signed(tree[0]); From dc9855855d31e4fd32b5d4bedf8371553cbf7416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 15 May 2024 08:27:37 +0100 Subject: [PATCH 07/11] Catch and report cross-lane accumulation overflow in simulation. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 78cd64be10..414c4b0be0 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -483,8 +483,15 @@ module mvu_8sx8u_dsp48 #( // High Sideband Accumulation logic signed [HI_WIDTH-1:0] Hi4 = 0; always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + if(rst) Hi4 <= 0; + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin + $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $stop; + end + Hi4 <= h; + end end assign hi4 = Hi4; end : genHi From 074f15dffd7410dd667ee8cbc88cab726aaa0967 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 15 May 2024 12:18:53 +0100 Subject: [PATCH 08/11] fix to width hi4 --- finn-rtllib/mvu/mvu_4sx4u.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 0ac2628ee5..2adb37bb35 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -448,11 +448,11 @@ module mvu_4sx4u #( localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire signed [$clog2(2**(ACCU_WIDTH-8)+SIMD) :0] hi4[3]; uwire [$clog2(SIMD)+7:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-LO_WIDTH-1)+SIMD); // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi From 057911f6ec03d90526374e937226218dc3636ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 15 May 2024 14:02:14 +0100 Subject: [PATCH 09/11] Redimension reduction arithmetic. --- finn-rtllib/mvu/mvu_4sx4u.sv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 2adb37bb35..c527431ec4 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -447,9 +447,9 @@ module mvu_4sx4u #( // Count leaves reachable from each node localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [$clog2(2**(ACCU_WIDTH-8)+SIMD) :0] hi4[3]; - uwire [$clog2(SIMD)+7:0] lo4[3]; + uwire signed [ACCU_WIDTH-1:0] up4; + uwire signed [$clog2(2**(ACCU_WIDTH-7)+SIMD):0] hi4[3]; // min LO_WIDTH=7 + uwire [$clog2(SIMD)+7 :0] lo4[3]; // max LO_WIDTH=8 for(genvar i = 0; i < 4; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-LO_WIDTH-1)+SIMD); @@ -477,7 +477,7 @@ module mvu_4sx4u #( assign hi4[i] = '0; end : genHiZero - // Conclusive low part accumulation + // Conclusive low part accumulation (all unsigned arithmetic) if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); @@ -486,7 +486,7 @@ module mvu_4sx4u #( for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end From d48ced883887a2c20b06fc78ba1c89f67646c717 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 15 May 2024 16:56:41 +0100 Subject: [PATCH 10/11] bitwidth adjustment hi4 and extra overflow check --- finn-rtllib/mvu/mvu_4sx4u.sv | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index c527431ec4..703bde665e 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -448,7 +448,7 @@ module mvu_4sx4u #( localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [$clog2(2**(ACCU_WIDTH-7)+SIMD):0] hi4[3]; // min LO_WIDTH=7 + uwire signed [$clog2(2**(ACCU_WIDTH-8)+SIMD):0] hi4[3]; // min LO_WIDTH=7 uwire [$clog2(SIMD)+7 :0] lo4[3]; // max LO_WIDTH=8 for(genvar i = 0; i < 4; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; @@ -469,7 +469,14 @@ module mvu_4sx4u #( logic signed [HI_WIDTH-1:0] Hi4 = 0; always_ff @(posedge clk) begin if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin + $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $stop; + end + Hi4 <= h; + end end assign hi4[i] = Hi4; end : genHi From 963a38d4a5a783ef0f9d6aa419ea90fce991a193 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 15 May 2024 16:56:59 +0100 Subject: [PATCH 11/11] restored testbench to more general setting --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index ea2f087721..fff69739bc 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -36,18 +36,18 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config localparam bit IS_MVU = 1; - localparam string COMPUTE_CORE = "mvu_8sx8u_dsp48"; - localparam int unsigned MW = 6; + localparam string COMPUTE_CORE = "mvu_4sx4u"; + localparam int unsigned MW = 96; localparam int unsigned MH = 32; - localparam int unsigned SIMD = 6; + localparam int unsigned SIMD = 48; localparam int unsigned PE = 16; localparam int unsigned SEGMENTLEN = 2.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned ACTIVATION_WIDTH = 4; localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = 14; //ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants localparam int unsigned NF = MH/PE; @@ -109,25 +109,9 @@ module mvu_axi_tb(); typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; typedef weight_t weight_matrix_t[NF][SF]; - // function weight_matrix_t init_WEIGHTS; - // automatic weight_matrix_t res; - // std::randomize(res); - // return res; - // endfunction : init_WEIGHTS; - // weight_matrix_t WEIGHTS = init_WEIGHTS(); - - function weight_matrix_t init_WEIGHTS(); + function weight_matrix_t init_WEIGHTS; automatic weight_matrix_t res; - logic [383:0] WEIGHT_MATRIX [2] = {384'h6e507f99bdcd011437f919f9f74f77ad9716aefe9661717f717f021797c77900976277550a09199c00744b797da29d49, 384'h75e37a070f09a290903159f9bb999cf9d91c7691951727009190909276ea097b491ae70d71707f1ced99794c3e0717e7}; - for (int i=0; i