Skip to content

Commit

Permalink
Supporting vstart CSR for operand read, VALU, VLSU
Browse files Browse the repository at this point in the history
* vstart support for vector unit-stride loads and stores

* vstart support for vector strided loads and stores

* vstart support for valu operations, mask operations not tested

* Preliminary work on vstart support for vector indexed loads and stores

* Minor fixes

* Refactoring

* Explanatory comments
  • Loading branch information
MaistoV committed Oct 18, 2023
1 parent 11ceb51 commit dd8e5a1
Show file tree
Hide file tree
Showing 9 changed files with 635 additions and 452 deletions.
15 changes: 12 additions & 3 deletions hardware/include/ara_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -974,11 +974,20 @@ package ara_pkg;
} opqueue_e;

// Each lane has eight VRF banks
// NOTE: values != 8 are not supported
localparam int unsigned NrVRFBanksPerLane = 8;

// Find the starting address of a vector register vid
// Find the starting address (in bytes) of a vector register chunk of vid
function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes);
// NOTE(review): this assignment is overwritten by the assignment to vaddr further down,
// which uses NrVRFBanksPerLane instead of the literal 8 - confirm whether it is still needed.
vaddr = vid * (VLENB / NrLanes / 8);
// Each vector register spans multiple words in each bank in each lane.
// The start address is the same in every lane.
// Therefore, within each lane, each vector register chunk starts at a given offset,
// obtained by scaling the register index vid by the per-bank share of VLENB.
vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane);
// NOTE: the only extensively tested configuration of Ara keeps:
// - (VLEN / NrLanes) constant at 1024;
// - NrVRFBanksPerLane always equal to 8.
// In that configuration, each vector register spans 2 words in each bank of each lane,
// and therefore vaddr = vid * 16.
endfunction: vaddr

// Differentiate between SLDU and ADDRGEN operands from opqueue
Expand Down Expand Up @@ -1016,7 +1025,7 @@ package ara_pkg;

typedef struct packed {
rvv_pkg::vew_e eew; // Effective element width
vlen_t vl; // Vector length
vlen_t elem_count; // Vector body length
opqueue_conversion_e conv; // Type conversion
logic [1:0] ntr_red; // Neutral type for reductions
logic is_reduct; // Is this a reduction?
Expand Down
2 changes: 1 addition & 1 deletion hardware/src/ara.sv
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ module ara import ara_pkg::*; #(
.pe_resp_o (pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ),
.addrgen_ack_o (addrgen_ack ),
.addrgen_exception_o (addrgen_exception ),
.addrgen_exception_vstart_o (addrgen_exception_vstart ),
.addrgen_exception_vstart_o (addrgen_exception_vstart ),
// Interface with the Mask unit
.mask_i (mask ),
.mask_valid_i (mask_valid ),
Expand Down
170 changes: 91 additions & 79 deletions hardware/src/lane/lane_sequencer.sv

Large diffs are not rendered by default.

44 changes: 22 additions & 22 deletions hardware/src/lane/operand_queue.sv
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
///////////////////////

// Count how many operands were already produced
vlen_t vl_d, vl_q;
vlen_t elem_count_d, elem_count_q;

elen_t conv_operand;
// Decide whether we are taking the operands from the lower or from the upper half of the input
Expand Down Expand Up @@ -226,23 +226,23 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
end

// Assert the signal if the last 64-bit packet will contain also
// elements with idx >= vl (they should not contribute to the result!).
// elements with idx >= elem_count (they should not contribute to the result!).
// Gate for power saving
// Power optimization:
// The optimal solution would be to act on the mask bits in the two
// processing units (valu and vmfpu), masking the unused elements.
unique case (cmd.eew)
EW8 : begin
incomplete_packet = |cmd.vl[2:0];
last_packet = ((cmd.vl - vl_q) <= 8) ? 1'b1 : 1'b0;
incomplete_packet = |cmd.elem_count[2:0];
last_packet = ((cmd.elem_count - elem_count_q) <= 8) ? 1'b1 : 1'b0;
end
EW16: begin
incomplete_packet = |cmd.vl[1:0];
last_packet = ((cmd.vl - vl_q) <= 4) ? 1'b1 : 1'b0;
incomplete_packet = |cmd.elem_count[1:0];
last_packet = ((cmd.elem_count - elem_count_q) <= 4) ? 1'b1 : 1'b0;
end
EW32: begin
incomplete_packet = |cmd.vl[0:0];
last_packet = ((cmd.vl - vl_q) <= 2) ? 1'b1 : 1'b0;
incomplete_packet = |cmd.elem_count[0:0];
last_packet = ((cmd.elem_count - elem_count_q) <= 2) ? 1'b1 : 1'b0;
end
default: begin
incomplete_packet = 1'b0;
Expand Down Expand Up @@ -373,15 +373,15 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
if (SupportNtrVal) unique case (cmd.eew)
EW8 : for (int unsigned b = 0; b < 8; b++) begin
automatic int unsigned bs = shuffle_index(b, 1, EW8);
if ((b >> 0) >= cmd.vl[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
if ((b >> 0) >= cmd.elem_count[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
end
EW16: for (int unsigned b = 0; b < 8; b++) begin
automatic int unsigned bs = shuffle_index(b, 1, EW16);
if ((b >> 1) >= cmd.vl[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
if ((b >> 1) >= cmd.elem_count[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
end
EW32: for (int unsigned b = 0; b < 8; b++) begin
automatic int unsigned bs = shuffle_index(b, 1, EW32);
if ((b >> 2) >= cmd.vl[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
if ((b >> 2) >= cmd.elem_count[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
end
default:;
endcase
Expand All @@ -401,7 +401,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i

// Maintain state
select_d = select_q;
vl_d = vl_q;
elem_count_d = elem_count_q;

// Send the operand
operand_o = conv_operand;
Expand All @@ -418,16 +418,16 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
OpQueueConversionZExt2,
OpQueueConversionWideFP2,
OpQueueAdjustFPCvt:
if (SupportIntExt2) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 2;
if (SupportIntExt2) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 2;
OpQueueConversionSExt4,
OpQueueConversionZExt4:
if (SupportIntExt4) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 4;
if (SupportIntExt4) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 4;
OpQueueConversionSExt8,
OpQueueConversionZExt8:
if (SupportIntExt8) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 8;
if (SupportIntExt8) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 8;
OpQueueReductionZExt:
vl_d = vl_q + 1;
default: vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew)));
elem_count_d = elem_count_q + 1;
default: elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew)));
endcase

// Update the pointer to the input operand
Expand All @@ -443,22 +443,22 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
if ((select_q != '0 && select_d == '0) || cmd.conv == OpQueueConversionNone) ibuf_pop = 1'b1;

// Finished execution
if (vl_d >= cmd.vl) begin
if (elem_count_d >= cmd.elem_count) begin : finished_elems
ibuf_pop = 1'b1;
cmd_pop = 1'b1;
select_d = '0;
vl_d = '0;
end
elem_count_d = '0;
end : finished_elems
end
end : obuf_control

// State registers of the type-conversion logic: the *_q values are updated from
// their *_d next-state counterparts on every rising edge of clk_i.
always_ff @(posedge clk_i or negedge rst_ni) begin: p_type_conversion_ff
if (!rst_ni) begin
// Asynchronous, active-low reset: clear the selector and the counters
select_q <= '0;
vl_q <= '0;
elem_count_q <= '0;
end else begin
// Normal operation: register the next-state values
select_q <= select_d;
vl_q <= vl_d;
elem_count_q <= elem_count_d;
end
end : p_type_conversion_ff

Expand Down
Loading

0 comments on commit dd8e5a1

Please sign in to comment.