From f08c28f443dd38d39051129463756a68d0a0c613 Mon Sep 17 00:00:00 2001 From: Vincenzo Maisto Date: Fri, 13 Oct 2023 13:28:28 +0200 Subject: [PATCH] Supporting vstart CSR for operand read, VALU, VLSU * vstart support for vector unit-stride loads and stores * vstart support for vector strided loads and stores * vstart support for valu operations, mask operations not tested * Preliminary work on vstart support for vector indexed loads and stores * Minor fixes * Refactoring * Explanatory comments --- hardware/include/ara_pkg.sv | 15 +- hardware/src/ara.sv | 4 +- hardware/src/ara_dispatcher.sv | 19 +- hardware/src/lane/lane_sequencer.sv | 170 +++++++------- hardware/src/lane/operand_queue.sv | 44 ++-- hardware/src/lane/operand_requester.sv | 306 +++++++++++++------------ hardware/src/lane/valu.sv | 78 ++++--- hardware/src/vlsu/addrgen.sv | 49 +++- hardware/src/vlsu/vldu.sv | 150 +++++++----- hardware/src/vlsu/vstu.sv | 249 ++++++++++++-------- 10 files changed, 637 insertions(+), 447 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index df0cdc7b5..b8ffa78c8 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -974,11 +974,20 @@ package ara_pkg; } opqueue_e; // Each lane has eight VRF banks + // NOTE: values != 8 are not supported localparam int unsigned NrVRFBanksPerLane = 8; - // Find the starting address of a vector register vid + // Find the starting address (in bytes) of a vector register chunk of vid function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes); - vaddr = vid * (VLENB / NrLanes / 8); + // Each vector register spans multiple words in each bank in each lane + // The start address is the same in every lane + // Therefore, within each lane, each vector register chunk starts on a given offset + vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane); + // NOTE: the only extensively tested configuration of Ara keeps: + // - (VLEN / NrLanes) constant to 1024; + // - NrVRFBanksPerLane always 
equal to 8. + // Given so, each vector register will span 2 words across all the banks and lanes, + // therefore, vaddr = vid * 16 endfunction: vaddr // Differenciate between SLDU and ADDRGEN operands from opqueue @@ -1016,7 +1025,7 @@ package ara_pkg; typedef struct packed { rvv_pkg::vew_e eew; // Effective element width - vlen_t vl; // Vector length + vlen_t elem_count; // Vector body length opqueue_conversion_e conv; // Type conversion logic [1:0] ntr_red; // Neutral type for reductions logic is_reduct; // Is this a reduction? diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 3288219d4..2bb6c6d08 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -43,7 +43,7 @@ module ara import ara_pkg::*; #( // Interface with CVA6's sv39 MMU // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless - output exception_t mmu_misaligned_ex_o, + output ariane_pkg::exception_t mmu_misaligned_ex_o, output logic mmu_req_o, // request address translation output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out output logic mmu_is_store_o, // the translation is requested by a store @@ -54,7 +54,7 @@ module ara import ara_pkg::*; #( // Cycle 1 input logic mmu_valid_i, // translation is valid input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address - input exception_t mmu_exception_i, // address translation threw an exception + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception // Interface with Ariane input accelerator_req_t acc_req_i, diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 3c270b885..667a100f5 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -2662,15 +2662,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin : ara_resp_valid 
acc_resp_o.req_ready = 1'b1; acc_resp_o.exception = ara_resp_i.exception; acc_resp_o.resp_valid = 1'b1; ara_req_valid_d = 1'b0; // In case of error, modify vstart - if (ara_resp_i.exception.valid) + if ( ara_resp_i.exception.valid ) begin : exception csr_vstart_d = ara_resp_i.exception_vl; - end + end : exception + end : ara_resp_valid end : OpcodeLoadFp ///////////////////// @@ -2859,15 +2860,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if (ara_resp_valid_i) begin : ara_resp_valid acc_resp_o.req_ready = 1'b1; acc_resp_o.exception = ara_resp_i.exception; acc_resp_o.resp_valid = 1'b1; ara_req_valid_d = 1'b0; // If there is an error, change vstart - if (ara_resp_i.exception.valid) + if ( ara_resp_i.exception.valid ) begin : exception csr_vstart_d = ara_resp_i.exception_vl; - end + end : exception + end : ara_resp_valid end : OpcodeStoreFp //////////////////////////// @@ -2879,6 +2881,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Therefore, Ara must be idle before performing any CSR operation. // Stall if there is any pending vector instruction + // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending. + // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise. + // E.g., CSR vlenb is a design-constant parameter, reading is always safe. + // E.g., CSRs vxrm and vxsat have no influence on non-fixed-point instructions; they could be read and written safely when no fixed-point operation is running. + // By better analyzing the spec, more optimizations of this kind can be made. For the sake of simplicity, the current implementation treats CSR ops as one block.
if ( ara_idle_i ) begin : ara_idle // These always respond at the same cycle acc_resp_o.resp_valid = 1'b1; diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..ba82f8922 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -113,7 +113,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // bits that indicate whether there is a hazard between different vector // instructions. Such hazards must be continuously cleared based on the // value of the currently running loops from the main sequencer. - operand_request_cmd_t [NrOperandQueues-1:0] operand_request_i; + operand_request_cmd_t [NrOperandQueues-1:0] operand_request; logic [NrOperandQueues-1:0] operand_request_push; operand_request_cmd_t [NrOperandQueues-1:0] operand_request_d; @@ -133,7 +133,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Got a new request if (operand_request_push[queue]) begin - operand_request_d[queue] = operand_request_i[queue]; + operand_request_d[queue] = operand_request[queue]; operand_request_valid_d[queue] = 1'b1; end end @@ -189,7 +189,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_resp_o.vinsn_done = vinsn_done_q; // Make no requests to the operand requester - operand_request_i = '0; + operand_request = '0; operand_request_push = '0; // Make no requests to the lane's VFUs @@ -197,7 +197,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vfu_operation_valid_d = 1'b0; // If the operand requesters are busy, abort the request and wait for another cycle. 
- if (pe_req_valid) begin + if (pe_req_valid) begin : stall_op_req_busy unique case (pe_req.vfu) VFU_Alu : begin pe_req_ready = !(operand_request_valid_o[AluA] || @@ -230,11 +230,11 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req_ready = !(operand_request_valid_o[MaskB]); end default:; - endcase + endcase // stall_op_req_busy end // We received a new vector instruction - if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin + if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin : pe_req_valid // Populate the VFU request vfu_operation_d = '{ id : pe_req.id, @@ -263,9 +263,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; // Vector start calculation - vfu_operation_d.vstart = pe_req.vstart / NrLanes; - // If lane_id_i < vstart % NrLanes, this lane needs to execute one micro-operation less. - if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) vfu_operation_d.vstart -= 1; + // TODO: check for LMUL = 4, 8 + // TODO: check for SEW != 64 + vfu_operation_d.vstart = pe_req.vstart / NrLanes; // High bits + // If lane_id_i < (vstart % NrLanes), this lane needs to execute one micro-operation less. + if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) begin : adjust_vstart_lane + vfu_operation_d.vstart += 1; + end : adjust_vstart_lane // Mark the vector instruction as running vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 
1'b1 : 1'b0; @@ -287,7 +291,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -306,7 +310,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[AluA] = pe_req.use_vs1; - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -328,24 +332,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[AluB] = pe_req.use_vs2; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -365,7 +369,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[MulFPUA] = pe_req.use_vs1; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, @@ -388,7 +392,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; - operand_request_i[MulFPUC] = '{ + operand_request[MulFPUC] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, @@ -411,42 +415,42 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req.use_vs2 : pe_req.use_vd_op; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -461,13 +465,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) + operand_request[SlideAddrGenA].vl += 1; operand_request_push[SlideAddrGenA] = pe_req_i.op == VLXE; end VFU_StoreUnit : begin - operand_request_i[StA] = '{ + // vstart is supported here + operand_request[StA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -481,28 +486,34 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - if (operand_request_i[StA].vl * NrLanes != pe_req.vl) operand_request_i[StA].vl += 1; + // vl is not an integer multiple of NrLanes + // I.e., ( ( pe_req.vl / NrLanes * NrLanes ) != pe_req.vl ) <=> ( ( pe_req.vl % NrLanes ) != 0 ) + if ( ( operand_request[StA].vl * NrLanes ) != pe_req.vl ) begin : tweak_vl_StA + operand_request[StA].vl += 1; + end : tweak_vl_StA operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks -
operand_request_i[MaskM] = '{ + // TODO: add vstart support here + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed - operand_request_i[SlideAddrGenA] = '{ + // TODO: add vstart support here + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -517,13 +528,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin : tweak_vl_SlideAddrGenA + operand_request[SlideAddrGenA].vl += 1; + end : tweak_vl_SlideAddrGenA operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end VFU_SlideUnit: begin - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -543,7 +555,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. 
// Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[SlideAddrGenA].vl = + operand_request[SlideAddrGenA].vl = (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin @@ -554,7 +566,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. - operand_request_i[SlideAddrGenA].vstart = pe_req.stride / NrLanes; + operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes; // The stride move the initial address in boundaries of 8*NrLanes Byte. // If the stride is not multiple of a full VRF word (8*NrLanes Byte), @@ -576,15 +588,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vl_tot += extra_stride; // Ask the elements, and ask one more if we do not perfectly divide NrLanes - operand_request_i[SlideAddrGenA].vl = vl_tot / NrLanes; - if (operand_request_i[SlideAddrGenA].vl * NrLanes != vl_tot) - operand_request_i[SlideAddrGenA].vl += 1; + operand_request[SlideAddrGenA].vl = vl_tot / NrLanes; + if (operand_request[SlideAddrGenA].vl * NrLanes != vl_tot) + operand_request[SlideAddrGenA].vl += 1; end default:; endcase // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -601,32 +613,32 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[MaskM].vl = + operand_request[MaskM].vl = ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> int'(pe_req.vtype.vsew); + >> unsigned'(pe_req.vtype.vsew); - if (((operand_request_i[MaskM].vl + pe_req.stride) << - int'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) - operand_request_i[MaskM].vl += 1; + if (((operand_request[MaskM].vl + pe_req.stride) << + unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request_i[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> int'( + operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( pe_req.vtype.vsew)); - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * NrLanes * 8 != pe_req.vl) - operand_request_i[MaskM].vl += 1; + operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -640,21 +652,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluA].vl = vfu_operation_d.vl; + operand_request[AluA].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. 
else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[AluA].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs1)); - if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request_i[AluA].vl += 1; + operand_request[AluA].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); + if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != + pe_req.vl) operand_request[AluA].vl += 1; end operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -667,21 +679,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluB].vl = vfu_operation_d.vl; + operand_request[AluB].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[AluB].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs2)); - if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request_i[AluB].vl += 1; + operand_request[AluB].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); + if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != + pe_req.vl) operand_request[AluB].vl += 1; end operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -694,10 +706,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request_i[MulFPUA].vl = vfu_operation_d.vl; + operand_request[MulFPUA].vl = vfu_operation_d.vl; operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -709,10 +721,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. 
- operand_request_i[MulFPUB].vl = vfu_operation_d.vl; + operand_request[MulFPUB].vl = vfu_operation_d.vl; operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vd, eew : pe_req.eew_vd_op, @@ -720,16 +732,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)), + vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vd, default : '0 }; if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request_i[MaskB].vl += 1; + pe_req.vl) operand_request[MaskB].vl += 1; operand_request_push[MaskB] = pe_req.use_vd_op; - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -741,13 +753,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request_i[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin - operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + operand_request[MaskM].vl += 1; end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -763,8 +775,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MaskB] = 1'b1; end default:; - endcase - end + endcase // pe_req.vfu + end : pe_req_valid end: sequencer always_ff @(posedge clk_i or negedge rst_ni) begin: p_sequencer_ff diff --git 
a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index 72c8202e1..9b8c1464c 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -127,7 +127,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i /////////////////////// // Count how many operands were already produced - vlen_t vl_d, vl_q; + vlen_t elem_count_d, elem_count_q; elen_t conv_operand; // Decide whether we are taking the operands from the lower or from the upper half of the input @@ -226,23 +226,23 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end // Assert the signal if the last 64-bit packet will contain also - // elements with idx >= vl (they should not contribute to the result!). + // elements with idx >= elem_count (they should not contribute to the result!). // Gate for power saving // Power optimization: // The optimal solution would be to act on the mask bits in the two // processing units (valu and vmfpu), masking the unused elements. unique case (cmd.eew) EW8 : begin - incomplete_packet = |cmd.vl[2:0]; - last_packet = ((cmd.vl - vl_q) <= 8) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[2:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 8) ? 1'b1 : 1'b0; end EW16: begin - incomplete_packet = |cmd.vl[1:0]; - last_packet = ((cmd.vl - vl_q) <= 4) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[1:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 4) ? 1'b1 : 1'b0; end EW32: begin - incomplete_packet = |cmd.vl[0:0]; - last_packet = ((cmd.vl - vl_q) <= 2) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[0:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 2) ? 
1'b1 : 1'b0; end default: begin incomplete_packet = 1'b0; @@ -373,15 +373,15 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if (SupportNtrVal) unique case (cmd.eew) EW8 : for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW8); - if ((b >> 0) >= cmd.vl[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 0) >= cmd.elem_count[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW16: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW16); - if ((b >> 1) >= cmd.vl[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 1) >= cmd.elem_count[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW32: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW32); - if ((b >> 2) >= cmd.vl[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 2) >= cmd.elem_count[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end default:; endcase @@ -401,7 +401,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Maintain state select_d = select_q; - vl_d = vl_q; + elem_count_d = elem_count_q; // Send the operand operand_o = conv_operand; @@ -418,16 +418,16 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i OpQueueConversionZExt2, OpQueueConversionWideFP2, OpQueueAdjustFPCvt: - if (SupportIntExt2) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 2; + if (SupportIntExt2) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 2; OpQueueConversionSExt4, OpQueueConversionZExt4: - if (SupportIntExt4) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 4; + if (SupportIntExt4) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 4; OpQueueConversionSExt8, OpQueueConversionZExt8: - if (SupportIntExt8) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 8; + if (SupportIntExt8) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - 
unsigned'(cmd.eew))) / 8; OpQueueReductionZExt: - vl_d = vl_q + 1; - default: vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))); + elem_count_d = elem_count_q + 1; + default: elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))); endcase // Update the pointer to the input operand @@ -443,22 +443,22 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if ((select_q != '0 && select_d == '0) || cmd.conv == OpQueueConversionNone) ibuf_pop = 1'b1; // Finished execution - if (vl_d >= cmd.vl) begin + if (elem_count_d >= cmd.elem_count) begin : finished_elems ibuf_pop = 1'b1; cmd_pop = 1'b1; select_d = '0; - vl_d = '0; - end + elem_count_d = '0; + end : finished_elems end end : obuf_control always_ff @(posedge clk_i or negedge rst_ni) begin: p_type_conversion_ff if (!rst_ni) begin select_q <= '0; - vl_q <= '0; + elem_count_q <= '0; end else begin select_q <= select_d; - vl_q <= vl_d; + elem_count_q <= elem_count_d; end end : p_type_conversion_ff diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..cbad2b2d5 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -199,7 +199,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Operand request // /////////////////////// - // There is an operand requester for each operand queue. Each one + // There is an operand requester_index for each operand queue. Each one // can be in one of the following two states. 
typedef enum logic { IDLE, @@ -223,216 +223,230 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic [NrBanks-1:0][NrMasters-1:0] operand_gnt; payload_t [NrMasters-1:0] operand_payload; - for (genvar requester = 0; requester < NrOperandQueues; requester++) begin: gen_operand_requester - // State of this operand requester + // Metadata required to request all elements of this vector operand + typedef struct packed { + // ID of the instruction for this requester + vid_t id; + // Address of the next element to be read + vaddr_t addr; + // How many elements remain to be read + vlen_t len; + // Element width + vew_e vew; + + // Hazards between vector instructions + logic [NrVInsn-1:0] hazard; + + // Widening instructions produce two writes for every read + // In case of a WAW with a previous instruction, + // read once every two writes of the previous instruction + logic is_widening; + // One-bit counters + logic [NrVInsn-1:0] waw_hazard_counter; + } requester_metadata_t; + + for (genvar requester_index = 0; requester_index < NrOperandQueues; requester_index++) begin : gen_operand_requester + // State of this operand requester state_t state_d, state_q; - // Metadata required to request all elements of this vector operand - struct packed { - // ID of the instruction for this requester - vid_t id; - // Address of the next element to be read - vaddr_t addr; - // How many elements remain to be read - vlen_t len; - // Element width - vew_e vew; - - // Hazards between vector instructions - logic [NrVInsn-1:0] hazard; - - // Widening instructions produces two writes of every read - // In case of a WAW with a previous instruction, - // read once every two writes of the previous instruction - logic is_widening; - // One-bit counters - logic [NrVInsn-1:0] waw_hazard_counter; - } requester_d, requester_q; - + requester_metadata_t requester_metadata_d, requester_metadata_q; // Is there a hazard during this cycle?
logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_metadata_q.hazard & ~(vinsn_result_written_q & + (~{NrVInsn{requester_metadata_q.is_widening}} | requester_metadata_q.waw_hazard_counter))); // Did we get a grant? logic [NrBanks-1:0] operand_requester_gnt; for (genvar bank = 0; bank < NrBanks; bank++) begin: gen_operand_requester_gnt - assign operand_requester_gnt[bank] = operand_gnt[bank][requester]; + assign operand_requester_gnt[bank] = operand_gnt[bank][requester_index]; end // Did we issue a word to this operand queue? - assign operand_issued_o[requester] = |(operand_requester_gnt); + assign operand_issued_o[requester_index] = |(operand_requester_gnt); always_comb begin: operand_requester + // Helper local variables + automatic operand_queue_cmd_t operand_queue_cmd_tmp; + automatic requester_metadata_t requester_metadata_tmp; + automatic vlen_t vector_body_length; + automatic vlen_t scaled_vector_body_length; + automatic vlen_t effective_vector_body_length; + automatic vaddr_t vrf_addr; + // Maintain state state_d = state_q; - requester_d = requester_q; + requester_metadata_d = requester_metadata_q; // Make no requests to the VRF - operand_payload[requester] = '0; - for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; + operand_payload[requester_index] = '0; + for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester_index] = 1'b0; - // Do not acknowledge any operand requester commands - operand_request_ready_o[requester] = 1'b0; + // Do not acknowledge any operand requester_index commands + operand_request_ready_o[requester_index] = 1'b0; // Do not send any operand conversion commands - operand_queue_cmd_o[requester] = '0; - operand_queue_cmd_valid_o[requester] = 1'b0; + operand_queue_cmd_o[requester_index] = '0; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + + // Prepare 
metadata upfront + // Length of vector body in elements, i.e., vl - vstart + vector_body_length = operand_request_i[requester_index].vl - operand_request_i[requester_index].vstart; + // For memory operations, the number of elements initially refers to the new EEW (vsew here), + // but the requester_index must refer to the old EEW (eew here) + // This reasoning cannot be applied also to widening instructions, which modify vsew + // treating it as the EEW of vd + scaled_vector_body_length = ( + vector_body_length + << operand_request_i[requester_index].vtype.vsew + ) >> operand_request_i[requester_index].eew; + // Final computed length + effective_vector_body_length = ( operand_request_i[requester_index].scale_vl ) + ? scaled_vector_body_length + : vector_body_length; + // Address of the vstart element of the vector in the VRF + vrf_addr = vaddr(operand_request_i[requester_index].vs, NrLanes) + + ( + operand_request_i[requester_index].vstart + >> (unsigned'(EW64) - unsigned'(operand_request_i[requester_index].eew)) + ); + // Init helper variables + requester_metadata_tmp = { + id : operand_request_i[requester_index].id, + addr : vrf_addr, + len : effective_vector_body_length, + vew : operand_request_i[requester_index].eew, + hazard : operand_request_i[requester_index].hazard, + is_widening : operand_request_i[requester_index].cvt_resize == CVT_WIDE, + default: '0 + }; + operand_queue_cmd_tmp = '{ + eew : operand_request_i[requester_index].eew, + elem_count: effective_vector_body_length, + conv : operand_request_i[requester_index].conv, + ntr_red : operand_request_i[requester_index].cvt_resize, + target_fu : operand_request_i[requester_index].target_fu, + is_reduct : operand_request_i[requester_index].is_reduct + }; case (state_q) - IDLE: begin + IDLE: begin : state_q_IDLE // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - 
operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - vl : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; - operand_queue_cmd_valid_o[requester] = 1'b1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + // Mute the requisition if the vl is zero - if (operand_request_i[requester].vl == '0) begin + if (operand_request_i[requester_index].vl == '0) begin : zero_vl state_d = IDLE; - operand_queue_cmd_valid_o[requester] = 1'b0; - end - end - end + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : state_q_IDLE - REQUESTING: begin + REQUESTING: begin : state_q_REQUESTING // Update waw counters - for (int b = 0; b < NrVInsn; b++) - if (vinsn_result_written_d[b]) - requester_d.waw_hazard_counter[b] = ~requester_q.waw_hazard_counter[b]; + for (int b = 0; b < NrVInsn; b++) begin : waw_counters_update + if ( vinsn_result_written_d[b] ) begin : result_valid + requester_metadata_d.waw_hazard_counter[b] = ~requester_metadata_q.waw_hazard_counter[b]; + end : result_valid + end : waw_counters_update - if (operand_queue_ready_i[requester]) begin + if (operand_queue_ready_i[requester_index]) begin : op_queue_ready // Bank we are currently requesting - automatic int bank = requester_q.addr[idx_width(NrBanks)-1:0]; + automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; + automatic vlen_t num_bytes; // Operand request - operand_req[bank][requester] = !stall; - operand_payload[requester] = '{ - addr : requester_q.addr >> $clog2(NrBanks), - opqueue: opqueue_e'(requester), - default: '0 + 
operand_req[bank][requester_index] = !stall; + operand_payload[requester_index] = '{ + addr : requester_metadata_q.addr >> $clog2(NrBanks), + opqueue: opqueue_e'(requester_index), + default: '0 // this is a read operation }; // Received a grant. - if (|operand_requester_gnt) begin + if (|operand_requester_gnt) begin : op_req_grant // Bump the address pointer - requester_d.addr = requester_q.addr + 1'b1; + requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew)))) - requester_d.len = 0; - else requester_d.len = requester_q.len - (1 << (int'(EW64) - int'(requester_q.vew))); - end + num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_bytes) begin + requester_metadata_d.len = 0; + end + else begin + requester_metadata_d.len = requester_metadata_q.len - num_bytes; + end + end : op_req_grant // Finished requesting all the elements - if (requester_d.len == '0) begin + if (requester_metadata_d.len == '0) begin : req_finished state_d = IDLE; // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - vl : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; - operand_queue_cmd_valid_o[requester] = 1'b1; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; - end - end - end - end - endcase + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + + // Mute the requisition if the vl is zero + if (operand_request_i[requester_index].vl == '0) begin : zero_vl + state_d = IDLE; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : req_finished + end : op_queue_ready + end : state_q_REQUESTING + endcase // state_q // Always keep the hazard bits up to date with the global hazard table - requester_d.hazard &= global_hazard_table_i[requester_d.id]; + requester_metadata_d.hazard &= global_hazard_table_i[requester_metadata_d.id]; end : operand_requester always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin state_q <= IDLE; - requester_q <= '0; + requester_metadata_q <= '0; end else begin state_q <= state_d; - requester_q <= requester_d; + requester_metadata_q <= requester_metadata_d; end end end : gen_operand_requester @@ -452,7 +466,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( operand_req[bank][NrOperandQueues + VFU_LoadUnit] = 1'b0; end - // Generate the payload + // Generate the payloads for write back operations operand_payload[NrOperandQueues + VFU_Alu] = '{ addr : alu_result_addr_i >> $clog2(NrBanks), wen : 1'b1, @@ -523,7 +537,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_hp_req; logic payload_hp_gnt; rr_arb_tree #( - .NumIn (int'(MulFPUC) - int'(AluA) + 1 + int'(VFU_MFpu) - int'(VFU_Alu) + 1), + .NumIn 
(unsigned'(MulFPUC) - unsigned'(AluA) + 1 + unsigned'(VFU_MFpu) - unsigned'(VFU_Alu) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_hp_vrf_arbiter ( @@ -548,7 +562,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_lp_req; logic payload_lp_gnt; rr_arb_tree #( - .NumIn(int'(SlideAddrGenA)- int'(MaskB) + 1 + int'(VFU_LoadUnit) - int'(VFU_SlideUnit) + 1), + .NumIn(unsigned'(SlideAddrGenA)- unsigned'(MaskB) + 1 + unsigned'(VFU_LoadUnit) - unsigned'(VFU_SlideUnit) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_lp_vrf_arbiter ( diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 8d8b1024d..369784f78 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -449,7 +449,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic vlen_t vector_body_length = vinsn_issue_q.vl - vinsn_issue_q.vstart; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -465,7 +467,12 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Store the result in the result queue result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + + ( + ( vinsn_issue_q.vl - issue_cnt_q ) // vstart is already considered in issue_cnt_q + >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew) + ) + ); result_queue_d[result_queue_write_pnt_q].id = vinsn_issue_q.id; result_queue_d[result_queue_write_pnt_q].mask = vinsn_issue_q.vfu == VFU_MaskUnit; if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q) @@ -474,7 +481,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Is this a narrowing instruction? if (narrowing(vinsn_issue_q.op)) begin // How many elements did we calculate in this iteration? 
- automatic logic [3:0] element_cnt_narrow = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))) / 2; + automatic logic [3:0] element_cnt_narrow = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))) / 2; if (element_cnt_narrow > issue_cnt_q) element_cnt_narrow = issue_cnt_q; @@ -523,12 +530,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end end @@ -547,7 +557,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -654,12 +664,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -690,12 +703,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += 
|vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -750,8 +766,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (int'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; end // Finished committing the results of a vector instruction @@ -765,16 +781,20 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) + if (vinsn_queue_d.commit_cnt != '0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; + commit_cnt_d = vector_body_length; else begin // We are asking for bits, and we want at least one chunk of bits if // vl > 0. 
Therefore, commit_cnt = ceil((vl / 8) >> sew) - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + commit_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; + commit_cnt_d += |vector_body_length[2:0]; end + end // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instructions starts after the reduction commits @@ -796,14 +816,18 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (!vinsn_queue_full && vfu_operation_valid_i && (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + automatic vlen_t vector_body_length = vfu_operation_i.vl - vfu_operation_i.vstart; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions + // TODO: check if vector_body_length should be used instead of plain vl here vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing if ((vinsn_queue_d.issue_cnt == '0) && !prevent_commit) begin + alu_state_d = is_reduction(vfu_operation_i.op) ?
INTRA_LANE_REDUCTION : NO_REDUCTION; // The next will be the first operation of this instruction // This information is useful for reduction operation @@ -812,22 +836,24 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; reduction_rx_cnt_d = reduction_rx_cnt_init(NrLanes, lane_id_i); sldu_transactions_cnt_d = $clog2(NrLanes) + 1; - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end if (vinsn_queue_d.commit_cnt == '0) if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; + commit_cnt_d = vector_body_length; else begin + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; + commit_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; + commit_cnt_d += |vector_body_length[2:0]; end // Bump pointers and counters of the vector instruction queue diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index 0ea8c52b9..eaf66bba1 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -65,6 +65,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_operand_ready_o ); + localparam unsigned DataWidth = $bits(elen_t); + localparam unsigned DataWidthB = DataWidth / 8; /////////////////// // Assignments // @@ -272,7 +274,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( endcase // Load element counter - 
idx_op_cnt_d = pe_req_i.vl; + idx_op_cnt_d = pe_req_i.vl - pe_req_i.vstart; end default: state_d = ADDRGEN; endcase // pe_req_i.op @@ -300,10 +302,23 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( addrgen_exception_o.tval = '0; end : eew_misaligned_error else begin : address_valid + // NOTE: indexed operations are not covered here + automatic logic [riscv::VLEN-1:0] vaddr_start; + + case ( pe_req_q.op ) + // Unit-stride: address = base + (vstart in elements, scaled to bytes by the element width) + VLE, VSE : vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew) ); + // Strided: address = base + (vstart * stride) + // NOTE: this multiplier might cause some timing issues + VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart * pe_req_q.stride ) ; + // Indexed: let the next stage take care of vstart + VLXE, VSXE: vaddr_start = pe_req_q.scalar_op; + default : vaddr_start = '0; + endcase // pe_req_q.op addrgen_req = '{ - addr : pe_req_q.scalar_op, - len : pe_req_q.vl , + addr : vaddr_start, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -329,12 +344,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end : ADDRGEN ADDRGEN_IDX_OP: begin : ADDRGEN_IDX_OP + // NOTE: vstart is not supported for indexed operations + // the logic should be introduced: + // 1. in the addrgen_operand_i operand read + // 2.
in idx_vaddr computation + automatic logic [NrLanes-1:0] addrgen_operand_valid; + // Stall the interface until the operation is over to catch possible exceptions // Every address can generate an exception addrgen_req = '{ addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -343,10 +364,24 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; + // Adjust valid signals to the next block "operands_ready" + addrgen_operand_valid = addrgen_operand_valid_i; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with fewer bytes than the maximum to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( ( idx_op_cnt_q << pe_req_q.vtype.vsew ) < (NrLanes * DataWidthB) ) + & ( lane < pe_req_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + addrgen_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + // TODO: apply the same vstart logic also to mask_valid_i + // Handle handshake and data between VRF and spill register // We accept all the incoming data, without any checks // since Ara stalls on an indexed memory operation - if (&addrgen_operand_valid_i & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin + if (&addrgen_operand_valid & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin // Valid data for the spill register idx_vaddr_valid_d = 1'b1; @@ -388,6 +423,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Consumed one element idx_op_cnt_d = idx_op_cnt_q - 1; // Have we finished a full NrLanes*64b word?
+ // TODO: check for the need of vstart logic here if (elm_ptr_q == last_elm_subw_q) begin // Bump lane pointer elm_ptr_d = '0; @@ -448,14 +484,12 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( addrgen_exception_o = mmu_exception_q; end end : WAIT_LAST_TRANSLATION - endcase // state_q if ( addrgen_exception_o.valid & addrgen_ack_o ) begin addrgen_exception_load_o = is_load(pe_req_q.op); addrgen_exception_store_o = !is_load(pe_req_q.op); end - end : addr_generation always_ff @(posedge clk_i or negedge rst_ni) begin @@ -936,6 +970,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end : start_req end : axi_ax_idle end : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + AXI_ADDRGEN_WAIT_TRANSLATION : begin : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION // keep request high mmu_req_o = 1'b1; diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 7c49f3af6..467ae4a70 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -193,6 +193,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // - A pointer to which byte in the full VRF word we are writing data into. 
logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; + localparam unsigned DataWidthB = DataWidth / 8; + + vlen_t vstart_lane; + always_comb begin: p_vldu // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -233,7 +237,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // - The Address Generator sent us the data about the corresponding AR beat // - There is place in the result queue to write the data read from the R channel if (axi_r_valid_i && axi_addrgen_req_valid_i - && axi_addrgen_req_i.is_load && !result_queue_full) begin + && axi_addrgen_req_i.is_load && !result_queue_full) begin : axi_r_beat_read // Bytes valid in the current R beat // If non-unit strided load, we do not progress within the beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, @@ -244,42 +248,47 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Is there a vector instruction ready to be issued? // Do we have the operands for it? if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin : operands_valid - // TODO: add vstart here (use issue/commit_cnt_bytes_q) // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = (NrLanes * 8) - vrf_word_byte_pnt_q; + automatic vlen_t vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_word_byte_pnt_q; // How many bytes are valid in this instruction automatic vlen_t vinsn_valid_bytes = issue_cnt_bytes_q - vrf_word_byte_pnt_q; // How many bytes are valid in this AXI word automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - axi_r_byte_pnt_q + 1; + // How many bytes are we committing? automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = ( issue_cnt_bytes_q < (NrLanes * 8) ) ? vinsn_valid_bytes : vrf_valid_bytes; - // valid_bytes = ( valid_bytes < axi_valid_bytes ) ? 
valid_bytes : axi_valid_bytes; - if ( valid_bytes >= axi_valid_bytes ) begin : valid_bytes_overflow - valid_bytes = axi_valid_bytes; - end : valid_bytes_overflow + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; + // Bump R beat and VRF word pointers axi_r_byte_pnt_d = axi_r_byte_pnt_q + valid_bytes; vrf_word_byte_pnt_d = vrf_word_byte_pnt_q + valid_bytes; // Copy data from the R channel into the result queue - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue // Is this byte a valid byte in the R beat? if ( ( axi_byte >= ( lower_byte + axi_r_byte_pnt_q ) ) && ( axi_byte <= upper_byte ) ) begin : is_axi_r_byte // Map axi_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q; + automatic int unsigned vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q; // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); + automatic int unsigned vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * 8)) begin : is_vrf_byte + if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * DataWidthB)) begin : is_vrf_byte // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? 
- automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; + automatic int unsigned vrf_offset = vrf_byte[2:0]; + // Consider also vstart and make sure this index wraps around the number of lanes + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + // Copy data and byte strobe result_queue_d[result_queue_write_pnt_q][vrf_lane].wdata[8*vrf_offset +: 8] = @@ -290,24 +299,37 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( end : is_axi_r_byte end : axi_r_to_result_queue - // Initialize id and addr fields of the result queue requests - for (int lane = 0; lane < NrLanes; lane++) begin + for (int unsigned lane = 0; lane < NrLanes; lane++) begin : compute_vrf_addr + automatic vlen_t issue_cnt_elems; + // elements per lane (each lane processes num elements / NrLanes) + automatic vlen_t elem_left_per_lane; + // 64-bit aligned address + automatic vlen_t lane_word_offset; + // How many elements in the vector body + automatic vlen_t elem_body_count; + // vstart value local to the lane + automatic vlen_t vstart_lane; + + // Compute VRF chunk address per lane + elem_body_count = vinsn_issue_q.vl - vinsn_issue_q.vstart; + issue_cnt_elems = issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew); + elem_left_per_lane = ( elem_body_count - issue_cnt_elems ) / NrLanes; + lane_word_offset = elem_left_per_lane >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + + vstart_lane = vinsn_issue_q.vstart / NrLanes; + // If lane_id < (vstart % NrLanes), this lane needs to execute one micro-operation less.
+ if ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) begin : vstart_lane_adjust + vstart_lane += 1; + end : vstart_lane_adjust + + // Store in result queue + result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + lane_word_offset + vstart_lane; result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = - vaddr(vinsn_issue_q.vd, NrLanes) + // base address of vd - ( - ( - ( - (vinsn_issue_q.vl) - // total number of elements to be processed - (issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew)) // elements left (issue_cnt_bytes_q is in bytes, so we shift rx by EEW) - ) / NrLanes // elements per lane (each lane processes num elements / NrLanes) - ) >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)) // 64-bit aligned address - ); // final offset to vd - end + end : compute_vrf_addr end : operands_valid // We have a word ready to be sent to the lanes - if (vrf_word_byte_pnt_d == NrLanes*8 || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin + if (vrf_word_byte_pnt_d == (NrLanes * DataWidthB) || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin : vrf_word_ready // Increment result queue pointers and counters result_queue_cnt_d += 1; if (result_queue_write_pnt_q == ResultQueueDepth-1) begin : result_queue_write_pnt_overflow @@ -318,6 +340,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( end : result_queue_write_pnt_increment // Trigger the request signal + // TODO: check if triggering all lanes is actually necessary here result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; // Acknowledge the mask operands @@ -326,11 +349,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Reset the pointer in the VRF word vrf_word_byte_pnt_d = '0; // Account for the results that were issued - issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q) - if (issue_cnt_bytes_q < (NrLanes * 8)) 
begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow issue_cnt_bytes_d = '0; end : issue_cnt_bytes_overflow - end + end : vrf_word_ready // Consumed all valid bytes in this R beat if ( ( axi_r_byte_pnt_d == ( upper_byte - lower_byte + 1 ) ) || ( issue_cnt_bytes_d == '0 ) ) begin : axi_r_beat_finish @@ -363,17 +386,19 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Prepare for the next vector instruction if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update - issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + issue_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); end : issue_cnt_bytes_update end : vrf_results_finish - end + end : axi_r_beat_read ////////////////////////////////// // Write results into the VRF // ////////////////////////////////// - for (int lane = 0; lane < NrLanes; lane++) begin: vrf_result_write + for (int unsigned lane = 0; lane < NrLanes; lane++) begin: vrf_result_write ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; @@ -396,7 +421,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // All lanes accepted the VRF request // Wait for all the final grants, to be sure that all the results were written back if (!(|result_queue_valid_d[result_queue_read_pnt_q]) && - (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * 8))) begin + (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * DataWidthB))) begin : wait_for_write_back // There is something waiting to be written if (!result_queue_empty) begin : result_available // Increment the read pointer @@ 
-411,12 +436,12 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( result_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q) - if (commit_cnt_bytes_q < (NrLanes * 8)) begin : commit_cnt_bytes_overflow + commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * DataWidthB); + if (commit_cnt_bytes_q < (NrLanes * DataWidthB)) begin : commit_cnt_bytes_overflow commit_cnt_bytes_d = '0; end : commit_cnt_bytes_overflow end : result_available - end + end : wait_for_write_back // Finished committing the results of a vector instruction if (vinsn_commit_valid && commit_cnt_bytes_d == '0) begin : vinsn_done @@ -435,11 +460,13 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Update the commit counter for the next instruction if (vinsn_queue_d.commit_cnt != '0) - commit_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl - ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); + commit_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); end : vinsn_done - // Ack back exceptions + // Clear instruction queue in case of exceptions from addrgen if ( addrgen_exception_valid_i ) begin : exception // Signal done to sequencer pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; @@ -451,45 +478,44 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_LoadUnit) begin + pe_req_i.vfu == VFU_LoadUnit) begin : pe_req_valid vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - // TODO(bug fix): add masking logic (stores are not idempotent!) 
if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init - issue_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew); + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); end : issue_cnt_bytes_init if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init - commit_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew); + commit_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); end : commit_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : pe_req_valid end: p_vldu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - issue_cnt_bytes_q <= '0; - commit_cnt_bytes_q <= '0; - axi_len_q <= '0; - axi_r_byte_pnt_q <= '0; - vrf_word_byte_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; + vinsn_running_q <= '0; + issue_cnt_bytes_q <= '0; + commit_cnt_bytes_q <= '0; + axi_len_q <= '0; + axi_r_byte_pnt_q <= '0; + vrf_word_byte_pnt_q <= '0; + pe_resp_o <= '0; + result_final_gnt_q <= '0; end else begin - vinsn_running_q <= vinsn_running_d; - issue_cnt_bytes_q <= issue_cnt_bytes_d; - commit_cnt_bytes_q <= commit_cnt_bytes_d; - axi_len_q <= axi_len_d; - axi_r_byte_pnt_q <= axi_r_byte_pnt_d; - vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; - pe_resp_o <= pe_resp_d; - result_final_gnt_q <= result_final_gnt_d; + vinsn_running_q <= vinsn_running_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; + commit_cnt_bytes_q <= commit_cnt_bytes_d; + axi_len_q <= axi_len_d; + axi_r_byte_pnt_q <= axi_r_byte_pnt_d; + vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; + pe_resp_o <= pe_resp_d; + result_final_gnt_q <= result_final_gnt_d; end end diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 9580f59b0..f6e5e38ca 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -46,6 +46,7 @@ module 
vstu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -63,12 +64,14 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( import axi_pkg::beat_upper_byte; import axi_pkg::BURST_INCR; + localparam unsigned DataWidthB = DataWidth / 8; + /////////////////////// // Spill registers // /////////////////////// elen_t [NrLanes-1:0] stu_operand; - logic [NrLanes-1:0] stu_operand_valid; + logic [NrLanes-1:0] stu_operand_valid_lanes; logic stu_operand_ready; for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_regs @@ -83,7 +86,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( .valid_i (stu_operand_valid_i[lane]), .ready_o (stu_operand_ready_o[lane]), .data_o (stu_operand[lane] ), - .valid_o (stu_operand_valid[lane] ), + .valid_o (stu_operand_valid_lanes[lane] ), .ready_i (stu_operand_ready ) ); end: gen_regs @@ -153,30 +156,47 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // Store Unit // ////////////////// + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + int unsigned vrf_seq_byte; + int unsigned vrf_byte ; + vlen_t vrf_valid_bytes ; + vlen_t vinsn_valid_bytes; + vlen_t axi_valid_bytes ; + logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + + // Vector instructions currently running logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Pointers // // We need several pointers to copy data to the memory interface // from the VRF.
Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the full VRF word we are reading data from. logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; always_comb begin: p_vstu + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + vrf_seq_byte = '0; + vrf_byte = '0; + vrf_valid_bytes = '0; + vinsn_valid_bytes = '0; + axi_valid_bytes = '0; + valid_bytes = '0; + // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; - len_d = len_q; + axi_len_d = axi_len_q; vrf_pnt_d = vrf_pnt_q; // Vector instructions currently running @@ -184,7 +204,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_w_o = '0; axi_w_valid_o = 1'b0; axi_b_ready_o = 1'b0; @@ -204,92 +224,130 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // - We received all the operands from the lanes // - The address generator generated an AXI AW request for this write beat // - The AXI subsystem is ready to accept this W beat - if (vinsn_issue_valid && &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i)) && - axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin + if (vinsn_issue_valid && + axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin : issue_valid // Bytes valid in the current W beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR,
AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + + // For non-zero vstart values, the last operand read is not going to involve all the lanes + automatic logic [NrLanes-1:0] stu_operand_valid; + automatic logic [NrLanes-1:0] mask_valid; + // How many bytes are we committing? + // automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + vinsn_valid_bytes = issue_cnt_bytes_q - vrf_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte + 1; - - // How many bytes are we committing? - automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; - - vrf_pnt_d = vrf_pnt_q + valid_bytes; - - // Copy data from the operands into the W channel - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin - // Is this byte a valid byte in the W beat? - if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin - // Map axy_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; - // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); - - // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q) begin - // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? 
- automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; - - // Copy data - axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; - axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + axi_valid_bytes = upper_byte - lower_byte + 1; + + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; + + // Adjust valid signals to the next block "operands_ready" + stu_operand_valid = stu_operand_valid_lanes; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with fewer bytes than the maximum to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) + & ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + stu_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + + // TODO: apply the same vstart logic also to mask_valid_i + // For now, assume (vstart % NrLanes == 0) + mask_valid = mask_valid_i; + + // Wait for all expected operands from the lanes + if ( &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i) ) ) begin : operands_ready + vrf_pnt_d = vrf_pnt_q + valid_bytes; + + // Copy data from the operands into the W channel + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : stu_operand_to_axi_w + // Is this byte a valid byte in the W beat? + if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin + // Map axi_byte to the corresponding byte in the VRF word (sequential) + vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; + // And then shuffle it + vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); + + // Is this byte a valid byte in the VRF word?
+ if (vrf_seq_byte < issue_cnt_bytes_q) begin + // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? + automatic int unsigned vrf_offset = vrf_byte[2:0]; + + // Consider also vstart and make sure this index wraps around the number of lane + // automatic logic [$clog2(NrLanes)-1:0] vrf_lane = (vrf_byte >> 3) + vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + + // Copy data + axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; + axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + end end - end - end - - // Send the W beat - axi_w_valid_o = 1'b1; - // Account for the beat we sent - len_d = len_q + 1; - // We wrote all the beats for this AW burst - if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin - axi_w_o.last = 1'b1; - // Ask for another burst by the address generator - axi_addrgen_req_ready_o = 1'b1; - // Reset AXI pointers - len_d = '0; - end - - // We consumed a whole word from the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin - // Reset the pointer in the VRF word - vrf_pnt_d = '0; - // Acknowledge the operands with the lanes - stu_operand_ready = '1; - // Acknowledge the mask operand - mask_ready_o = !vinsn_issue_q.vm; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; - end - end + end : stu_operand_to_axi_w + + // Send the W beat + axi_w_valid_o = 1'b1; + // Account for the beat we sent + axi_len_d = axi_len_q + 1; + // We wrote all the beats for this AW burst + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : beats_complete + axi_w_o.last = 
1'b1; + // Ask for another burst by the address generator + axi_addrgen_req_ready_o = 1'b1; + // Reset AXI pointers + axi_len_d = '0; + end : beats_complete + + // We consumed a whole word from the lanes + if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_bytes_q) begin : vrf_word_done + // Reset the pointer in the VRF word + vrf_pnt_d = '0; + // Acknowledge the operands with the lanes + stu_operand_ready = '1; + // Acknowledge the mask operand + mask_ready_o = !vinsn_issue_q.vm; + // Account for the results that were issued + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow + end : vrf_word_done + end : operands_ready + end : issue_valid // Finished issuing W beats for this vector store - if (vinsn_issue_valid && issue_cnt_d == 0) begin + if (vinsn_issue_valid && issue_cnt_bytes_d == 0) begin : axi_w_beat_finish // Bump issue counters and pointers of the vector instruction queue vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = 0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << - int'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : axi_w_beat_finish //////////////////////////// // Handle the B channel // @@ -297,63 +355,66 @@ module vstu import ara_pkg::*; import rvv_pkg::*; 
#( // TODO: We cannot handle errors on the B channel. // We just acknowledge any AXI requests that come on the B channel. - if (axi_b_valid_i) begin + if (axi_b_valid_i) begin : axi_b_valid // Acknowledge the B beat axi_b_ready_o = 1'b1; // Mark the vector instruction as being done - if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin + if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin : instr_done // Signal complete store store_complete_o = 1'b1; - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Update the commit counters and pointers vinsn_queue_d.commit_cnt -= 1; - if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) + if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow vinsn_queue_d.commit_pnt = '0; - else + end : commit_pnt_overflow + else begin : commit_pnt_increment vinsn_queue_d.commit_pnt += 1; - end - end + end : commit_pnt_increment + end : instr_done + end : axi_b_valid ////////////////////////////// // Accept new instruction // ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_StoreUnit) begin + pe_req_i.vfu == VFU_StoreUnit) begin : issue_cnt_bytes_init vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : issue_cnt_bytes_init end: p_vstu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin vinsn_running_q <= '0; - issue_cnt_q <= '0; + issue_cnt_bytes_q <= '0; - 
len_q <= '0; + axi_len_q <= '0; vrf_pnt_q <= '0; pe_resp_o <= '0; end else begin vinsn_running_q <= vinsn_running_d; - issue_cnt_q <= issue_cnt_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; - len_q <= len_d; + axi_len_q <= axi_len_d; vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; + pe_resp_o <= pe_resp_d; end end