From 11ceb5158236ba2bf178e48da4395eb9c1fb245f Mon Sep 17 00:00:00 2001 From: Vincenzo Maisto Date: Wed, 18 Oct 2023 14:24:25 +0200 Subject: [PATCH] Extensions and bug fixes * Stall CSR operations if there is a pending vector instruction * Set vstart=0 for succesful vector instructions * Extend and fix Ara exception reporting from VLSU * Add MMU interface (just mock) * Refactoring --- hardware/include/ara_pkg.sv | 12 +- hardware/src/ara.sv | 43 +- hardware/src/ara_dispatcher.sv | 1225 +++++++++++++++++--------------- hardware/src/ara_sequencer.sv | 8 +- hardware/src/vlsu/addrgen.sv | 300 ++++---- hardware/src/vlsu/vldu.sv | 212 +++--- hardware/src/vlsu/vlsu.sv | 54 +- hardware/src/vlsu/vstu.sv | 1 + 8 files changed, 1035 insertions(+), 820 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 1f8e50cfa..e3d0c8753 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -155,17 +155,17 @@ package ara_pkg; } ara_op_e; // Return true if op is a load operation - function automatic is_load(ara_op_e op); + function automatic logic is_load(ara_op_e op); is_load = op inside {[VLE:VLXE]}; endfunction : is_load // Return true if op is a store operation - function automatic is_store(ara_op_e op); + function automatic logic is_store(ara_op_e op); is_store = op inside {[VSE:VSXE]}; endfunction : is_store // Return true of op is either VCPOP or VFIRST - function automatic vd_scalar(ara_op_e op); + function automatic logic vd_scalar(ara_op_e op); vd_scalar = op inside {[VCPOP:VFIRST]}; endfunction : vd_scalar @@ -322,11 +322,11 @@ package ara_pkg; // Scalar response elen_t resp; - // Instruction triggered an error - logic error; + // Instruction triggered an exception + ariane_pkg::exception_t exception; // New value for vstart - vlen_t error_vl; + vlen_t exception_vstart; } ara_resp_t; //////////////////// diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 0583d1eea..b9e70b3e6 100644 --- a/hardware/src/ara.sv +++ 
b/hardware/src/ara.sv @@ -37,6 +37,25 @@ module ara import ara_pkg::*; #( input logic scan_enable_i, input logic scan_data_i, output logic scan_data_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Interface with Ariane input accelerator_req_t acc_req_i, output accelerator_resp_t acc_resp_o, @@ -123,8 +142,8 @@ module ara import ara_pkg::*; #( pe_resp_t [NrPEs-1:0] pe_resp; // Interface with the address generator logic addrgen_ack; - logic addrgen_error; - vlen_t addrgen_error_vl; + ariane_pkg::exception_t addrgen_exception; + vlen_t addrgen_exception_vstart; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -171,8 +190,8 @@ module ara import ara_pkg::*; #( .pe_scalar_resp_ready_o(pe_scalar_resp_ready ), // Interface with the address generator .addrgen_ack_i (addrgen_ack ), - .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_exception_i (addrgen_exception ), + .addrgen_exception_vstart_i(addrgen_exception_vstart ) ); // Scalar move support @@ -337,8 +356,8 @@ module ara import 
ara_pkg::*; #( .pe_req_ready_o (pe_req_ready[NrLanes+OffsetStore : NrLanes+OffsetLoad]), .pe_resp_o (pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ), .addrgen_ack_o (addrgen_ack ), - .addrgen_error_o (addrgen_error ), - .addrgen_error_vl_o (addrgen_error_vl ), + .addrgen_exception_o (addrgen_exception ), + .addrgen_exception_vstart_o (addrgen_exception_vstart ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -354,6 +373,18 @@ module ara import ara_pkg::*; #( .addrgen_operand_target_fu_i(sldu_addrgen_operand_target_fu ), .addrgen_operand_valid_i (sldu_addrgen_operand_valid ), .addrgen_operand_ready_o (addrgen_operand_ready ), + // CSR input + .en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + .mmu_misaligned_ex_o , + .mmu_req_o , + .mmu_vaddr_o , + .mmu_is_store_o , + .mmu_dtlb_hit_i , + .mmu_dtlb_ppn_i , + .mmu_valid_i , + .mmu_paddr_i , + .mmu_exception_i , // Load unit .ldu_result_req_o (ldu_result_req ), .ldu_result_addr_o (ldu_result_addr ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 8471fb391..d9d803e71 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -53,17 +53,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // CSRs // //////////// - vlen_t vstart_d, vstart_q; - vlen_t vl_d, vl_q; - vtype_t vtype_d, vtype_q; - vxsat_e vxsat_d, vxsat_q; - vxrm_t vxrm_d, vxrm_q; - - `FF(vstart_q, vstart_d, '0) - `FF(vl_q, vl_d, '0) - `FF(vtype_q, vtype_d, '{vill: 1'b1, default: '0}) - `FF(vxsat_q, vxsat_d, '0) - `FF(vxrm_q, vxrm_d, '0) + vlen_t csr_vstart_d, csr_vstart_q; + vlen_t csr_vl_d, csr_vl_q; + vtype_t csr_vtype_d, csr_vtype_q; + vxsat_e csr_vxsat_d, csr_vxsat_q; + vxrm_t csr_vxrm_d, csr_vxrm_q; + + `FF(csr_vstart_q, csr_vstart_d, '0) + `FF(csr_vl_q, csr_vl_d, '0) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vxsat_q, csr_vxsat_d, '0) + `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal 
representation of `vtype_t` and the full XLEN-bit CSR. function automatic riscv::xlen_t xlen_vtype(vtype_t vtype); xlen_vtype = {vtype.vill, {riscv::XLEN-9{1'b0}}, vtype.vma, vtype.vta, vtype.vsew, @@ -134,7 +134,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( NORMAL_OPERATION, WAIT_IDLE, RESHUFFLE, - SLDU_SEQUENCER + SLDU_SEQUENCER // NOTE: this is never used! } state_e; state_e state_d, state_q; @@ -193,9 +193,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // its counters of pending memory operations // Ara should tell Ariane when a memory operation is completed, so that it can modify // its pending load/store counters. - // A memory operation can be completed both when it is over and when vl_q == 0. In the latter case, + // A memory operation can be completed both when it is over and when csr_vl_q == 0. In the latter case, // Ara's decoder answers immediately, and this can cause a collision with an answer from Ara's VLSU. - // To avoid collisions, we give precedence to the VLSU, and we delay the vl_q == 0 memory op + // To avoid collisions, we give precedence to the VLSU, and we delay the csr_vl_q == 0 memory op // completion signal if a collision occurs logic load_zero_vl, store_zero_vl; // Do not checks vregs validity against current LMUL @@ -205,14 +205,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( logic is_decoding; // Is this an in-lane operation? 
logic in_lane_op; - // If the vslideup offset is greater than vl_q, the vslideup has no effects + // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects logic null_vslideup; // Pipeline the VLSU's load and store complete signals, for timing reasons logic load_complete_q; logic store_complete_q; - `FF(load_complete_q, load_complete_i, 1'b0) - `FF(store_complete_q, store_complete_i, 1'b0) + logic illegal_insn_load, illegal_insn_store; + `FF(load_complete_q, load_complete_i || illegal_insn_load, 1'b0) + `FF(store_complete_q, store_complete_i || illegal_insn_store, 1'b0) // NP2 Slide support logic is_stride_np2; @@ -236,14 +237,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_decoder // Default values - vstart_d = vstart_q; - vl_d = vl_q; - vtype_d = vtype_q; + csr_vstart_d = csr_vstart_q; + csr_vl_d = csr_vl_q; + csr_vtype_d = csr_vtype_q; state_d = state_q; eew_d = eew_q; eew_valid_d = eew_valid_q; - lmul_vs2 = vtype_q.vlmul; - lmul_vs1 = vtype_q.vlmul; + lmul_vs2 = csr_vtype_q.vlmul; + lmul_vs1 = csr_vtype_q.vlmul; reshuffle_req_d = reshuffle_req_q; eew_old_buffer_d = eew_old_buffer_q; @@ -255,8 +256,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_mask_request_d = 1'b0; illegal_insn = 1'b0; - vxsat_d = vxsat_q; - vxrm_d = vxrm_q; + illegal_insn_load = 1'b0; + illegal_insn_store = 1'b0; + csr_vxsat_d = csr_vxsat_q; + csr_vxrm_d = csr_vxrm_q; is_vload = 1'b0; is_vstore = 1'b0; @@ -271,8 +274,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_decoding = 1'b0; in_lane_op = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; acc_resp_o = '{ trans_id : acc_req_i.trans_id, load_complete : load_zero_vl | load_complete_q, @@ -281,18 +282,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( fflags_valid : |fflags_ex_valid_i, default : '0 }; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // fflags for (int lane = 0; lane 
< NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; ara_req_d = '{ - vl : vl_q, - vstart : vstart_q, - vtype : vtype_q, - emul : vtype_q.vlmul, - eew_vs1 : vtype_q.vsew, - eew_vs2 : vtype_q.vsew, - eew_vd_op : vtype_q.vsew, + vl : csr_vl_q, + vstart : csr_vstart_q, + vtype : csr_vtype_q, + emul : csr_vtype_q.vlmul, + eew_vs1 : csr_vtype_q.vsew, + eew_vs2 : csr_vtype_q.vsew, + eew_vd_op : csr_vtype_q.vsew, eew_vmask : eew_q[VMASK], cvt_resize : CVT_SAME, default : '0 @@ -303,9 +306,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b0; // Saturation in any lane will raise vxsat flag - vxsat_d |= |vxsat_flag_i; + csr_vxsat_d |= |vxsat_flag_i; // Fixed-point rounding mode is applied to all lanes - for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = vxrm_q; + for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = csr_vxrm_q; // Rounding mode is shared between all lanes for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; // Special states @@ -420,14 +423,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end end - endcase + endcase // state_q - if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin - if (acc_req_i.req_valid && ara_req_ready_i && acc_req_i.resp_ready) begin + if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin : not_reshuffling + if (acc_req_i.req_valid && ara_req_ready_i && acc_req_i.resp_ready) begin : ready // Decoding is_decoding = 1'b1; // Acknowledge the request - acc_resp_o.req_ready = ara_req_ready_i; + acc_resp_o.req_ready = 1'b1; // Decode the instructions based on their opcode unique case (acc_req_i.insn.itype.opcode) @@ -435,11 +438,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Vector Arithmetic instructions // ////////////////////////////////////// - riscv::OpcodeVec: begin + riscv::OpcodeVec: begin : OpcodeVec // Instruction is of one of the RVV types automatic rvv_instruction_t 
insn = rvv_instruction_t'(acc_req_i.insn.instr); - // These always respond at the same cycle + // These (mostly) always respond at the same cycle acc_resp_o.resp_valid = 1'b1; // Decode based on their func3 field @@ -447,33 +450,34 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Configuration instructions OPCFG: begin: opcfg // These can be acknowledged regardless of the state of Ara - acc_resp_o.req_ready = 1'b1; + // NOTE: unless there is a pending fault-only first vector load + // acc_resp_o.req_ready = 1'b1; is_config = 1'b1; // Update vtype if (insn.vsetvli_type.func1 == 1'b0) begin // vsetvli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); end else if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); end else if (insn.vsetvl_type.func7 == 7'b100_0000) begin // vsetvl - vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); end else - acc_resp_o.error = 1'b1; + illegal_insn = 1'b1; // Check whether the updated vtype makes sense - if ((vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN - (vtype_d.vlmul == LMUL_RSVD) || // reserved value + if ((csr_vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN + (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN - (signed'($clog2(ELENB)) + signed'(vtype_d.vlmul) < signed'(vtype_d.vsew))) begin - vtype_d = '{vill: 1'b1, default: '0}; - vl_d = '0; + (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin + csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vl_d = '0; end // Update the vector length else begin // Maximum vector length. VLMAX = LMUL * VLEN / SEW. 
- automatic int unsigned vlmax = VLENB >> vtype_d.vsew; - unique case (vtype_d.vlmul) + automatic int unsigned vlmax = VLENB >> csr_vtype_d.vsew; + unique case (csr_vtype_d.vlmul) LMUL_1 : vlmax <<= 0; LMUL_2 : vlmax <<= 1; LMUL_4 : vlmax <<= 2; @@ -486,24 +490,24 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vl_d = vlen_t'(insn.vsetivli_type.uimm5); + csr_vl_d = vlen_t'(insn.vsetivli_type.uimm5); end else begin // vsetvl || vsetvli if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd == '0) begin // Do not update the vector length - vl_d = vl_q; + csr_vl_d = csr_vl_q; end else if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd != '0) begin // Set the vector length to vlmax - vl_d = vlmax; + csr_vl_d = vlmax; end else begin // Normal stripmining - vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(vl_d)]) || + csr_vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(csr_vl_d)]) || (vlen_t'(acc_req_i.rs1) > vlmax)) ? vlmax : vlen_t'(acc_req_i.rs1); end end end // Return the new vl - acc_resp_o.result = vl_d; + acc_resp_o.result = csr_vl_d; // If the vtype has changed, wait for the backend before issuing any new instructions. // This is to avoid hazards on implicit register labels when LMUL_old > LMUL_new @@ -511,7 +515,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Checking only lmul_q is a trick: we want to stall only if both lmuls have // zero MSB. 
If lmul_q has zero MSB, it's greater than lmul_d only if also // lmul_d has zero MSB since the slice comparison is intrinsically unsigned - if (!vtype_q.vlmul[2] && (vtype_d.vlmul[2:0] < vtype_q.vlmul[2:0])) + if (!csr_vtype_q.vlmul[2] && (csr_vtype_d.vlmul[2:0] < csr_vtype_q.vlmul[2:0])) state_d = WAIT_IDLE; end @@ -631,7 +635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.varith_type.vm) begin ara_req_d.eew_vs1 = eew_q[ara_req_d.vs1]; ara_req_d.vtype.vsew = eew_q[ara_req_d.vs1]; - ara_req_d.vl = (vl_q << vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; + ara_req_d.vl = (csr_vl_q << csr_vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; end end 6'b100000: ara_req_d.op = ara_pkg::VSADDU; @@ -647,11 +651,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -665,11 +669,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -682,28 +686,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end // Reductions encode in cvt_resize the neutral value bits // CVT_WIDE is 2'b00 (hack to save wires) 6'b110000: begin ara_req_d.op = ara_pkg::VWREDSUMU; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -727,7 +731,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVX: begin: opivx @@ -757,7 +761,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; 
ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Vl refers to current system vsew, but operand requesters @@ -765,13 +769,13 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // i.e., request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -880,11 +884,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -898,11 +902,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -915,11 +919,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -937,7 +941,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVI: begin: opivi @@ -965,19 +969,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] 
|| - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -1090,11 +1094,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1108,11 +1112,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1125,11 +1129,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -1147,7 +1151,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVV: begin: opmvv @@ -1236,7 +1240,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Sign extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW8: begin ara_req_d.conversion_vs2 = OpQueueConversionSExt8; end @@ -1250,13 +1254,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b010100: begin ara_req_d.use_vd_op = 1'b1; @@ -1356,8 +1361,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < 
int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00011: begin // VSEXT.VF8 @@ -1366,44 +1371,44 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00100: begin // VZEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionZExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00101: begin // VSEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionSExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00110: begin // VZEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end 5'b00111: 
begin // VSEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end default: illegal_insn = 1'b1; @@ -1443,92 +1448,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1536,31 +1541,31 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - 
ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1594,7 +1599,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVX: begin: opmvx @@ -1619,17 +1624,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - 
ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -1637,7 +1642,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.x ara_req_d.op = ara_pkg::VMVSX; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -1675,92 +1680,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; 
ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - 
ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1768,41 +1773,41 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1830,7 +1835,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPFVV: begin: opfvv @@ -1900,7 +1905,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Zero-extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin 
ara_req_d.conversion_vs2 = OpQueueConversionZExt4; end @@ -1911,13 +1916,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b011000: ara_req_d.op = ara_pkg::VMFEQ; 6'b011001: ara_req_d.op = ara_pkg::VMFLE; @@ -1938,96 +1944,95 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b10000: begin // Narrowing VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10001: begin // Narrowing VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10010: begin // Narrowing VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10011: begin // Narrowing VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); 
end 5'b10100: begin // Narrowing VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10101: begin // Narrowing VFNCVTRODFF ara_req_d.op = VFNCVTRODFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10110: begin // Narrowing VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10111: begin // Narrowing VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: begin // Trigger an error - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase end @@ -2090,99 +2095,99 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; 
ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -2238,7 +2243,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end @@ -2277,17 +2282,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vfslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -2295,7 +2300,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.f ara_req_d.op = ara_pkg::VFMVSF; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 
1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -2356,85 +2361,85 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = 
vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase // Check if the FP scalar operand is NaN-boxed. If not, replace it with a NaN. - case (vtype_q.vsew) + case (csr_vtype_q.vsew) EW16: if (~(&acc_req_i.rs1[63:16])) ara_req_d.scalar_op = 64'h0000000000007e00; EW32: if (~(&acc_req_i.rs1[63:32])) ara_req_d.scalar_op = 64'h000000007fc00000; endcase @@ -2477,17 +2482,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end endcase - end + end : OpcodeVec //////////////////// // Vector Loads // //////////////////// - riscv::OpcodeLoadFp: begin + riscv::OpcodeLoadFp: begin : OpcodeLoadFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2511,7 +2516,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2519,7 +2524,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2527,7 +2532,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2535,15 +2540,15 @@ module ara_dispatcher import 
ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; + ara_req_valid_d = 1'b0; end endcase @@ -2558,19 +2563,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask load, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end 5'b10000: begin // Unit-strided, fault-only first // TODO: Not implemented - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end endcase end @@ -2590,24 +2591,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + 
illegal_insn_load = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_load = 1'b1; end end default:; @@ -2617,20 +2616,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default:; endcase @@ -2640,9 +2635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself - illegal_insn = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; + illegal_insn_load = 1'b0; ara_req_valid_d = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. 
@@ -2666,22 +2659,23 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_load = 1'b1; end endcase end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin : ara_resp_valid acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // In case of error, modify vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vstart; + end : exception + end : ara_resp_valid + end : OpcodeLoadFp ///////////////////// // Vector Stores // @@ -2693,7 +2687,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // The current vector length refers to the target EEW! // Vector stores never re-shuffle the source register! - riscv::OpcodeStoreFp: begin + riscv::OpcodeStoreFp: begin : OpcodeStoreFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2724,7 +2718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; // ara_req_d.vtype.vsew is the target EEW! 
end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2732,7 +2726,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2740,7 +2734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2748,15 +2742,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
- acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase @@ -2771,13 +2762,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask store, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end endcase end @@ -2797,24 +2786,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_store = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_store = 1'b1; end end default:; @@ -2824,20 +2811,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. 
unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end default:; endcase @@ -2869,227 +2852,309 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_store = 1'b1; end endcase - illegal_insn = 1'b0; + // illegal_insn_store = 1'b0; // TODO: IS THIS A BUG? acc_resp_o.req_ready = 1'b0; acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b1; end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin : ara_resp_valid acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // If there is an error, change vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vstart; + end : exception + end : ara_resp_valid + end : OpcodeStoreFp //////////////////////////// // CSR Reads and Writes // //////////////////////////// - riscv::OpcodeSystem: begin - // These always respond at the same cycle - acc_resp_o.resp_valid = 1'b1; - is_config = 1'b1; - - unique case (acc_req_i.insn.itype.funct3) - 3'b001: begin // csrrw - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = acc_req_i.rs1; - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b010: begin // csrrs - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b011: begin // csrrc - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b101: begin // csrrwi - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = acc_req_i.insn.itype.rs1[15]; - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b110: begin // csrrsi - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b111: begin // csrrci - // Decode the CSR. - unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - default: begin - // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - end - endcase - end + riscv::OpcodeSystem: begin : OpcodeSystem + // CSR ops 
have a semantic dependency on vector instructions.
+ // Therefore, Ara must be idle before performing any CSR operation.
+
+ // Stall if there is any pending vector instruction
+ // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending.
+ // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise.
+ // E.g., CSR vlenb is a design-constant parameter, reading is always safe.
+ // E.g., CSRs vxrm and vxsat have no influence on non-fixed-point instructions; they could be read and written safely when no fixed-point operation is running.
+ // By analyzing the spec more closely, more such optimizations can be made. For the sake of simplicity, the current implementation treats CSR ops as one block.
+ if ( ara_idle_i ) begin : ara_idle
+ // These always respond at the same cycle
+ acc_resp_o.resp_valid = 1'b1;
+ is_config = 1'b1;
+
+ unique case (acc_req_i.insn.itype.funct3)
+ 3'b001: begin // csrrw
+ // Decode the CSR.
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm))
+ // Only vstart can be written with CSR instructions.
+ riscv::CSR_VSTART: begin
+ csr_vstart_d = acc_req_i.rs1;
+ acc_resp_o.result = csr_vstart_q;
+ end
+ riscv::CSR_VXRM: begin
+ csr_vxrm_d = vxrm_t'(acc_req_i.rs1[16:15]);
+ acc_resp_o.result = vlen_t'(csr_vxrm_q);
+ end
+ riscv::CSR_VXSAT: begin
+ csr_vxsat_d = vxsat_e'(acc_req_i.rs1[15]);
+ acc_resp_o.result = vlen_t'(csr_vxsat_q);
+ end
+ riscv::CSR_VCSR: begin
+ csr_vxrm_d = vxrm_t'( acc_req_i.rs1[17:16] );
+ csr_vxsat_d = vxsat_e'( acc_req_i.rs1[15] );
+ acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } );
+ end
+ default: illegal_insn = 1'b1;
+ endcase
+ end
+ 3'b010: begin // csrrs
+ // Decode the CSR.
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[17:16]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b011: begin // csrrc + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b101: begin // csrrwi + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = acc_req_i.rs1[0]; + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b110: begin // csrrsi + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b111: begin // csrrci + // Decode the CSR. 
+ unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q & ~vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn= 1'b1; + endcase + end + default: begin + // Trigger an illegal instruction + illegal_insn = 1'b1; + end + endcase // acc_req_i.insn.itype.funct3 + end : ara_idle + else begin : csr_stall + acc_resp_o.req_ready = 1'b0; + end : csr_stall + end : OpcodeSystem default: begin // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn = 1'b1; end - endcase - end + + endcase // acc_req_i.insn.itype.opcode + end : ready // Check that we have fixed-point support if requested // vxsat and vxrm are always accessible anyway - if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) begin : fixed_point_check 
illegal_insn = 1'b1; + end : fixed_point_check // Check that we have we have vfrec7, vfrsqrt7 - if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) begin : vfrec7_vfrsqrt7_support_check illegal_insn = 1'b1; + end : vfrec7_vfrsqrt7_support_check + + + // Raise an illegal instruction exception + if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin : illegal_instruction + ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception.valid = 1'b1; + acc_resp_o.exception.cause = riscv::ILLEGAL_INSTR; + acc_resp_o.exception.tval = acc_req_i.insn; + end : illegal_instruction + + // Reset vstart to zero for successful vector instructions + // Corner cases: + // * vstart exception reporting, e.g., VLSU, is handled above + // * CSR operations are not considered vector instructions + if ( acc_resp_o.resp_valid + & !acc_resp_o.exception.valid + & (acc_req_i.insn.itype.opcode != riscv::OpcodeSystem) + ) begin : reset_vstart + csr_vstart_d = '0; + end : reset_vstart // Check if we need to reshuffle our vector registers involved in the operation // This operation is costly when occurs, so avoid it if possible - if (ara_req_valid_d && !acc_resp_o.error) begin + if ( ara_req_valid_d && !acc_resp_o.exception.valid ) begin : check_reshuffle automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Is the instruction an in-lane one and could it be subject to reshuffling? @@ -3100,7 +3165,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Optimization: reshuffle vd only if we are not overwriting the whole vector register! 
reshuffle_req_d = {ara_req_d.use_vs1 && (ara_req_d.eew_vs1 != eew_q[ara_req_d.vs1]) && eew_valid_q[ara_req_d.vs1] && in_lane_op, ara_req_d.use_vs2 && (ara_req_d.eew_vs2 != eew_q[ara_req_d.vs2]) && eew_valid_q[ara_req_d.vs2] && in_lane_op, - ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && vl_q != (VLENB >> ara_req_d.vtype.vsew)}; + ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && csr_vl_q != (VLENB >> ara_req_d.vtype.vsew)}; // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. @@ -3122,7 +3187,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default:; endcase - end + end : check_reshuffle // Reshuffle if at least one of the three registers needs a reshuffle if (|reshuffle_req_d) begin @@ -3145,13 +3210,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Reshuffle state_d = RESHUFFLE; end - end - - // Raise an illegal instruction exception - if (illegal_insn) begin - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; - end + end : not_reshuffling // Update the EEW if (ara_req_valid_d && ara_req_d.use_vd && ara_req_ready_i) begin @@ -3191,8 +3250,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Any valid non-config instruction is a NOP if vl == 0, with some exceptions, // e.g. 
whole vector memory operations / whole vector register move - if (is_decoding && (vl_q == '0 || null_vslideup) && !is_config && - !ignore_zero_vl_check && !acc_resp_o.error) begin + if (is_decoding && (csr_vl_q == '0 || null_vslideup) && !is_config && + !ignore_zero_vl_check && !acc_resp_o.exception.valid) begin // If we are acknowledging a memory operation, we must tell Ariane that the memory // operation was resolved (to decrement its pending load/store counter) // This can collide with the same signal from the vector load/store unit, so we must diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 5fb0abff1..74fce4573 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -40,8 +40,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i output logic pe_scalar_resp_ready_o, // Interface with the Address Generation input logic addrgen_ack_i, - input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input ariane_pkg::exception_t addrgen_exception_i, + input vlen_t addrgen_exception_vstart_i ); /////////////////////////////////// @@ -438,8 +438,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i state_d = IDLE; ara_req_ready_o = 1'b1; ara_resp_valid_o = 1'b1; - ara_resp_o.error = addrgen_error_i; - ara_resp_o.error_vl = addrgen_error_vl_i; + ara_resp_o.exception = addrgen_exception_i; + ara_resp_o.exception_vstart = addrgen_exception_vstart_i; end // Wait for the scalar result diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index f9ed43709..d57226e7d 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -26,15 +26,33 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output axi_aw_t axi_aw_o, output logic axi_aw_valid_o, input logic axi_aw_ready_i, + // CSR input + input logic en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it 
might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_addr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception // Interace with the dispatcher input logic core_st_pending_i, // Interface with the main sequencer input pe_req_t pe_req_i, input logic pe_req_valid_i, input logic [NrVInsn-1:0] pe_vinsn_running_i, - output logic addrgen_error_o, + output ariane_pkg::exception_t addrgen_exception_o, output logic addrgen_ack_o, - output vlen_t addrgen_error_vl_o, + output vlen_t addrgen_exception_vstart_o, + output logic addrgen_exception_load_o, + output logic addrgen_exception_store_o, // Interface with the load/store units output addrgen_axi_req_t axi_addrgen_req_o, output logic axi_addrgen_req_valid_o, @@ -47,6 +65,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_operand_ready_o ); + + /////////////////// + // Assignments // + /////////////////// + + assign mmu_misaligned_ex_o = '0; // Ara reports misaligned exceptions on its own + import cf_math_pkg::idx_width; import axi_pkg::aligned_addr; import axi_pkg::BURST_INCR; @@ -117,7 +142,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addr_t idx_final_addr_d, idx_final_addr_q; elen_t idx_addr; logic idx_op_error_d, idx_op_error_q; - vlen_t addrgen_error_vl_d; + vlen_t addrgen_exception_vstart_d; 
// Pointer to point to the correct logic [$clog2(NrLanes)-1:0] word_lane_ptr_d, word_lane_ptr_q; @@ -156,13 +181,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // ADDRGEN_IDX_OP: Generates a series of AXI requests from a // vector instruction, but reading a vector of offsets from Ara's lanes. // This is used for scatter and gather operations. - enum logic [1:0] { + // WAIT_LAST_TRANSLATION: Wait for the last address translation to be acknowledged + enum logic [2:0] { IDLE, ADDRGEN, ADDRGEN_IDX_OP, - ADDRGEN_IDX_OP_END + ADDRGEN_IDX_OP_END, + WAIT_LAST_TRANSLATION } state_q, state_d; + // TODO: Masked elements do not generate exceptions on: + // * EEW misalignment + // * page faults always_comb begin: addr_generation // Maintain state state_d = state_q; @@ -177,16 +207,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Nothing to acknowledge addrgen_ack_o = 1'b0; - addrgen_error_o = 1'b0; + addrgen_exception_o.valid = 1'b0; + addrgen_exception_o.tval = '0; + addrgen_exception_o.cause = '0; + addrgen_exception_load_o = 1'b0; + addrgen_exception_store_o = 1'b0; // No valid words for the spill register - idx_addr_valid_d = 1'b0; + idx_addr_valid_d = 1'b0; addrgen_operand_ready_o = 1'b0; reduced_word = '0; elm_ptr_d = elm_ptr_q; idx_op_cnt_d = idx_op_cnt_q; word_lane_ptr_d = word_lane_ptr_q; - idx_final_addr_d = idx_final_addr_q; + idx_final_addr_d = idx_final_addr_q; last_elm_subw_d = last_elm_subw_q; // Support for indexed operations @@ -204,10 +238,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_addr = reduced_word; case (state_q) - IDLE: begin + IDLE: begin : state_IDLE // Received a new request if (pe_req_valid_i && - (is_load(pe_req_i.op) || is_store(pe_req_i.op)) && !vinsn_running_q[pe_req_i.id]) begin + (is_load(pe_req_i.op) || is_store(pe_req_i.op)) && !vinsn_running_q[pe_req_i.id]) begin : pe_req_valid // Mark the instruction as running in this unit vinsn_running_d[pe_req_i.id] = 1'b1; @@ -232,19 +266,24 @@ module 
addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_d = pe_req_i.vl; end default: state_d = ADDRGEN; - endcase - end - end - ADDRGEN: begin + endcase // pe_req_i.op + end : pe_req_valid + end : state_IDLE + + ADDRGEN: begin : ADDRGEN // Ara does not support misaligned AXI requests - if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin + if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin : eew_misaligned_error state_d = IDLE; addrgen_ack_o = 1'b1; - addrgen_error_o = 1'b1; - end else begin + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; + end : eew_misaligned_error + else begin : address_valid + addrgen_req = '{ addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + len : pe_req_q.vl , stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -253,19 +292,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; - if (addrgen_req_ready) begin + if (addrgen_req_ready) begin : finished addrgen_req_valid = '0; addrgen_ack_o = 1'b1; state_d = IDLE; - end - end - end - ADDRGEN_IDX_OP: begin + end : finished + end : address_valid + end : ADDRGEN + + ADDRGEN_IDX_OP: begin : ADDRGEN_IDX_OP // Stall the interface until the operation is over to catch possible exceptions // Every address can generate an exception addrgen_req = '{ - addr : pe_req_q.scalar_op, + addr : pe_req_q.scalar_op, len : pe_req_q.vl, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, @@ -339,13 +379,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end end - if (idx_op_error_d || addrgen_req_ready) begin + if (idx_op_error_d || addrgen_req_ready ) begin state_d = ADDRGEN_IDX_OP_END; end - end + end : ADDRGEN_IDX_OP // This state exists not to create combinatorial paths on the interface - ADDRGEN_IDX_OP_END : begin + ADDRGEN_IDX_OP_END : begin : ADDRGEN_IDX_OP_END // Acknowledge the indexed memory operation addrgen_ack_o = 1'b1; 
addrgen_req_valid = '0; @@ -355,11 +395,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( word_lane_ptr_d = '0; // Raise an error if necessary if (idx_op_error_q) begin - addrgen_error_o = 1'b1; + // In this case, we always get EEW-misaligned exceptions + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end - end - endcase - end + end : ADDRGEN_IDX_OP_END + endcase // state_q + + if ( addrgen_exception_o.valid & addrgen_ack_o ) begin + addrgen_exception_load_o = is_load(pe_req_q.op); + addrgen_exception_store_o = !is_load(pe_req_q.op); + end + + end : addr_generation always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -371,7 +420,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= '0; last_elm_subw_q <= '0; idx_op_error_q <= '0; - addrgen_error_vl_o <= '0; + addrgen_exception_vstart_o <= '0; end else begin state_q <= state_d; pe_req_q <= pe_req_d; @@ -381,7 +430,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= idx_op_cnt_d; last_elm_subw_q <= last_elm_subw_d; idx_op_error_q <= idx_op_error_d; - addrgen_error_vl_o <= addrgen_error_vl_d; + addrgen_exception_vstart_o <= addrgen_exception_vstart_d; end end @@ -424,11 +473,12 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // AXI Request Generation // ////////////////////////////// - enum logic [1:0] { + enum logic [2:0] { AXI_ADDRGEN_IDLE, AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED, // Misaligned vector store to AxiDataWidth/8, needs special treatement AXI_ADDRGEN_WAITING_CORE_STORE_PENDING, // Wait until (core_st_pending_i == 0) - AXI_ADDRGEN_REQUESTING // Perform AW/AR transactions and push addrgen_req to VSTU/VLDU + AXI_ADDRGEN_REQUESTING, // Perform AW/AR transactions and push addrgen_req to VSTU/VLDU + AXI_ADDRGEN_WAIT_TRANSLATION // Wait for MMU to ack back } axi_addrgen_state_d, axi_addrgen_state_q; axi_addr_t aligned_start_addr_d, aligned_start_addr_q; 
@@ -441,7 +491,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic [clog2_AxiStrobeWidth:0] eff_axi_dw_d, eff_axi_dw_q; logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log_d, eff_axi_dw_log_q; - function automatic set_end_addr ( + function automatic void set_end_addr ( input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb, input int unsigned num_bytes, input axi_addr_t addr, @@ -470,7 +520,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; aligned_next_start_addr_d = { next_2page_msb , 12'h000}; end - endfunction + endfunction // set_end_addr always_comb begin: axi_addrgen // Maintain state @@ -487,7 +537,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_log_d = eff_axi_dw_log_q; idx_addr_ready_d = 1'b0; - addrgen_error_vl_d = '0; + addrgen_exception_vstart_d = '0; // No error by default idx_op_error_d = 1'b0; @@ -505,14 +555,23 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_aw_o = '0; axi_aw_valid_o = 1'b0; - case (axi_addrgen_state_q) - AXI_ADDRGEN_IDLE: begin + // MMU + mmu_req_o = 1'b0; + mmu_addr_o = '0; + mmu_is_store_o = 1'b0; + + // For addrgen FSM + last_translation_completed = 1'b0; + + case (axi_addrgen_state_q) + AXI_ADDRGEN_IDLE: begin : axi_addrgen_state_AXI_ADDRGEN_IDLE if (addrgen_req_valid) begin axi_addrgen_d = addrgen_req; axi_addrgen_state_d = core_st_pending_i ? 
AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // In case of a misaligned store, reduce the effective width of the AXI transaction, // since the store unit does not support misalignments between the AXI bus and the lanes + // BUG: this address check is not valid for indexed operations if ((axi_addrgen_d.addr[clog2_AxiStrobeWidth-1:0] != '0) && !axi_addrgen_d.is_load) begin // Calculate the start and the end addresses in the AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED state @@ -542,10 +601,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( aligned_end_addr_d, aligned_next_start_addr_d ); - end - end - AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED: begin + end : axi_addrgen_state_AXI_ADDRGEN_IDLE + + AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED: begin : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // The start address is found by aligning the original request address by the width of @@ -561,15 +620,16 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( aligned_start_addr_d, aligned_end_addr_d, aligned_next_start_addr_d - ); - - end - AXI_ADDRGEN_WAITING_CORE_STORE_PENDING: begin + ); + end : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED + + AXI_ADDRGEN_WAITING_CORE_STORE_PENDING: begin : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING if (!core_st_pending_i) begin axi_addrgen_state_d = AXI_ADDRGEN_REQUESTING; end - end - AXI_ADDRGEN_REQUESTING : begin + end : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING + + AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i); // Pre-calculate the next_2page_msb. 
This should not require much energy if the addr @@ -584,9 +644,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( (axi_addrgen_req_o.is_load && axi_addrgen_q.is_load) || (~axi_addrgen_req_o.is_load && ~axi_addrgen_q.is_load ) - ) begin - if (!axi_addrgen_queue_full && axi_ax_ready) begin - if (axi_addrgen_q.is_burst) begin + ) begin : axi_ax_idle + if (!axi_addrgen_queue_full && axi_ax_ready) begin : start_req + if (axi_addrgen_q.is_burst) begin : unit_stride ///////////////////////// // Unit-Stride access // @@ -656,12 +716,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end axi_addrgen_d.addr = aligned_next_start_addr_q; - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - // Calculate the addresses for the next iteration // The start address is found by aligning the original request address by the width of // the memory interface. In our case, we have it already. @@ -680,8 +734,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( aligned_end_addr_d, aligned_next_start_addr_d ); - - end else if (state_q != ADDRGEN_IDX_OP) begin + end : unit_stride + else if (state_q != ADDRGEN_IDX_OP) begin : strided ///////////////////// // Strided access // @@ -724,82 +778,78 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Account for the requested operands axi_addrgen_d.len = axi_addrgen_q.len - 1; // Calculate the addresses for the next iteration, adding the correct stride - // NOTE: there is no need to check for misaligned erros, since the stride is alsways EEW aligned to the first address + // NOTE: there is no need to check for misaligned erros, since the stride always produces EEW-aligned to the first address axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - end 
else begin - + end : strided + else begin : indexed ////////////////////// // Indexed access // ////////////////////// + // TODO: check if idx_addr_valid_q is stable + if (idx_addr_valid_q) begin : idx_addr_valid_q - if (idx_addr_valid_q) begin - // We consumed a word - idx_addr_ready_d = 1'b1; - - // AR Channel - if (axi_addrgen_q.is_load) begin - axi_ar_o = '{ - addr : idx_final_addr_q, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_ar_valid_o = 1'b1; - end - // AW Channel - else begin - axi_aw_o = '{ - addr : idx_final_addr_q, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_aw_valid_o = 1'b1; - end - - // Send this request to the load/store units - axi_addrgen_queue = '{ - addr : idx_final_addr_q, - size : axi_addrgen_q.vew, - len : 0, - is_load: axi_addrgen_q.is_load - }; - axi_addrgen_queue_push = 1'b1; - - // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; - - // Check if the address does generate an exception - if (is_addr_error(idx_final_addr_q, axi_addrgen_q.vew)) begin + // Check if the virtual address generates an exception + if (is_addr_error(idx_final_addr_q, axi_addrgen_q.vew)) begin : eew_misaligned_error // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_error_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - end + end : eew_misaligned_error + else begin : aligned_address + // We consumed a word + idx_addr_ready_d = 1'b1; + + // AR Channel + if (axi_addrgen_q.is_load) begin + axi_ar_o = '{ + addr : idx_final_addr_q, + len : 0, + size : 
axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_ar_valid_o = 1'b1; + end + // AW Channel + else begin + axi_aw_o = '{ + addr : idx_final_addr_q, + len : 0, + size : axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_aw_valid_o = 1'b1; + end + + // Send this request to the load/store units + axi_addrgen_queue = '{ + addr : idx_final_addr_q, + size : axi_addrgen_q.vew, + len : 0, + is_load: axi_addrgen_q.is_load + }; + axi_addrgen_queue_push = 1'b1; + + // Account for the requested operands + axi_addrgen_d.len = axi_addrgen_q.len - 1; + end : aligned_address + end : idx_addr_valid_q + end : indexed + + // Finished generating AXI requests + if (axi_addrgen_d.len == 0) begin + addrgen_req_ready = 1'b1; + axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end - end - end - end - endcase + end : start_req + end : axi_ax_idle + end : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + endcase // axi_addrgen_state_q end: axi_addrgen always_ff @(posedge clk_i or negedge rst_ni) begin diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..7c49f3af6 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -35,6 +35,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -136,7 +137,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // reading from and writing into the lanes (read_pnt). logic [idx_width(ResultQueueDepth)-1:0] result_queue_write_pnt_d, result_queue_write_pnt_q; logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q; - // We need to count how many valid elements are there in this result queue. 
+ // We need to count how many valid elements (payload_t) are there in this result queue. logic [idx_width(ResultQueueDepth):0] result_queue_cnt_d, result_queue_cnt_q; // Vector to register the final grants from the operand requesters, which indicate // that the result was actually written in the VRF (while the normal grant just says @@ -174,33 +175,33 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Remaining bytes of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; + vlen_t commit_cnt_bytes_d, commit_cnt_bytes_q; // Pointers // // We need several pointers to copy data from the memory interface // into the VRF. Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the current R beat we are reading data from. - logic [idx_width(AxiDataWidth/8):0] r_pnt_d, r_pnt_q; + logic [idx_width(AxiDataWidth/8):0] axi_r_byte_pnt_d, axi_r_byte_pnt_q; // - A pointer to which byte in the full VRF word we are writing data into. 
- logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; + logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; always_comb begin: p_vldu // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; - commit_cnt_d = commit_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; + commit_cnt_bytes_d = commit_cnt_bytes_q; - len_d = len_q; - r_pnt_d = r_pnt_q; - vrf_pnt_d = vrf_pnt_q; + axi_len_d = axi_len_q; + axi_r_byte_pnt_d = axi_r_byte_pnt_q; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q; result_queue_d = result_queue_q; result_queue_valid_d = result_queue_valid_q; @@ -215,7 +216,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_r_ready_o = 1'b0; mask_ready_o = 1'b0; load_complete_o = 1'b0; @@ -236,40 +237,46 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Bytes valid in the current R beat // If non-unit strided load, we do not progress within the beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); - + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + // Is there a vector instruction ready to be issued? // Do we have the operands for it? 
- if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin + if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin : operands_valid + // TODO: add vstart here (use issue/commit_cnt_bytes_q) // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + automatic vlen_t vrf_valid_bytes = (NrLanes * 8) - vrf_word_byte_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + automatic vlen_t vinsn_valid_bytes = issue_cnt_bytes_q - vrf_word_byte_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - r_pnt_q + 1; + automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - axi_r_byte_pnt_q + 1; // How many bytes are we committing? automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * 8) ) ? vinsn_valid_bytes : vrf_valid_bytes; + // valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; + if ( valid_bytes >= axi_valid_bytes ) begin : valid_bytes_overflow + valid_bytes = axi_valid_bytes; + end : valid_bytes_overflow - r_pnt_d = r_pnt_q + valid_bytes; - vrf_pnt_d = vrf_pnt_q + valid_bytes; + axi_r_byte_pnt_d = axi_r_byte_pnt_q + valid_bytes; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q + valid_bytes; // Copy data from the R channel into the result queue - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin + for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue // Is this byte a valid byte in the R beat? 
- if (axi_byte >= lower_byte + r_pnt_q && axi_byte <= upper_byte) begin + if ( ( axi_byte >= ( lower_byte + axi_r_byte_pnt_q ) ) && + ( axi_byte <= upper_byte ) + ) begin : is_axi_r_byte // Map axi_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte - r_pnt_q + vrf_pnt_q; + automatic int vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q; // And then shuffle it automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q && vrf_seq_byte < NrLanes * 8) begin + if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * 8)) begin : is_vrf_byte // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? automatic int vrf_lane = vrf_byte >> 3; automatic int vrf_offset = vrf_byte[2:0]; @@ -279,27 +286,36 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( axi_r_i.data[8*axi_byte +: 8]; result_queue_d[result_queue_write_pnt_q][vrf_lane].be[vrf_offset] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; - end - end - end + end : is_vrf_byte + end : is_axi_r_byte + end : axi_r_to_result_queue // Initialize id and addr fields of the result queue requests for (int lane = 0; lane < NrLanes; lane++) begin result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + - (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >> - (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + result_queue_d[result_queue_write_pnt_q][lane].addr = + vaddr(vinsn_issue_q.vd, NrLanes) + // base address of vd + ( + ( + ( + (vinsn_issue_q.vl) - // total number of elements to be processed + (issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew)) // elements left (issue_cnt_bytes_q is in bytes, so we shift rx by EEW) + ) / NrLanes // elements per lane 
(each lane processes num elements / NrLanes) + ) >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)) // 64-bit aligned address + ); // final offset to vd end - end + end : operands_valid // We have a word ready to be sent to the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin + if (vrf_word_byte_pnt_d == NrLanes*8 || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin // Increment result queue pointers and counters result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) + if (result_queue_write_pnt_q == ResultQueueDepth-1) begin : result_queue_write_pnt_overflow result_queue_write_pnt_d = '0; - else + end : result_queue_write_pnt_overflow + else begin : result_queue_write_pnt_increment result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + end : result_queue_write_pnt_increment // Trigger the request signal result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; @@ -308,52 +324,56 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( mask_ready_o = !vinsn_issue_q.vm; // Reset the pointer in the VRF word - vrf_pnt_d = '0; + vrf_word_byte_pnt_d = '0; // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q) + if (issue_cnt_bytes_q < (NrLanes * 8)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow end // Consumed all valid bytes in this R beat - if (r_pnt_d == upper_byte - lower_byte + 1 || issue_cnt_d == '0) begin + if ( ( axi_r_byte_pnt_d == ( upper_byte - lower_byte + 1 ) ) || ( issue_cnt_bytes_d == '0 ) ) begin : axi_r_beat_finish // Request another beat axi_r_ready_o = 1'b1; - r_pnt_d = '0; + axi_r_byte_pnt_d = '0; // Account for the beat we consumed - len_d = len_q + 1; - end + axi_len_d = axi_len_q + 1; + end : axi_r_beat_finish // Consumed all beats from this burst 
- if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : axi_finish // Reset AXI pointers - len_d = '0; - r_pnt_d = '0; + axi_len_d = '0; + axi_r_byte_pnt_d = '0; // Wait for another AXI request axi_addrgen_req_ready_o = 1'b1; - end + end : axi_finish // Finished issuing results - if (vinsn_issue_valid && issue_cnt_d == '0) begin + if (vinsn_issue_valid && (issue_cnt_bytes_d == '0)) begin : vrf_results_finish // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == (VInsnQueueDepth-1)) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = '0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment // Prepare for the next vector instruction - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.issue_pnt].vtype.vsew); - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : vrf_results_finish end ////////////////////////////////// // Write results into the VRF // ////////////////////////////////// - for (int lane = 0; lane < NrLanes; lane++) begin: result_write + for (int lane = 0; lane < NrLanes; lane++) begin: vrf_result_write ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; @@ -365,39 +385,43 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Received a grant from the VRF. 
// Deactivate the request, but do not bump the pointers for now. - if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin + if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin : vrf_grant result_queue_valid_d[result_queue_read_pnt_q][lane] = 1'b0; result_queue_d[result_queue_read_pnt_q][lane] = '0; // Reset the final gnt vector since we are now waiting for another final gnt result_final_gnt_d[lane] = 1'b0; - end - end: result_write + end : vrf_grant + end: vrf_result_write // All lanes accepted the VRF request // Wait for all the final grants, to be sure that all the results were written back if (!(|result_queue_valid_d[result_queue_read_pnt_q]) && - (&result_final_gnt_d || commit_cnt_q > (NrLanes * 8))) + (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * 8))) begin // There is something waiting to be written - if (!result_queue_empty) begin + if (!result_queue_empty) begin : result_available // Increment the read pointer - if (result_queue_read_pnt_q == ResultQueueDepth-1) + if (result_queue_read_pnt_q == (ResultQueueDepth-1)) begin : result_queue_read_pnt_overflow result_queue_read_pnt_d = 0; - else + end : result_queue_read_pnt_overflow + else begin : result_queue_read_pnt_increment result_queue_read_pnt_d = result_queue_read_pnt_q + 1; + end : result_queue_read_pnt_increment // Decrement the counter of results waiting to be written result_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * 8; - if (commit_cnt_q < (NrLanes * 8)) - commit_cnt_d = '0; - end + commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q) + if (commit_cnt_bytes_q < (NrLanes * 8)) begin : commit_cnt_bytes_overflow + commit_cnt_bytes_d = '0; + end : commit_cnt_bytes_overflow + end : result_available + end // Finished committing the results of a vector instruction - if (vinsn_commit_valid && commit_cnt_d == '0) begin + if 
(vinsn_commit_valid && commit_cnt_bytes_d == '0) begin : vinsn_done // Mark the vector instruction as being done - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Signal complete load load_complete_o = 1'b1; @@ -411,9 +435,16 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Update the commit counter for the next instruction if (vinsn_queue_d.commit_cnt != '0) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.commit_pnt].vtype.vsew); - end + commit_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); + end : vinsn_done + + // Ack back exceptions + if ( addrgen_exception_valid_i ) begin : exception + // Signal done to sequencer + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + // Clear counters and flags + end : exception ////////////////////////////// // Accept new instruction // @@ -425,10 +456,13 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); - if (vinsn_queue_d.commit_cnt == '0) - commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + // TODO(bug fix): add masking logic (stores are not idempotent!) 
+ if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init + if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init + commit_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew); + end : commit_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; @@ -440,21 +474,21 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin vinsn_running_q <= '0; - issue_cnt_q <= '0; - commit_cnt_q <= '0; - len_q <= '0; - r_pnt_q <= '0; - vrf_pnt_q <= '0; + issue_cnt_bytes_q <= '0; + commit_cnt_bytes_q <= '0; + axi_len_q <= '0; + axi_r_byte_pnt_q <= '0; + vrf_word_byte_pnt_q <= '0; pe_resp_o <= '0; result_final_gnt_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; - issue_cnt_q <= issue_cnt_d; - commit_cnt_q <= commit_cnt_d; - len_q <= len_d; - r_pnt_q <= r_pnt_d; - vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; + issue_cnt_bytes_q <= issue_cnt_bytes_d; + commit_cnt_bytes_q <= commit_cnt_bytes_d; + axi_len_q <= axi_len_d; + axi_r_byte_pnt_q <= axi_r_byte_pnt_d; + vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; + pe_resp_o <= pe_resp_d; result_final_gnt_q <= result_final_gnt_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index 7505f9f6f..efd14b3d3 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -42,8 +42,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic [1:0] pe_req_ready_o, // Load (0) and Store (1) units output pe_resp_t [1:0] pe_resp_o, // Load (0) and Store (1) units output logic addrgen_ack_o, - output logic addrgen_error_o, - output vlen_t addrgen_error_vl_o, + output ariane_pkg::exception_t addrgen_exception_o, + output vlen_t addrgen_exception_vstart_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ 
-59,6 +59,25 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] mask_valid_i, output logic vldu_mask_ready_o, output logic vstu_mask_ready_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Results output logic [NrLanes-1:0] ldu_result_req_o, output vid_t [NrLanes-1:0] ldu_result_id_o, @@ -69,6 +88,11 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + logic load_complete, store_complete; + logic addrgen_exception_load, addrgen_exception_store; + assign load_complete_o = load_complete | addrgen_exception_load; + assign store_complete_o = store_complete | addrgen_exception_store; + /////////////////// // Definitions // /////////////////// @@ -133,8 +157,10 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_valid_i (pe_req_valid_i ), .pe_vinsn_running_i (pe_vinsn_running_i ), .addrgen_ack_o (addrgen_ack_o ), - .addrgen_error_o (addrgen_error_o ), - .addrgen_error_vl_o (addrgen_error_vl_o ), + .addrgen_exception_o ( addrgen_exception_o ), + .addrgen_exception_vstart_o ( 
addrgen_exception_vstart_o ), + .addrgen_exception_load_o ( addrgen_exception_load ), + .addrgen_exception_store_o ( addrgen_exception_store ), // Interface with the lanes .addrgen_operand_i (addrgen_operand_i ), .addrgen_operand_target_fu_i(addrgen_operand_target_fu_i), @@ -144,7 +170,19 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_addrgen_req_o (axi_addrgen_req ), .axi_addrgen_req_valid_o (axi_addrgen_req_valid ), .ldu_axi_addrgen_req_ready_i(ldu_axi_addrgen_req_ready ), - .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ) + .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ), + + // CSR input + .en_ld_st_translation_i, + .mmu_misaligned_ex_o, + .mmu_req_o, + .mmu_vaddr_o, + .mmu_is_store_o, + .mmu_dtlb_hit_i, + .mmu_dtlb_ppn_i, + .mmu_valid_i, + .mmu_paddr_i, + .mmu_exception_i ); //////////////////////// @@ -165,7 +203,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_r_valid_i (axi_resp.r_valid ), .axi_r_ready_o (axi_req.r_ready ), // Interface with the dispatcher - .load_complete_o (load_complete_o ), + .load_complete_o (load_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -173,6 +211,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(ldu_axi_addrgen_req_ready ), @@ -213,7 +252,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_b_ready_o (axi_req.b_ready ), // Interface with the dispatcher .store_pending_o (store_pending_o ), - .store_complete_o (store_complete_o ), + .store_complete_o (store_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -221,6 +260,7 @@ module vlsu 
import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o (pe_req_ready_o[OffsetStore]), .pe_resp_o (pe_resp_o[OffsetStore] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(stu_axi_addrgen_req_ready ), diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 9580f59b0..6d92c03a7 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -47,6 +47,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, + input logic addrgen_exception_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes input elen_t [NrLanes-1:0] stu_operand_i,