From f08c28f443dd38d39051129463756a68d0a0c613 Mon Sep 17 00:00:00 2001
From: Vincenzo Maisto <maisto_v@libero.it>
Date: Fri, 13 Oct 2023 13:28:28 +0200
Subject: [PATCH] Supporting vstart CSR for operand read, VALU, VLSU

* vstart support for vector unit-stride loads and stores

* vstart support for vector strided loads and stores

* vstart support for valu operations, mask operations not tested

* Preliminary work on vstart support for vector indexed loads and stores

* Minor fixes

* Refactoring

* Explanatory comments
---
 hardware/include/ara_pkg.sv            |  15 +-
 hardware/src/ara.sv                    |   4 +-
 hardware/src/ara_dispatcher.sv         |  19 +-
 hardware/src/lane/lane_sequencer.sv    | 170 +++++++-------
 hardware/src/lane/operand_queue.sv     |  44 ++--
 hardware/src/lane/operand_requester.sv | 306 +++++++++++++------------
 hardware/src/lane/valu.sv              |  78 ++++---
 hardware/src/vlsu/addrgen.sv           |  49 +++-
 hardware/src/vlsu/vldu.sv              | 150 +++++++-----
 hardware/src/vlsu/vstu.sv              | 249 ++++++++++++--------
 10 files changed, 637 insertions(+), 447 deletions(-)

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index df0cdc7b5..b8ffa78c8 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -974,11 +974,20 @@ package ara_pkg;
   } opqueue_e;
 
   // Each lane has eight VRF banks
+  // NOTE: values != 8 are not supported
   localparam int unsigned NrVRFBanksPerLane = 8;
 
-  // Find the starting address of a vector register vid
+  // Find the starting address (in bytes) of a vector register chunk of vid
   function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes);
-    vaddr = vid * (VLENB / NrLanes / 8);
+    // Each vector register spans multiple words in each bank in each lane
+    // The start address is the same in every lane
+    // Therefore, within each lane, each vector register chunk starts on a given offset
+    vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane); 
+    // NOTE: the only extensively tested configuration of Ara keeps:
+    //        - (VLEN / NrLanes) constant to 1024;
+    //        - NrVRFBanksPerLane always equal to 8.
+    //        Under these constraints, VLENB / NrLanes / NrVRFBanksPerLane = 128 / 8 = 16, i.e., each vector
+    //        register spans 2 words in every bank of every lane; therefore, vaddr = vid * 16
   endfunction: vaddr
 
   // Differenciate between SLDU and ADDRGEN operands from opqueue
@@ -1016,7 +1025,7 @@ package ara_pkg;
 
   typedef struct packed {
     rvv_pkg::vew_e eew;        // Effective element width
-    vlen_t vl;                 // Vector length
+    vlen_t elem_count;         // Vector body length
     opqueue_conversion_e conv; // Type conversion
     logic [1:0] ntr_red;       // Neutral type for reductions
     logic is_reduct;           // Is this a reduction?
diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index 3288219d4..2bb6c6d08 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -43,7 +43,7 @@ module ara import ara_pkg::*; #(
     
     // Interface with CVA6's sv39 MMU
     // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless
-    output  exception_t                    mmu_misaligned_ex_o,
+    output  ariane_pkg::exception_t        mmu_misaligned_ex_o,
     output  logic                          mmu_req_o,        // request address translation
     output  logic [riscv::VLEN-1:0]        mmu_vaddr_o,      // virtual address out
     output  logic                          mmu_is_store_o,   // the translation is requested by a store
@@ -54,7 +54,7 @@ module ara import ara_pkg::*; #(
     // Cycle 1
     input logic                            mmu_valid_i,      // translation is valid
     input logic [riscv::PLEN-1:0]          mmu_paddr_i,      // translated address
-    input exception_t                      mmu_exception_i,  // address translation threw an exception
+    input ariane_pkg::exception_t          mmu_exception_i,  // address translation threw an exception
 
     // Interface with Ariane
     input  accelerator_req_t  acc_req_i,
diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
index 3c270b885..667a100f5 100644
--- a/hardware/src/ara_dispatcher.sv
+++ b/hardware/src/ara_dispatcher.sv
@@ -2662,15 +2662,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
             end
 
             // Wait until the back-end answers to acknowledge those instructions
-            if (ara_resp_valid_i) begin
+            if ( ara_resp_valid_i ) begin : ara_resp_valid
               acc_resp_o.req_ready  = 1'b1;
               acc_resp_o.exception = ara_resp_i.exception;
               acc_resp_o.resp_valid = 1'b1;
               ara_req_valid_d  = 1'b0;
               // In case of error, modify vstart
-              if (ara_resp_i.exception.valid)
+              if ( ara_resp_i.exception.valid ) begin : exception
                 csr_vstart_d = ara_resp_i.exception_vl;
-            end
+              end : exception
+            end : ara_resp_valid
           end : OpcodeLoadFp
 
           /////////////////////
@@ -2859,15 +2860,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
             end
 
             // Wait until the back-end answers to acknowledge those instructions
-            if (ara_resp_valid_i) begin
+            if (ara_resp_valid_i) begin : ara_resp_valid
               acc_resp_o.req_ready  = 1'b1;
               acc_resp_o.exception = ara_resp_i.exception;
               acc_resp_o.resp_valid = 1'b1;
               ara_req_valid_d  = 1'b0;
               // If there is an error, change vstart
-              if (ara_resp_i.exception.valid)
+              if ( ara_resp_i.exception.valid ) begin : exception
                 csr_vstart_d = ara_resp_i.exception_vl;
-            end
+              end : exception
+            end : ara_resp_valid
           end : OpcodeStoreFp
 
           ////////////////////////////
@@ -2879,6 +2881,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
             // Therefore, Ara must be idle before performing any CSR operation.
 
             // Stall if there is any pending vector instruction
+            // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending.
+            //       E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise.
+            //       E.g., CSR vlenb is a design-constant parameter, reading is always safe.
+            //       E.g., CSRs vxrm and vxsat have no influence on non-fixed-point instructions; they could be read and written safely when no fixed-point operation is running.
+            //       By analyzing the spec more closely, more such optimizations can be made. For the sake of simplicity, the current implementation treats CSR ops as one block.
             if ( ara_idle_i ) begin : ara_idle
               // These always respond at the same cycle
               acc_resp_o.resp_valid = 1'b1;
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index 386b9823c..ba82f8922 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -113,7 +113,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
   // bits that indicate whether there is a hazard between different vector
   // instructions. Such hazards must be continuously cleared based on the
   // value of the currently running loops from the main sequencer.
-  operand_request_cmd_t [NrOperandQueues-1:0] operand_request_i;
+  operand_request_cmd_t [NrOperandQueues-1:0] operand_request;
   logic                 [NrOperandQueues-1:0] operand_request_push;
 
   operand_request_cmd_t [NrOperandQueues-1:0] operand_request_d;
@@ -133,7 +133,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
       // Got a new request
       if (operand_request_push[queue]) begin
-        operand_request_d[queue]       = operand_request_i[queue];
+        operand_request_d[queue]       = operand_request[queue];
         operand_request_valid_d[queue] = 1'b1;
       end
     end
@@ -189,7 +189,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
     pe_resp_o.vinsn_done = vinsn_done_q;
 
     // Make no requests to the operand requester
-    operand_request_i    = '0;
+    operand_request    = '0;
     operand_request_push = '0;
 
     // Make no requests to the lane's VFUs
@@ -197,7 +197,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
     vfu_operation_valid_d = 1'b0;
 
     // If the operand requesters are busy, abort the request and wait for another cycle.
-    if (pe_req_valid) begin
+    if (pe_req_valid) begin : stall_op_req_busy
       unique case (pe_req.vfu)
         VFU_Alu : begin
           pe_req_ready = !(operand_request_valid_o[AluA] ||
@@ -230,11 +230,11 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           pe_req_ready = !(operand_request_valid_o[MaskB]);
         end
         default:;
-      endcase
+      endcase // pe_req.vfu
     end
 
     // We received a new vector instruction
-    if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin
+    if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin : pe_req_valid
       // Populate the VFU request
       vfu_operation_d = '{
         id             : pe_req.id,
@@ -263,9 +263,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
       if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1;
 
       // Vector start calculation
-      vfu_operation_d.vstart = pe_req.vstart / NrLanes;
-      // If lane_id_i < vstart % NrLanes, this lane needs to execute one micro-operation less.
-      if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) vfu_operation_d.vstart -= 1;
+      // TODO: check for LMUL = 4, 8
+      // TODO: check for SEW != 64
+      vfu_operation_d.vstart = pe_req.vstart / NrLanes; // High bits
+      // If lane_id_i < (vstart % NrLanes), this lane's vstart is incremented by one, i.e., it executes one micro-operation less.
+      if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) begin : adjust_vstart_lane
+        vfu_operation_d.vstart += 1;
+      end : adjust_vstart_lane
 
       // Mark the vector instruction as running
       vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0;
@@ -287,7 +291,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
       unique case (pe_req.vfu)
         VFU_Alu: begin
-          operand_request_i[AluA] = '{
+          operand_request[AluA] = '{
             id         : pe_req.id,
             vs         : pe_req.vs1,
             eew        : pe_req.eew_vs1,
@@ -306,7 +310,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           operand_request_push[AluA] = pe_req.use_vs1;
 
-          operand_request_i[AluB] = '{
+          operand_request[AluB] = '{
             id         : pe_req.id,
             vs         : pe_req.vs2,
             eew        : pe_req.eew_vs2,
@@ -328,24 +332,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request_push[AluB] = pe_req.use_vs2;
 
           // This vector instruction uses masks
-          operand_request_i[MaskM] = '{
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
             vtype  : pe_req.vtype,
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew),
+            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1;
+          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
+              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_MFpu: begin
-          operand_request_i[MulFPUA] = '{
+          operand_request[MulFPUA] = '{
             id         : pe_req.id,
             vs         : pe_req.vs1,
             eew        : pe_req.eew_vs1,
@@ -365,7 +369,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           operand_request_push[MulFPUA] = pe_req.use_vs1;
 
-          operand_request_i[MulFPUB] = '{
+          operand_request[MulFPUB] = '{
             id         : pe_req.id,
             vs         : pe_req.swap_vs2_vd_op ? pe_req.vd        : pe_req.vs2,
             eew        : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2,
@@ -388,7 +392,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ?
           pe_req.use_vd_op : pe_req.use_vs2;
 
-          operand_request_i[MulFPUC] = '{
+          operand_request[MulFPUC] = '{
             id         : pe_req.id,
             vs         : pe_req.swap_vs2_vd_op ? pe_req.vs2            : pe_req.vd,
             eew        : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2        : pe_req.eew_vd_op,
@@ -411,42 +415,42 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           pe_req.use_vs2 : pe_req.use_vd_op;
 
           // This vector instruction uses masks
-          operand_request_i[MaskM] = '{
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
             vtype  : pe_req.vtype,
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew),
+            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1;
+          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
+              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_LoadUnit : begin
           // This vector instruction uses masks
-          operand_request_i[MaskM] = '{
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
             vtype  : pe_req.vtype,
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew),
+            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1;
+          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
+              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
 
           // Load indexed
-          operand_request_i[SlideAddrGenA] = '{
+          operand_request[SlideAddrGenA] = '{
             id       : pe_req_i.id,
             vs       : pe_req_i.vs2,
             eew      : pe_req_i.eew_vs2,
@@ -461,13 +465,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
-          if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl)
-            operand_request_i[SlideAddrGenA].vl += 1;
+          if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl)
+            operand_request[SlideAddrGenA].vl += 1;
           operand_request_push[SlideAddrGenA] = pe_req_i.op == VLXE;
         end
 
         VFU_StoreUnit : begin
-          operand_request_i[StA] = '{
+          // vstart is supported here
+          operand_request[StA] = '{
             id      : pe_req.id,
             vs      : pe_req.vs1,
             eew     : pe_req.eew_vs1,
@@ -481,28 +486,34 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             hazard  : pe_req.hazard_vs1 | pe_req.hazard_vd,
             default : '0
           };
-          if (operand_request_i[StA].vl * NrLanes != pe_req.vl) operand_request_i[StA].vl += 1;
+          // vl is not an integer multiple of NrLanes
+          // I.e., ( ( pe_req.vl / NrLanes * NrLanes ) != pe_req.vl ) <=> ( ( pe_req.vl % NrLanes ) != 0 )
+          if ( ( operand_request[StA].vl * NrLanes ) != pe_req.vl ) begin : tweak_vl_StA
+            operand_request[StA].vl += 1;
+          end : tweak_vl_StA
           operand_request_push[StA] = pe_req.use_vs1;
 
           // This vector instruction uses masks
-          operand_request_i[MaskM] = '{
+          // TODO: add vstart support here
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
             vtype  : pe_req.vtype,
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew),
+            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1;
+          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
+              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
 
           // Store indexed
-          operand_request_i[SlideAddrGenA] = '{
+          // TODO: add vstart support here
+          operand_request[SlideAddrGenA] = '{
             id       : pe_req_i.id,
             vs       : pe_req_i.vs2,
             eew      : pe_req_i.eew_vs2,
@@ -517,13 +528,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
-          if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl)
-            operand_request_i[SlideAddrGenA].vl += 1;
+          if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin : tweak_vl_SlideAddrGenA
+            operand_request[SlideAddrGenA].vl += 1;
+          end : tweak_vl_SlideAddrGenA
           operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE;
         end
 
         VFU_SlideUnit: begin
-          operand_request_i[SlideAddrGenA] = '{
+          operand_request[SlideAddrGenA] = '{
             id       : pe_req.id,
             vs       : pe_req.vs2,
             eew      : pe_req.eew_vs2,
@@ -543,7 +555,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
               // as operands by the slide unit.
               // Since this request goes outside of the lane, we might need to request an
               // extra operand regardless of whether it is valid in this lane or not.
-              operand_request_i[SlideAddrGenA].vl =
+              operand_request[SlideAddrGenA].vl =
               (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes;
             end
             VSLIDEDOWN: begin
@@ -554,7 +566,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
               // We need to trim full words from the start of the vector that are not used
               // as operands by the slide unit.
-              operand_request_i[SlideAddrGenA].vstart = pe_req.stride / NrLanes;
+              operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes;
 
               // The stride move the initial address in boundaries of 8*NrLanes Byte.
               // If the stride is not multiple of a full VRF word (8*NrLanes Byte),
@@ -576,15 +588,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
                 vl_tot += extra_stride;
 
               // Ask the elements, and ask one more if we do not perfectly divide NrLanes
-              operand_request_i[SlideAddrGenA].vl = vl_tot / NrLanes;
-              if (operand_request_i[SlideAddrGenA].vl * NrLanes != vl_tot)
-                operand_request_i[SlideAddrGenA].vl += 1;
+              operand_request[SlideAddrGenA].vl = vl_tot / NrLanes;
+              if (operand_request[SlideAddrGenA].vl * NrLanes != vl_tot)
+                operand_request[SlideAddrGenA].vl += 1;
             end
             default:;
           endcase
 
           // This vector instruction uses masks
-          operand_request_i[MaskM] = '{
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
@@ -601,32 +613,32 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
               // as operands by the slide unit.
               // Since this request goes outside of the lane, we might need to request an
               // extra operand regardless of whether it is valid in this lane or not.
-              operand_request_i[MaskM].vl =
+              operand_request[MaskM].vl =
               ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes)
-              >> int'(pe_req.vtype.vsew);
+              >> unsigned'(pe_req.vtype.vsew);
 
-              if (((operand_request_i[MaskM].vl + pe_req.stride) <<
-                    int'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl))
-                operand_request_i[MaskM].vl += 1;
+              if (((operand_request[MaskM].vl + pe_req.stride) <<
+                    unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl))
+                operand_request[MaskM].vl += 1;
 
               // SLIDEUP only uses mask bits whose indices are > stride
               // Don't send the previous (unused) ones to the MASKU
               if (pe_req.stride >= NrLanes * 64)
-                operand_request_i[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8;
+                operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8;
             end
             VSLIDEDOWN: begin
               // Since this request goes outside of the lane, we might need to request an
               // extra operand regardless of whether it is valid in this lane or not.
-              operand_request_i[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> int'(
+              operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'(
                     pe_req.vtype.vsew));
-              if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) *
+              if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
                   NrLanes * 8 != pe_req.vl)
-                operand_request_i[MaskM].vl += 1;
+                operand_request[MaskM].vl += 1;
             end
           endcase
         end
         VFU_MaskUnit: begin
-          operand_request_i[AluA] = '{
+          operand_request[AluA] = '{
             id      : pe_req.id,
             vs      : pe_req.vs1,
             eew     : pe_req.eew_vs1,
@@ -640,21 +652,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           // This is an operation that runs normally on the ALU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
           if (pe_req.op inside {[VMSEQ:VMSBC]}) begin
-            operand_request_i[AluA].vl = vfu_operation_d.vl;
+            operand_request[AluA].vl = vfu_operation_d.vl;
           end
           // This is an operation that runs normally on the ALU, and then gets reshuffled at the
           // Mask Unit.
           else begin
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            operand_request_i[AluA].vl = (pe_req.vl / NrLanes) >>
-            (int'(EW64) - int'(pe_req.eew_vs1));
-            if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes !=
-                pe_req.vl) operand_request_i[AluA].vl += 1;
+            operand_request[AluA].vl = (pe_req.vl / NrLanes) >>
+            (unsigned'(EW64) - unsigned'(pe_req.eew_vs1));
+            if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes !=
+                pe_req.vl) operand_request[AluA].vl += 1;
           end
           operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]});
 
-          operand_request_i[AluB] = '{
+          operand_request[AluB] = '{
             id      : pe_req.id,
             vs      : pe_req.vs2,
             eew     : pe_req.eew_vs2,
@@ -667,21 +679,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           // This is an operation that runs normally on the ALU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
           if (pe_req.op inside {[VMSEQ:VMSBC]}) begin
-            operand_request_i[AluB].vl = vfu_operation_d.vl;
+            operand_request[AluB].vl = vfu_operation_d.vl;
           end
           // This is an operation that runs normally on the ALU, and then gets reshuffled at the
           // Mask Unit.
           else begin
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            operand_request_i[AluB].vl = (pe_req.vl / NrLanes) >>
-            (int'(EW64) - int'(pe_req.eew_vs2));
-            if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes !=
-                pe_req.vl) operand_request_i[AluB].vl += 1;
+            operand_request[AluB].vl = (pe_req.vl / NrLanes) >>
+            (unsigned'(EW64) - unsigned'(pe_req.eew_vs2));
+            if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes !=
+                pe_req.vl) operand_request[AluB].vl += 1;
           end
           operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]});
 
-          operand_request_i[MulFPUA] = '{
+          operand_request[MulFPUA] = '{
             id      : pe_req.id,
             vs      : pe_req.vs1,
             eew     : pe_req.eew_vs1,
@@ -694,10 +706,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
           // This is an operation that runs normally on the ALU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
-          operand_request_i[MulFPUA].vl = vfu_operation_d.vl;
+          operand_request[MulFPUA].vl = vfu_operation_d.vl;
           operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]};
 
-          operand_request_i[MulFPUB] = '{
+          operand_request[MulFPUB] = '{
             id      : pe_req.id,
             vs      : pe_req.vs2,
             eew     : pe_req.eew_vs2,
@@ -709,10 +721,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           // This is an operation that runs normally on the ALU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
-          operand_request_i[MulFPUB].vl = vfu_operation_d.vl;
+          operand_request[MulFPUB].vl = vfu_operation_d.vl;
           operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]};
 
-          operand_request_i[MaskB] = '{
+          operand_request[MaskB] = '{
             id      : pe_req.id,
             vs      : pe_req.vd,
             eew     : pe_req.eew_vd_op,
@@ -720,16 +732,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             vtype   : pe_req.vtype,
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            vl      : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)),
+            vl      : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)),
             vstart  : vfu_operation_d.vstart,
             hazard  : pe_req.hazard_vd,
             default : '0
           };
           if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) !=
-            pe_req.vl) operand_request_i[MaskB].vl += 1;
+            pe_req.vl) operand_request[MaskB].vl += 1;
           operand_request_push[MaskB] = pe_req.use_vd_op;
 
-          operand_request_i[MaskM] = '{
+          operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
             eew    : pe_req.vtype.vsew,
@@ -741,13 +753,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             hazard : pe_req.hazard_vm,
             default: '0
           };
-          if ((operand_request_i[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin
-            operand_request_i[MaskM].vl += 1;
+          if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin
+            operand_request[MaskM].vl += 1;
           end
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_None: begin
-          operand_request_i[MaskB] = '{
+          operand_request[MaskB] = '{
             id         : pe_req.id,
             vs         : pe_req.vs2,
             eew        : pe_req.eew_vs2,
@@ -763,8 +775,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request_push[MaskB] = 1'b1;
         end
         default:;
-      endcase
-    end
+      endcase // pe_req.vfu
+    end : pe_req_valid
   end: sequencer
 
   always_ff @(posedge clk_i or negedge rst_ni) begin: p_sequencer_ff
diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index 72c8202e1..9b8c1464c 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -127,7 +127,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   ///////////////////////
 
   // Count how many operands were already produced
-  vlen_t vl_d, vl_q;
+  vlen_t elem_count_d, elem_count_q;
 
   elen_t                            conv_operand;
   // Decide whether we are taking the operands from the lower or from the upper half of the input
@@ -226,23 +226,23 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
       end
 
       // Assert the signal if the last 64-bit packet will contain also
-      // elements with idx >= vl (they should not contribute to the result!).
+      // elements with idx >= elem_count (they should not contribute to the result!).
       // Gate for power saving
       // Power optimization:
       // The optimal solution would be to act on the mask bits in the two
       // processing units (valu and vmfpu), masking the unused elements.
       unique case (cmd.eew)
         EW8 : begin
-          incomplete_packet = |cmd.vl[2:0];
-          last_packet       = ((cmd.vl - vl_q) <= 8) ? 1'b1 : 1'b0;
+          incomplete_packet = |cmd.elem_count[2:0];
+          last_packet       = ((cmd.elem_count - elem_count_q) <= 8) ? 1'b1 : 1'b0;
         end
         EW16: begin
-          incomplete_packet = |cmd.vl[1:0];
-          last_packet       = ((cmd.vl - vl_q) <= 4) ? 1'b1 : 1'b0;
+          incomplete_packet = |cmd.elem_count[1:0];
+          last_packet       = ((cmd.elem_count - elem_count_q) <= 4) ? 1'b1 : 1'b0;
         end
         EW32: begin
-          incomplete_packet = |cmd.vl[0:0];
-          last_packet       = ((cmd.vl - vl_q) <= 2) ? 1'b1 : 1'b0;
+          incomplete_packet = |cmd.elem_count[0:0];
+          last_packet       = ((cmd.elem_count - elem_count_q) <= 2) ? 1'b1 : 1'b0;
         end
         default: begin
           incomplete_packet = 1'b0;
@@ -373,15 +373,15 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
           if (SupportNtrVal) unique case (cmd.eew)
             EW8 : for (int unsigned b = 0; b < 8; b++) begin
                     automatic int unsigned bs = shuffle_index(b, 1, EW8);
-                    if ((b >> 0) >= cmd.vl[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
+                    if ((b >> 0) >= cmd.elem_count[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
                   end
             EW16: for (int unsigned b = 0; b < 8; b++) begin
                     automatic int unsigned bs = shuffle_index(b, 1, EW16);
-                    if ((b >> 1) >= cmd.vl[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
+                    if ((b >> 1) >= cmd.elem_count[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
                   end
             EW32: for (int unsigned b = 0; b < 8; b++) begin
                     automatic int unsigned bs = shuffle_index(b, 1, EW32);
-                    if ((b >> 2) >= cmd.vl[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
+                    if ((b >> 2) >= cmd.elem_count[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b];
                   end
             default:;
           endcase
@@ -401,7 +401,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
 
     // Maintain state
     select_d = select_q;
-    vl_d     = vl_q;
+    elem_count_d     = elem_count_q;
 
     // Send the operand
     operand_o       = conv_operand;
@@ -418,16 +418,16 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
         OpQueueConversionZExt2,
         OpQueueConversionWideFP2,
         OpQueueAdjustFPCvt:
-          if (SupportIntExt2) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 2;
+          if (SupportIntExt2) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 2;
         OpQueueConversionSExt4,
         OpQueueConversionZExt4:
-          if (SupportIntExt4) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 4;
+          if (SupportIntExt4) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 4;
         OpQueueConversionSExt8,
         OpQueueConversionZExt8:
-          if (SupportIntExt8) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 8;
+          if (SupportIntExt8) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 8;
         OpQueueReductionZExt:
-          vl_d = vl_q + 1;
-        default: vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew)));
+          elem_count_d = elem_count_q + 1;
+        default: elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew)));
       endcase
 
       // Update the pointer to the input operand
@@ -443,22 +443,22 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
       if ((select_q != '0 && select_d == '0) || cmd.conv == OpQueueConversionNone) ibuf_pop = 1'b1;
 
       // Finished execution
-      if (vl_d >= cmd.vl) begin
+      if (elem_count_d >= cmd.elem_count) begin : finished_elems
         ibuf_pop = 1'b1;
         cmd_pop  = 1'b1;
         select_d = '0;
-        vl_d     = '0;
-      end
+        elem_count_d     = '0;
+      end : finished_elems
     end
   end : obuf_control
 
   always_ff @(posedge clk_i or negedge rst_ni) begin: p_type_conversion_ff
     if (!rst_ni) begin
       select_q <= '0;
-      vl_q     <= '0;
+      elem_count_q     <= '0;
     end else begin
       select_q <= select_d;
-      vl_q     <= vl_d;
+      elem_count_q     <= elem_count_d;
     end
   end : p_type_conversion_ff
 
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index 54590fbc3..cbad2b2d5 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -199,7 +199,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
   //  Operand request  //
   ///////////////////////
 
-  // There is an operand requester for each operand queue. Each one
+  // There is an operand requester for each operand queue. Each one
   // can be in one of the following two states.
   typedef enum logic {
     IDLE,
@@ -223,216 +223,230 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
   logic     [NrBanks-1:0][NrMasters-1:0] operand_gnt;
   payload_t [NrMasters-1:0]              operand_payload;
 
-  for (genvar requester = 0; requester < NrOperandQueues; requester++) begin: gen_operand_requester
-    // State of this operand requester
+  // Metadata required to request all elements of this vector operand
+  typedef struct packed {
+    // ID of the instruction for this requester
+    vid_t id;
+    // Address of the next element to be read
+    vaddr_t addr;
+    // How many elements remain to be read
+    vlen_t len;
+    // Element width
+    vew_e vew;
+
+    // Hazards between vector instructions
+    logic [NrVInsn-1:0] hazard;
+
+    // Widening instructions produces two writes of every read
+    // In case of a WAW with a previous instruction,
+    // read once every two writes of the previous instruction
+    logic is_widening;
+    // One-bit counters
+    logic [NrVInsn-1:0] waw_hazard_counter;
+  } requester_metadata_t;
+
+  for (genvar requester_index = 0; requester_index < NrOperandQueues; requester_index++) begin : gen_operand_requester
+    // State of this operand requester
     state_t state_d, state_q;
 
-    // Metadata required to request all elements of this vector operand
-    struct packed {
-      // ID of the instruction for this requester
-      vid_t id;
-      // Address of the next element to be read
-      vaddr_t addr;
-      // How many elements remain to be read
-      vlen_t len;
-      // Element width
-      vew_e vew;
-
-      // Hazards between vector instructions
-      logic [NrVInsn-1:0] hazard;
-
-      // Widening instructions produces two writes of every read
-      // In case of a WAW with a previous instruction,
-      // read once every two writes of the previous instruction
-      logic is_widening;
-      // One-bit counters
-      logic [NrVInsn-1:0] waw_hazard_counter;
-    } requester_d, requester_q;
-
+    requester_metadata_t requester_metadata_d, requester_metadata_q;
 
     // Is there a hazard during this cycle?
     logic stall;
-    assign stall = |(requester_q.hazard & ~(vinsn_result_written_q &
-                   (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter)));
+    assign stall = |(requester_metadata_q.hazard & ~(vinsn_result_written_q &
+                   (~{NrVInsn{requester_metadata_q.is_widening}} | requester_metadata_q.waw_hazard_counter)));
 
     // Did we get a grant?
     logic [NrBanks-1:0] operand_requester_gnt;
     for (genvar bank = 0; bank < NrBanks; bank++) begin: gen_operand_requester_gnt
-      assign operand_requester_gnt[bank] = operand_gnt[bank][requester];
+      assign operand_requester_gnt[bank] = operand_gnt[bank][requester_index];
     end
 
     // Did we issue a word to this operand queue?
-    assign operand_issued_o[requester] = |(operand_requester_gnt);
+    assign operand_issued_o[requester_index] = |(operand_requester_gnt);
 
     always_comb begin: operand_requester
+      // Helper local variables
+      automatic operand_queue_cmd_t  operand_queue_cmd_tmp;
+      automatic requester_metadata_t requester_metadata_tmp;
+      automatic vlen_t               vector_body_length;
+      automatic vlen_t               scaled_vector_body_length;
+      automatic vlen_t               effective_vector_body_length;
+      automatic vaddr_t              vrf_addr;
+
       // Maintain state
       state_d     = state_q;
-      requester_d = requester_q;
+      requester_metadata_d = requester_metadata_q;
 
       // Make no requests to the VRF
-      operand_payload[requester] = '0;
-      for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0;
+      operand_payload[requester_index] = '0;
+      for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester_index] = 1'b0;
 
-      // Do not acknowledge any operand requester commands
-      operand_request_ready_o[requester] = 1'b0;
+      // Do not acknowledge any operand requester commands
+      operand_request_ready_o[requester_index] = 1'b0;
 
       // Do not send any operand conversion commands
-      operand_queue_cmd_o[requester]       = '0;
-      operand_queue_cmd_valid_o[requester] = 1'b0;
+      operand_queue_cmd_o[requester_index]       = '0;
+      operand_queue_cmd_valid_o[requester_index] = 1'b0;
+
+      // Prepare metadata upfront
+      // Length of vector body in elements, i.e., vl - vstart
+      vector_body_length = operand_request_i[requester_index].vl - operand_request_i[requester_index].vstart;
+      // For memory operations, the number of elements initially refers to the new EEW (vsew here),
+      // but the requester must refer to the old EEW (eew here)
+      // This reasoning cannot be applied also to widening instructions, which modify vsew
+      // treating it as the EEW of vd
+      scaled_vector_body_length = (
+                                   vector_body_length
+                                    << operand_request_i[requester_index].vtype.vsew
+                                  ) >> operand_request_i[requester_index].eew;
+      // Final computed length
+      effective_vector_body_length = ( operand_request_i[requester_index].scale_vl )
+                                      ? scaled_vector_body_length
+                                      : vector_body_length;
+      // Address of the vstart element of the vector in the VRF
+      vrf_addr = vaddr(operand_request_i[requester_index].vs, NrLanes)
+                  + (
+                      operand_request_i[requester_index].vstart
+                      >> (unsigned'(EW64) - unsigned'(operand_request_i[requester_index].eew))
+                    );
+      // Metadata latched when a request is accepted; fields not listed (waw_hazard_counter) default to zero
+      requester_metadata_tmp = '{
+        id          : operand_request_i[requester_index].id,
+        addr        : vrf_addr,
+        len         : effective_vector_body_length,
+        vew         : operand_request_i[requester_index].eew,
+        hazard      : operand_request_i[requester_index].hazard,
+        is_widening : operand_request_i[requester_index].cvt_resize == CVT_WIDE,
+        default: '0
+      };
+      operand_queue_cmd_tmp = '{
+        eew       : operand_request_i[requester_index].eew,
+        elem_count: effective_vector_body_length,
+        conv      : operand_request_i[requester_index].conv,
+        ntr_red   : operand_request_i[requester_index].cvt_resize,
+        target_fu : operand_request_i[requester_index].target_fu,
+        is_reduct : operand_request_i[requester_index].is_reduct
+      };
 
       case (state_q)
-        IDLE: begin
+        IDLE: begin : state_q_IDLE
           // Accept a new instruction
-          if (operand_request_valid_i[requester]) begin
+          if (operand_request_valid_i[requester_index]) begin : op_req_valid
             state_d                            = REQUESTING;
             // Acknowledge the request
-            operand_request_ready_o[requester] = 1'b1;
+            operand_request_ready_o[requester_index] = 1'b1;
 
             // Send a command to the operand queue
-            operand_queue_cmd_o[requester] = '{
-              eew : operand_request_i[requester].eew,
-              // For memory operations, the number of elements initially refers to the new EEW (vsew here),
-              // but the requester must refer to the old EEW (eew here)
-              // This reasoning cannot be applied also to widening instructions, which modify vsew
-              // treating it as the EEW of vd
-              vl       : (operand_request_i[requester].scale_vl) ?
-                           ((operand_request_i[requester].vl <<
-                           operand_request_i[requester].vtype.vsew) >>
-                           operand_request_i[requester].eew) :
-                           operand_request_i[requester].vl,
-              conv     : operand_request_i[requester].conv,
-              ntr_red  : operand_request_i[requester].cvt_resize,
-              target_fu: operand_request_i[requester].target_fu,
-              is_reduct: operand_request_i[requester].is_reduct
-            };
+            operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp;
+            operand_queue_cmd_valid_o[requester_index] = 1'b1;
+
             // The length should be at least one after the rescaling
-            if (operand_queue_cmd_o[requester].vl == '0)
-              operand_queue_cmd_o[requester].vl = 1;
-            operand_queue_cmd_valid_o[requester] = 1'b1;
+            if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl
+              operand_queue_cmd_o[requester_index].elem_count = 1;
+            end : cmd_zero_rescaled_vl
 
             // Store the request
-            requester_d = '{
-              id     : operand_request_i[requester].id,
-              addr   : vaddr(operand_request_i[requester].vs, NrLanes) +
-              (operand_request_i[requester].vstart >>
-                (int'(EW64) - int'(operand_request_i[requester].eew))),
-              // For memory operations, the number of elements initially refers to the new EEW (vsew here),
-              // but the requester must refer to the old EEW (eew here)
-              // This reasoning cannot be applied also to widening instructions, which modify vsew
-              // treating it as the EEW of vd
-              len         : (operand_request_i[requester].scale_vl) ?
-                              ((operand_request_i[requester].vl <<
-                              operand_request_i[requester].vtype.vsew) >>
-                              operand_request_i[requester].eew) :
-                              operand_request_i[requester].vl,
-              vew         : operand_request_i[requester].eew,
-              hazard      : operand_request_i[requester].hazard,
-              is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE,
-              default: '0
-            };
+            requester_metadata_d = requester_metadata_tmp;
+
             // The length should be at least one after the rescaling
-            if (requester_d.len == '0)
-              requester_d.len = 1;
+            if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl
+              requester_metadata_d.len = 1;
+            end : req_zero_rescaled_vl
+
 
             // Mute the requisition if the vl is zero
-            if (operand_request_i[requester].vl == '0) begin
+            if (operand_request_i[requester_index].vl == '0) begin : zero_vl
               state_d                              = IDLE;
-              operand_queue_cmd_valid_o[requester] = 1'b0;
-            end
-          end
-        end
+              operand_queue_cmd_valid_o[requester_index] = 1'b0;
+            end : zero_vl
+          end : op_req_valid
+        end : state_q_IDLE
 
-        REQUESTING: begin
+        REQUESTING: begin : state_q_REQUESTING
           // Update waw counters
-          for (int b = 0; b < NrVInsn; b++)
-            if (vinsn_result_written_d[b])
-              requester_d.waw_hazard_counter[b] = ~requester_q.waw_hazard_counter[b];
+          for (int b = 0; b < NrVInsn; b++) begin : waw_counters_update
+            if ( vinsn_result_written_d[b] ) begin : result_valid
+              requester_metadata_d.waw_hazard_counter[b] = ~requester_metadata_q.waw_hazard_counter[b];
+            end : result_valid
+          end : waw_counters_update
 
-          if (operand_queue_ready_i[requester]) begin
+          if (operand_queue_ready_i[requester_index]) begin : op_queue_ready
             // Bank we are currently requesting
-            automatic int bank = requester_q.addr[idx_width(NrBanks)-1:0];
+            automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0];
+            automatic vlen_t num_elements; // elements covered by one granted read at the current vew
 
             // Operand request
-            operand_req[bank][requester] = !stall;
-            operand_payload[requester]   = '{
-              addr   : requester_q.addr >> $clog2(NrBanks),
-              opqueue: opqueue_e'(requester),
-              default: '0
+            operand_req[bank][requester_index] = !stall;
+            operand_payload[requester_index]   = '{
+              addr   : requester_metadata_q.addr >> $clog2(NrBanks),
+              opqueue: opqueue_e'(requester_index),
+              default: '0 // this is a read operation
             };
 
             // Received a grant.
-            if (|operand_requester_gnt) begin
+            if (|operand_requester_gnt) begin : op_req_grant
               // Bump the address pointer
-              requester_d.addr = requester_q.addr + 1'b1;
+              requester_metadata_d.addr = requester_metadata_q.addr + 1'b1;
 
               // We read less than 64 bits worth of elements
-              if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew))))
-                requester_d.len    = 0;
-              else requester_d.len = requester_q.len - (1 << (int'(EW64) - int'(requester_q.vew)));
-            end
+              num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
+              if (requester_metadata_q.len < num_elements) begin
+                requester_metadata_d.len    = 0;
+              end
+              else begin
+                requester_metadata_d.len = requester_metadata_q.len - num_elements;
+              end
+            end : op_req_grant
 
             // Finished requesting all the elements
-            if (requester_d.len == '0) begin
+            if (requester_metadata_d.len == '0) begin : req_finished
               state_d = IDLE;
 
               // Accept a new instruction
-              if (operand_request_valid_i[requester]) begin
+              if (operand_request_valid_i[requester_index]) begin : op_req_valid
                 state_d                            = REQUESTING;
                 // Acknowledge the request
-                operand_request_ready_o[requester] = 1'b1;
+                operand_request_ready_o[requester_index] = 1'b1;
 
                 // Send a command to the operand queue
-                operand_queue_cmd_o[requester] = '{
-                  eew      : operand_request_i[requester].eew,
-                  vl       : (operand_request_i[requester].scale_vl) ?
-                               ((operand_request_i[requester].vl <<
-                               operand_request_i[requester].vtype.vsew) >>
-                               operand_request_i[requester].eew) :
-                               operand_request_i[requester].vl,
-                  conv     : operand_request_i[requester].conv,
-                  ntr_red  : operand_request_i[requester].cvt_resize,
-                  target_fu: operand_request_i[requester].target_fu,
-                  is_reduct: operand_request_i[requester].is_reduct
-                };
-                operand_queue_cmd_valid_o[requester] = 1'b1;
+                operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp;
+                operand_queue_cmd_valid_o[requester_index] = 1'b1;
+
                 // The length should be at least one after the rescaling
-                if (operand_queue_cmd_o[requester].vl == '0)
-                  operand_queue_cmd_o[requester].vl = 1;
+                if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl
+                  operand_queue_cmd_o[requester_index].elem_count = 1;
+                end : cmd_zero_rescaled_vl
 
                 // Store the request
-                requester_d = '{
-                  id   : operand_request_i[requester].id,
-                  addr : vaddr(operand_request_i[requester].vs, NrLanes) +
-                  (operand_request_i[requester].vstart >>
-                    (int'(EW64) - int'(operand_request_i[requester].eew))),
-                  len    : (operand_request_i[requester].scale_vl) ?
-                             ((operand_request_i[requester].vl <<
-                             operand_request_i[requester].vtype.vsew) >>
-                             operand_request_i[requester].eew) :
-                             operand_request_i[requester].vl,
-                  vew    : operand_request_i[requester].eew,
-                  hazard : operand_request_i[requester].hazard,
-                  default: '0
-                };
+                requester_metadata_d = requester_metadata_tmp;
+
                 // The length should be at least one after the rescaling
-                if (requester_d.len == '0)
-                  requester_d.len = 1;
-              end
-            end
-          end
-        end
-      endcase
+                if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl
+                  requester_metadata_d.len = 1;
+                end : req_zero_rescaled_vl
+                
+                // Mute the requisition if the vl is zero
+                if (operand_request_i[requester_index].vl == '0) begin : zero_vl
+                  state_d                              = IDLE;
+                  operand_queue_cmd_valid_o[requester_index] = 1'b0;
+                end : zero_vl
+              end : op_req_valid
+            end : req_finished
+          end : op_queue_ready
+        end : state_q_REQUESTING
+      endcase // state_q
       // Always keep the hazard bits up to date with the global hazard table
-      requester_d.hazard &= global_hazard_table_i[requester_d.id];
+      requester_metadata_d.hazard &= global_hazard_table_i[requester_metadata_d.id];
     end : operand_requester
 
     always_ff @(posedge clk_i or negedge rst_ni) begin
       if (!rst_ni) begin
         state_q     <= IDLE;
-        requester_q <= '0;
+        requester_metadata_q <= '0;
       end else begin
         state_q     <= state_d;
-        requester_q <= requester_d;
+        requester_metadata_q <= requester_metadata_d;
       end
     end
   end : gen_operand_requester
@@ -452,7 +466,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       operand_req[bank][NrOperandQueues + VFU_LoadUnit]  = 1'b0;
     end
 
-    // Generate the payload
+    // Generate the payloads for write back operations
     operand_payload[NrOperandQueues + VFU_Alu] = '{
       addr   : alu_result_addr_i >> $clog2(NrBanks),
       wen    : 1'b1,
@@ -523,7 +537,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
     logic payload_hp_req;
     logic payload_hp_gnt;
     rr_arb_tree #(
-      .NumIn    (int'(MulFPUC) - int'(AluA) + 1 + int'(VFU_MFpu) - int'(VFU_Alu) + 1),
+      .NumIn    (unsigned'(MulFPUC) - unsigned'(AluA) + 1 + unsigned'(VFU_MFpu) - unsigned'(VFU_Alu) + 1),
       .DataWidth($bits(payload_t)                                                   ),
       .AxiVldRdy(1'b0                                                               )
     ) i_hp_vrf_arbiter (
@@ -548,7 +562,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
     logic payload_lp_req;
     logic payload_lp_gnt;
     rr_arb_tree #(
-      .NumIn(int'(SlideAddrGenA)- int'(MaskB) + 1 + int'(VFU_LoadUnit) - int'(VFU_SlideUnit) + 1),
+      .NumIn(unsigned'(SlideAddrGenA)- unsigned'(MaskB) + 1 + unsigned'(VFU_LoadUnit) - unsigned'(VFU_SlideUnit) + 1),
       .DataWidth($bits(payload_t)                                                               ),
       .AxiVldRdy(1'b0                                                                           )
     ) i_lp_vrf_arbiter (
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 8d8b1024d..369784f78 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -449,7 +449,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
+              automatic vlen_t vector_body_length = vinsn_issue_q.vl - vinsn_issue_q.vstart;
+              
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
 
@@ -465,7 +467,12 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
               // Store the result in the result queue
               result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result;
-              result_queue_d[result_queue_write_pnt_q].addr  = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew));
+              result_queue_d[result_queue_write_pnt_q].addr  = vaddr(vinsn_issue_q.vd, NrLanes) 
+                                                                + ( 
+                                                                    ( vinsn_issue_q.vl - issue_cnt_q ) // vstart is already considered in issue_cnt_q
+                                                                      >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)
+                                                                    )
+                                                                  );
               result_queue_d[result_queue_write_pnt_q].id    = vinsn_issue_q.id;
               result_queue_d[result_queue_write_pnt_q].mask  = vinsn_issue_q.vfu == VFU_MaskUnit;
               if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q)
@@ -474,7 +481,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
               // Is this a narrowing instruction?
               if (narrowing(vinsn_issue_q.op)) begin
                 // How many elements did we calculate in this iteration?
-                automatic logic [3:0] element_cnt_narrow = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))) / 2;
+                automatic logic [3:0] element_cnt_narrow = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))) / 2;
                 if (element_cnt_narrow > issue_cnt_q)
                   element_cnt_narrow = issue_cnt_q;
 
@@ -523,12 +530,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
                 // Assign vector length for next instruction in the instruction queue
                 if (vinsn_queue_d.issue_cnt != 0) begin
+                  automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
+                                                        - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart;
                   if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                    issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
+                    issue_cnt_d = vector_body_length;
                   else begin
-                    issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
+                    $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
+                    issue_cnt_d = (vector_body_length / 8) >>
                       vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                    issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
+                    issue_cnt_d += |vector_body_length[2:0];
                   end
                 end
               end
@@ -547,7 +557,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
 
@@ -654,12 +664,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
           // Assign vector length for next instruction in the instruction queue
           if (vinsn_queue_d.issue_cnt != 0) begin
+            automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
+                                                  - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart;
             if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-              issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
+              issue_cnt_d = vector_body_length;
             else begin
-              issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
+              $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
+              issue_cnt_d = (vector_body_length / 8) >>
                 vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-              issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
+              issue_cnt_d += |vector_body_length[2:0];
             end
           end
 
@@ -690,12 +703,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
               // Assign vector length for next instruction in the instruction queue
               if (vinsn_queue_d.issue_cnt != 0) begin
+                automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
+                                                      - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart;
                 if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                  issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
+                  issue_cnt_d = vector_body_length;
                 else begin
-                  issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
+                  $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
+                  issue_cnt_d = (vector_body_length / 8) >>
                     vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                  issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
+                  issue_cnt_d += |vector_body_length[2:0];
                 end
               end
 
@@ -750,8 +766,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       // Decrement the counter of remaining vector elements waiting to be written
       // Don't do it in case of a reduction
       if (!is_reduction(vinsn_commit.op))
-        commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew));
-      if (commit_cnt_q < (1 << (int'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0;
+        commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew));
+      if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0;
     end
 
     // Finished committing the results of a vector instruction
@@ -765,16 +781,20 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       else vinsn_queue_d.commit_pnt += 1;
 
       // Update the commit counter for the next instruction
-      if (vinsn_queue_d.commit_cnt != '0)
+      if (vinsn_queue_d.commit_cnt != '0) begin
+        automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl
+                                              - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart;
         if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
+          commit_cnt_d = vector_body_length;
         else begin
           // We are asking for bits, and we want at least one chunk of bits if
           // vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew)
-          commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >>
+          $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
+          commit_cnt_d = (vector_body_length / 8) >>
             vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew;
-          commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0];
+          commit_cnt_d += |vector_body_length[2:0];
         end
+      end
 
       // Initialize counters and alu state if needed by the next instruction
       // After a reduction, the next instructions starts after the reduction commits
@@ -796,14 +816,18 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
     if (!vinsn_queue_full && vfu_operation_valid_i &&
       (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin
+      automatic vlen_t vector_body_length = vfu_operation_i.vl - vfu_operation_i.vstart;
+      
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
       // Do not wait for masks if, during a reduction, this lane is just a pass-through
       // The only valid instructions here with vl == '0 are reductions
+      // TODO: check if vector_body_length should be used instead of plain vl here
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0);
 
       // Initialize counters and alu state if the instruction queue was empty
       // and the lane is not reducing
       if ((vinsn_queue_d.issue_cnt == '0) && !prevent_commit) begin
+
         alu_state_d = is_reduction(vfu_operation_i.op) ? INTRA_LANE_REDUCTION : NO_REDUCTION;
         // The next will be the first operation of this instruction
         // This information is useful for reduction operation
@@ -812,22 +836,24 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
         reduction_rx_cnt_d      = reduction_rx_cnt_init(NrLanes, lane_id_i);
         sldu_transactions_cnt_d = $clog2(NrLanes) + 1;
 
-        issue_cnt_d = vfu_operation_i.vl;
+        issue_cnt_d = vector_body_length;
         if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          issue_cnt_d = vfu_operation_i.vl;
+          issue_cnt_d = vector_body_length;
         else begin
-          issue_cnt_d = (vfu_operation_i.vl / 8) >>
+          $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
+          issue_cnt_d = (vector_body_length / 8) >>
             vfu_operation_i.vtype.vsew;
-          issue_cnt_d += |vfu_operation_i.vl[2:0];
+          issue_cnt_d += |vector_body_length[2:0];
         end
       end
       if (vinsn_queue_d.commit_cnt == '0)
         if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vfu_operation_i.vl;
+          commit_cnt_d = vector_body_length;
         else begin
+          $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
           // Operations between mask vectors operate on bits
-          commit_cnt_d  = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew;
-          commit_cnt_d += |vfu_operation_i.vl[2:0];
+          commit_cnt_d  = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew;
+          commit_cnt_d += |vector_body_length[2:0];
         end
 
       // Bump pointers and counters of the vector instruction queue
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index 0ea8c52b9..eaf66bba1 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -65,6 +65,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
     output logic                           addrgen_operand_ready_o
   );
 
+  localparam unsigned DataWidth = $bits(elen_t);
+  localparam unsigned DataWidthB = DataWidth / 8;
 
   ///////////////////
   //  Assignments  //
@@ -272,7 +274,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
               endcase
 
               // Load element counter
-              idx_op_cnt_d = pe_req_i.vl;
+              idx_op_cnt_d = pe_req_i.vl - pe_req_i.vstart;
             end
             default: state_d = ADDRGEN;
           endcase // pe_req_i.op
@@ -300,10 +302,23 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
           addrgen_exception_o.tval  = '0;
         end : eew_misaligned_error
         else begin : address_valid
+          // NOTE: indexed are not covered here          
+          automatic logic [riscv::VLEN-1:0] vaddr_start;
+
+          case ( pe_req_q.op )
+            // Unit-stride: address = base + (vstart in elements)
+            VLE,  VSE : vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew) ); 
+            // Strided: address = base + (vstart * stride)
+            // NOTE: this multiplier might cause some timing issues
+            VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart * pe_req_q.stride ) ;
+            // Indexed: let the next stage take care of vstart
+            VLXE, VSXE: vaddr_start = pe_req_q.scalar_op;
+            default   : vaddr_start = '0;
+          endcase // pe_req_q.op 
 
           addrgen_req = '{
-            addr    : pe_req_q.scalar_op,
-            len     : pe_req_q.vl ,
+            addr    : vaddr_start,
+            len     : pe_req_q.vl - pe_req_q.vstart,
             stride  : pe_req_q.stride,
             vew     : pe_req_q.vtype.vsew,
             is_load : is_load(pe_req_q.op),
@@ -329,12 +344,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
       end : ADDRGEN
 
       ADDRGEN_IDX_OP: begin : ADDRGEN_IDX_OP
+        // NOTE: vstart is not supported for indexed operations
+        //       the logic should be introduced:
+        //       1. in the addrgen_operand_i operand read
+        //       2. in idx_vaddr computation
+        automatic logic [NrLanes-1:0] addrgen_operand_valid;
+
         // Stall the interface until the operation is over to catch possible exceptions
 
         // Every address can generate an exception
         addrgen_req = '{
           addr    : pe_req_q.scalar_op, 
-          len     : pe_req_q.vl,
+          len     : pe_req_q.vl - pe_req_q.vstart,
           stride  : pe_req_q.stride,
           vew     : pe_req_q.vtype.vsew,
           is_load : is_load(pe_req_q.op),
@@ -343,10 +364,24 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
         };
         addrgen_req_valid = 1'b1;
 
+        // Adjust valid signals to the next block "operands_ready"
+        addrgen_operand_valid = addrgen_operand_valid_i;
+        for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid
+          // - We are left with fewer bytes than the maximum to issue,
+          //    this means that at least one lane is not going to push us any operand anymore
+          // - For the lanes which index % NrLanes != 0
+          if ( ( ( idx_op_cnt_q << pe_req_q.vtype.vsew ) < (NrLanes * DataWidthB) )
+                & ( lane < pe_req_q.vstart[idx_width(NrLanes)-1:0] )
+                ) begin : vstart_lane_adjust
+            addrgen_operand_valid[lane] |= 1'b1;
+          end : vstart_lane_adjust
+        end : adjust_operand_valid
+        // TODO: apply the same vstart logic also to mask_valid_i
+    
         // Handle handshake and data between VRF and spill register
         // We accept all the incoming data, without any checks
         // since Ara stalls on an indexed memory operation
-        if (&addrgen_operand_valid_i & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin
+        if (&addrgen_operand_valid & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin
 
           // Valid data for the spill register
           idx_vaddr_valid_d = 1'b1;
@@ -388,6 +423,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
             // Consumed one element
             idx_op_cnt_d = idx_op_cnt_q - 1;
             // Have we finished a full NrLanes*64b word?
+            // TODO: check for the need of vstart logic here
             if (elm_ptr_q == last_elm_subw_q) begin
               // Bump lane pointer
               elm_ptr_d       = '0;
@@ -448,14 +484,12 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
           addrgen_exception_o = mmu_exception_q;
         end
       end : WAIT_LAST_TRANSLATION
-
     endcase // state_q
 
     if ( addrgen_exception_o.valid & addrgen_ack_o ) begin
       addrgen_exception_load_o  = is_load(pe_req_q.op);
       addrgen_exception_store_o = !is_load(pe_req_q.op);
     end
-
   end : addr_generation
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
@@ -936,6 +970,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
           end : start_req
         end : axi_ax_idle
       end : axi_addrgen_state_AXI_ADDRGEN_REQUESTING
+
       AXI_ADDRGEN_WAIT_TRANSLATION : begin : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION
         // keep request high
         mmu_req_o      = 1'b1;       
diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv
index 7c49f3af6..467ae4a70 100644
--- a/hardware/src/vlsu/vldu.sv
+++ b/hardware/src/vlsu/vldu.sv
@@ -193,6 +193,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
   // - A pointer to which byte in the full VRF word we are writing data into.
   logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q;
 
+  localparam unsigned DataWidthB = DataWidth / 8;
+  
+    vlen_t vstart_lane;
+
   always_comb begin: p_vldu
     // Maintain state
     vinsn_queue_d = vinsn_queue_q;
@@ -233,7 +237,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     // - The Address Generator sent us the data about the corresponding AR beat
     // - There is place in the result queue to write the data read from the R channel
     if (axi_r_valid_i && axi_addrgen_req_valid_i
-        && axi_addrgen_req_i.is_load && !result_queue_full) begin
+        && axi_addrgen_req_i.is_load && !result_queue_full) begin : axi_r_beat_read
       // Bytes valid in the current R beat
       // If non-unit strided load, we do not progress within the beat
       automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr,
@@ -244,42 +248,47 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
       // Is there a vector instruction ready to be issued?
       // Do we have the operands for it?
       if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin : operands_valid
-        // TODO: add vstart here (use issue/commit_cnt_bytes_q)
         // Account for the issued bytes
         // How many bytes are valid in this VRF word
-        automatic vlen_t vrf_valid_bytes   = (NrLanes * 8) - vrf_word_byte_pnt_q; 
+        automatic vlen_t vrf_valid_bytes   = (NrLanes * DataWidthB) - vrf_word_byte_pnt_q; 
         // How many bytes are valid in this instruction
         automatic vlen_t vinsn_valid_bytes = issue_cnt_bytes_q - vrf_word_byte_pnt_q;
         // How many bytes are valid in this AXI word
         automatic vlen_t axi_valid_bytes   = upper_byte - lower_byte - axi_r_byte_pnt_q + 1;
 
+
         // How many bytes are we committing?
         automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes;
-        valid_bytes = ( issue_cnt_bytes_q < (NrLanes * 8) ) ? vinsn_valid_bytes : vrf_valid_bytes;
-        // valid_bytes = ( valid_bytes       < axi_valid_bytes ) ? valid_bytes       : axi_valid_bytes;
-        if ( valid_bytes >= axi_valid_bytes ) begin : valid_bytes_overflow
-          valid_bytes = axi_valid_bytes;
-        end : valid_bytes_overflow
+        valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes;
+        valid_bytes = ( valid_bytes       < axi_valid_bytes        ) ? valid_bytes       : axi_valid_bytes;
 
+        // Bump R beat and VRF word pointers
         axi_r_byte_pnt_d   = axi_r_byte_pnt_q + valid_bytes;
         vrf_word_byte_pnt_d = vrf_word_byte_pnt_q + valid_bytes;
 
         // Copy data from the R channel into the result queue
-        for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue
+        for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue
           // Is this byte a valid byte in the R beat?
           if ( ( axi_byte >= ( lower_byte + axi_r_byte_pnt_q ) ) &&
                ( axi_byte <= upper_byte ) 
               ) begin : is_axi_r_byte
             // Map axi_byte to the corresponding byte in the VRF word (sequential)
-            automatic int vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q;
+            automatic int unsigned vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q;
             // And then shuffle it
-            automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew);
+            automatic int unsigned vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew);
 
             // Is this byte a valid byte in the VRF word?
-            if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * 8)) begin : is_vrf_byte
+            if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * DataWidthB)) begin : is_vrf_byte
               // At which lane, and what is the byte offset in that lane, of the byte vrf_byte?
-              automatic int vrf_lane   = vrf_byte >> 3;
-              automatic int vrf_offset = vrf_byte[2:0];
+              automatic int unsigned vrf_offset = vrf_byte[2:0];
+              // Consider also vstart and make sure this index wraps around the number of lanes
+              automatic int unsigned vrf_lane = (vrf_byte >> 3);
+              // Adjust lane selection w.r.t. vstart
+              vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0];
+              if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust
+                vrf_lane -= NrLanes;
+              end : vstart_lane_adjust
+
 
               // Copy data and byte strobe
               result_queue_d[result_queue_write_pnt_q][vrf_lane].wdata[8*vrf_offset +: 8] =
@@ -290,24 +299,37 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
           end : is_axi_r_byte
         end : axi_r_to_result_queue
 
-        // Initialize id and addr fields of the result queue requests
-        for (int lane = 0; lane < NrLanes; lane++) begin
+        for (int unsigned lane = 0; lane < NrLanes; lane++) begin : compute_vrf_addr
+          automatic vlen_t issue_cnt_elems;
+          // elements per lane (each lane processes num elements / NrLanes)
+          automatic vlen_t elem_left_per_lane;
+          // 64-bit aligned address
+          automatic vlen_t lane_word_offset;
+          // How many elements in the vector body
+          automatic vlen_t elem_body_count;
+          // vstart value local to the lane
+          automatic vlen_t vstart_lane;        
+          
+          // Compute VRF chunk address per lane
+          elem_body_count    = vinsn_issue_q.vl - vinsn_issue_q.vstart;
+          issue_cnt_elems    = issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew);
+          elem_left_per_lane = ( elem_body_count - issue_cnt_elems ) / NrLanes;
+          lane_word_offset   = elem_left_per_lane >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew));
+          
+          vstart_lane = vinsn_issue_q.vstart / NrLanes;
+          // If lane_id < (vstart % NrLanes), this lane needs to execute one micro-operation less.
+          if ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) begin : vstart_lane_adjust
+            vstart_lane += 1;
+          end : vstart_lane_adjust
+
+          // Store in result queue
+          result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + lane_word_offset + vstart_lane;  
           result_queue_d[result_queue_write_pnt_q][lane].id   = vinsn_issue_q.id;
-          result_queue_d[result_queue_write_pnt_q][lane].addr = 
-            vaddr(vinsn_issue_q.vd, NrLanes) +                                   // base address of vd
-            ( 
-              (  
-                ( 
-                  (vinsn_issue_q.vl) -                    // total number of elements to be processed
-                  (issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew))     // elements left (issue_cnt_bytes_q is in bytes, so we shift rx by EEW)
-                ) / NrLanes                                                      // elements per lane (each lane processes num elements / NrLanes)
-              ) >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))       // 64-bit aligned address
-            );                                                                   // final offset to vd
-        end
+        end : compute_vrf_addr
       end : operands_valid
 
       // We have a word ready to be sent to the lanes
-      if (vrf_word_byte_pnt_d == NrLanes*8 || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin
+      if (vrf_word_byte_pnt_d == (NrLanes * DataWidthB) || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin : vrf_word_ready
         // Increment result queue pointers and counters
         result_queue_cnt_d += 1;
         if (result_queue_write_pnt_q == ResultQueueDepth-1) begin : result_queue_write_pnt_overflow
@@ -318,6 +340,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         end : result_queue_write_pnt_increment
 
         // Trigger the request signal
+        // TODO: check if triggering all lanes is actually necessary here
         result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}};
 
         // Acknowledge the mask operands
@@ -326,11 +349,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         // Reset the pointer in the VRF word
         vrf_word_byte_pnt_d   = '0;
         // Account for the results that were issued
-        issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q)
-        if (issue_cnt_bytes_q < (NrLanes * 8)) begin : issue_cnt_bytes_overflow
+        issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB);
+        if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow
           issue_cnt_bytes_d = '0;
         end : issue_cnt_bytes_overflow
-      end
+      end : vrf_word_ready
 
       // Consumed all valid bytes in this R beat
       if ( ( axi_r_byte_pnt_d == ( upper_byte - lower_byte + 1 ) ) || ( issue_cnt_bytes_d == '0 ) ) begin : axi_r_beat_finish
@@ -363,17 +386,19 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
         // Prepare for the next vector instruction
         if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update
-          issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
+          issue_cnt_bytes_d = (
+                                vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl 
+                                - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart
                               ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew);
         end : issue_cnt_bytes_update
       end : vrf_results_finish
-    end
+    end : axi_r_beat_read
 
     //////////////////////////////////
     //  Write results into the VRF  //
     //////////////////////////////////
 
-    for (int lane = 0; lane < NrLanes; lane++) begin: vrf_result_write
+    for (int unsigned lane = 0; lane < NrLanes; lane++) begin: vrf_result_write
       ldu_result_req_o[lane]   = result_queue_valid_q[result_queue_read_pnt_q][lane];
       ldu_result_addr_o[lane]  = result_queue_q[result_queue_read_pnt_q][lane].addr;
       ldu_result_id_o[lane]    = result_queue_q[result_queue_read_pnt_q][lane].id;
@@ -396,7 +421,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     // All lanes accepted the VRF request
     // Wait for all the final grants, to be sure that all the results were written back
     if (!(|result_queue_valid_d[result_queue_read_pnt_q]) &&
-      (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * 8))) begin
+      (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * DataWidthB))) begin : wait_for_write_back
       // There is something waiting to be written
       if (!result_queue_empty) begin : result_available
         // Increment the read pointer
@@ -411,12 +436,12 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         result_queue_cnt_d -= 1;
 
         // Decrement the counter of remaining vector elements waiting to be written
-        commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * 8); // TODO: add vstart here (use issue/commit_cnt_bytes_q)
-        if (commit_cnt_bytes_q < (NrLanes * 8)) begin : commit_cnt_bytes_overflow
+        commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * DataWidthB);
+        if (commit_cnt_bytes_q < (NrLanes * DataWidthB)) begin : commit_cnt_bytes_overflow
           commit_cnt_bytes_d = '0;
         end : commit_cnt_bytes_overflow
       end : result_available
-    end
+    end : wait_for_write_back
 
     // Finished committing the results of a vector instruction
     if (vinsn_commit_valid && commit_cnt_bytes_d == '0) begin : vinsn_done
@@ -435,11 +460,13 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
       // Update the commit counter for the next instruction
       if (vinsn_queue_d.commit_cnt != '0)
-        commit_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl
-                        ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew);
+        commit_cnt_bytes_d = ( // bytes left to commit = (vl - vstart) << vsew of the next commit instruction
+                               vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl
+                                - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart // must index commit_pnt (not issue_pnt), like vl and vsew
+                              ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew);
     end : vinsn_done
 
-    // Ack back exceptions
+    // Clear instruction queue in case of exceptions from addrgen
     if ( addrgen_exception_valid_i ) begin : exception
       // Signal done to sequencer
       pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1;
@@ -451,45 +478,44 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     //////////////////////////////
 
     if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] &&
-      pe_req_i.vfu == VFU_LoadUnit) begin
+      pe_req_i.vfu == VFU_LoadUnit) begin : pe_req_valid
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i;
       vinsn_running_d[pe_req_i.id]                  = 1'b1;
 
       // Initialize counters
-      // TODO(bug fix): add masking logic (stores are not idempotent!)
       if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init
-        issue_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew);
+        issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew);
       end : issue_cnt_bytes_init
       if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init
-        commit_cnt_bytes_d = (pe_req_i.vl) << unsigned'(pe_req_i.vtype.vsew);
+        commit_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew);
       end : commit_cnt_bytes_init
 
       // Bump pointers and counters of the vector instruction queue
       vinsn_queue_d.accept_pnt += 1;
       vinsn_queue_d.issue_cnt += 1;
       vinsn_queue_d.commit_cnt += 1;
-    end
+    end : pe_req_valid
   end: p_vldu
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      vinsn_running_q    <= '0;
-      issue_cnt_bytes_q        <= '0;
-      commit_cnt_bytes_q       <= '0;
-      axi_len_q              <= '0;
-      axi_r_byte_pnt_q            <= '0;
-      vrf_word_byte_pnt_q          <= '0;
-      pe_resp_o          <= '0;
-      result_final_gnt_q <= '0;
+      vinsn_running_q     <= '0;
+      issue_cnt_bytes_q   <= '0;
+      commit_cnt_bytes_q  <= '0;
+      axi_len_q           <= '0;
+      axi_r_byte_pnt_q    <= '0;
+      vrf_word_byte_pnt_q <= '0;
+      pe_resp_o           <= '0;
+      result_final_gnt_q  <= '0;
     end else begin
-      vinsn_running_q    <= vinsn_running_d;
-      issue_cnt_bytes_q        <= issue_cnt_bytes_d;
-      commit_cnt_bytes_q       <= commit_cnt_bytes_d;
-      axi_len_q              <= axi_len_d;
-      axi_r_byte_pnt_q            <= axi_r_byte_pnt_d;
-      vrf_word_byte_pnt_q          <= vrf_word_byte_pnt_d;
-      pe_resp_o          <= pe_resp_d;
-      result_final_gnt_q <= result_final_gnt_d;
+      vinsn_running_q     <= vinsn_running_d;
+      issue_cnt_bytes_q   <= issue_cnt_bytes_d;
+      commit_cnt_bytes_q  <= commit_cnt_bytes_d;
+      axi_len_q           <= axi_len_d;
+      axi_r_byte_pnt_q    <= axi_r_byte_pnt_d;
+      vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d;
+      pe_resp_o           <= pe_resp_d;
+      result_final_gnt_q  <= result_final_gnt_d;
     end
   end
 
diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv
index 9580f59b0..f6e5e38ca 100644
--- a/hardware/src/vlsu/vstu.sv
+++ b/hardware/src/vlsu/vstu.sv
@@ -46,6 +46,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
     output pe_resp_t                       pe_resp_o,
     // Interface with the address generator
     input  addrgen_axi_req_t               axi_addrgen_req_i,
+    input  logic                           addrgen_exception_valid_i,
     input  logic                           axi_addrgen_req_valid_i,
     output logic                           axi_addrgen_req_ready_o,
     // Interface with the lanes
@@ -63,12 +64,14 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
   import axi_pkg::beat_upper_byte;
   import axi_pkg::BURST_INCR;
 
+  localparam unsigned DataWidthB = DataWidth / 8;
+
   ///////////////////////
   //  Spill registers  //
   ///////////////////////
 
   elen_t [NrLanes-1:0] stu_operand;
-  logic  [NrLanes-1:0] stu_operand_valid;
+  logic  [NrLanes-1:0] stu_operand_valid_lanes;
   logic                stu_operand_ready;
 
   for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_regs
@@ -83,7 +86,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
       .valid_i   (stu_operand_valid_i[lane]),
       .ready_o   (stu_operand_ready_o[lane]),
       .data_o    (stu_operand[lane]        ),
-      .valid_o   (stu_operand_valid[lane]  ),
+      .valid_o   (stu_operand_valid_lanes[lane]  ),
       .ready_i   (stu_operand_ready        )
     );
   end: gen_regs
@@ -153,30 +156,47 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
   //  Store Unit  //
   //////////////////
 
+  // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables
+  int unsigned vrf_seq_byte;
+  int unsigned vrf_byte ;
+  vlen_t vrf_valid_bytes ;
+  vlen_t vinsn_valid_bytes;
+  vlen_t axi_valid_bytes   ;
+  logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes;      
+
+
   // Vector instructions currently running
   logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q;
 
   // Interface with the main sequencer
-  pe_resp_t pe_resp;
+  pe_resp_t pe_resp_d;
 
   // Remaining bytes of the current instruction in the issue phase
-  vlen_t issue_cnt_d, issue_cnt_q;
+  vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q;
 
   // Pointers
   //
   // We need several pointers to copy data to the memory interface
   // from the VRF. Namely, we need:
   // - A counter of how many beats are left in the current AXI burst
-  axi_pkg::len_t len_d, len_q;
+  axi_pkg::len_t axi_len_d, axi_len_q;
   // - A pointer to which byte in the full VRF word we are reading data from.
   logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q;
 
   always_comb begin: p_vstu
+    // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables
+    vrf_seq_byte = '0;
+    vrf_byte  = '0;
+    vrf_valid_bytes  = '0;
+    vinsn_valid_bytes = '0;
+    axi_valid_bytes    = '0;
+    valid_bytes = '0;
+
     // Maintain state
     vinsn_queue_d = vinsn_queue_q;
-    issue_cnt_d   = issue_cnt_q;
+    issue_cnt_bytes_d   = issue_cnt_bytes_q;
 
-    len_d     = len_q;
+    axi_len_d     = axi_len_q;
     vrf_pnt_d = vrf_pnt_q;
 
     // Vector instructions currently running
@@ -184,7 +204,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
 
     // We are not ready, by default
     axi_addrgen_req_ready_o = 1'b0;
-    pe_resp                 = '0;
+    pe_resp_d               = '0;
     axi_w_o                 = '0;
     axi_w_valid_o           = 1'b0;
     axi_b_ready_o           = 1'b0;
@@ -204,92 +224,130 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
     // - We received all the operands from the lanes
     // - The address generator generated an AXI AW request for this write beat
     // - The AXI subsystem is ready to accept this W beat
-    if (vinsn_issue_valid && &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i)) &&
-        axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin
+    if (vinsn_issue_valid &&
+        axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin : issue_valid
       // Bytes valid in the current W beat
       automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr,
-        axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q);
+        axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q);
       automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr,
-        axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q);
+        axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q);
+
+      // For non-zero vstart values, the last operand read is not going to involve all the lanes
+      automatic logic [NrLanes-1:0] stu_operand_valid;
+      automatic logic [NrLanes-1:0] mask_valid;
 
+      // How many bytes are we committing?
+      // automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes;      
+      
       // Account for the issued bytes
       // How many bytes are valid in this VRF word
-      automatic vlen_t vrf_valid_bytes   = NrLanes * 8 - vrf_pnt_q;
+      vrf_valid_bytes   = (NrLanes * DataWidthB) - vrf_pnt_q;
       // How many bytes are valid in this instruction
-      automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q;
+      vinsn_valid_bytes = issue_cnt_bytes_q - vrf_pnt_q;
       // How many bytes are valid in this AXI word
-      automatic vlen_t axi_valid_bytes   = upper_byte - lower_byte + 1;
-
-      // How many bytes are we committing?
-      automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes;
-      valid_bytes = issue_cnt_q < NrLanes * 8     ? vinsn_valid_bytes : vrf_valid_bytes;
-      valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes       : axi_valid_bytes;
-
-      vrf_pnt_d = vrf_pnt_q + valid_bytes;
-
-      // Copy data from the operands into the W channel
-      for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin
-        // Is this byte a valid byte in the W beat?
-        if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin
-          // Map axy_byte to the corresponding byte in the VRF word (sequential)
-          automatic int vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q;
-          // And then shuffle it
-          automatic int vrf_byte     = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1);
-
-          // Is this byte a valid byte in the VRF word?
-          if (vrf_seq_byte < issue_cnt_q) begin
-            // At which lane, and what is the byte offset in that lane, of the byte vrf_byte?
-            automatic int vrf_lane   = vrf_byte >> 3;
-            automatic int vrf_offset = vrf_byte[2:0];
-
-            // Copy data
-            axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8];
-            axi_w_o.strb[axi_byte]        = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset];
+      axi_valid_bytes   = upper_byte - lower_byte + 1;
+
+      valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes;
+      valid_bytes = ( valid_bytes < axi_valid_bytes              ) ? valid_bytes       : axi_valid_bytes;
+
+      // Adjust valid signals for the next block "operands_ready"
+      stu_operand_valid = stu_operand_valid_lanes;
+      for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid
+        // - We are left with fewer bytes than the maximum to issue,
+        //    this means that at least one lane is not going to push us any operand anymore
+        // - For the lanes whose index is lower than vstart[idx_width(NrLanes)-1:0]
+        if ( ( issue_cnt_bytes_q < (NrLanes * DataWidthB) )
+              & ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] )
+              ) begin : vstart_lane_adjust
+          stu_operand_valid[lane] |= 1'b1;
+        end : vstart_lane_adjust
+      end : adjust_operand_valid
+    
+      // TODO: apply the same vstart logic also to mask_valid_i
+      // For now, assume (vstart % NrLanes == 0)
+      mask_valid = mask_valid_i;
+
+      // Wait for all expected operands from the lanes
+      if ( &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i) ) ) begin : operands_ready
+        vrf_pnt_d = vrf_pnt_q + valid_bytes;
+
+        // Copy data from the operands into the W channel
+        for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : stu_operand_to_axi_w
+          // Is this byte a valid byte in the W beat?
+          if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin
+            // Map axi_byte to the corresponding byte in the VRF word (sequential)
+            vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q;
+            // And then shuffle it
+            vrf_byte     = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1);
+
+            // Is this byte a valid byte in the VRF word?
+            if (vrf_seq_byte < issue_cnt_bytes_q) begin
+              // At which lane, and what is the byte offset in that lane, of the byte vrf_byte?
+              automatic int unsigned vrf_offset = vrf_byte[2:0];
+
+              // Consider also vstart and make sure this index wraps around the number of lanes
+              // automatic logic [$clog2(NrLanes)-1:0] vrf_lane = (vrf_byte >> 3) + vinsn_issue_q.vstart[idx_width(NrLanes)-1:0];
+              automatic int unsigned vrf_lane = (vrf_byte >> 3);
+              // Adjust lane selection w.r.t. vstart
+              vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0];
+              if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust
+                vrf_lane -= NrLanes;
+              end : vstart_lane_adjust
+
+              // Copy data
+              axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8];
+              axi_w_o.strb[axi_byte]        = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset];
+            end
           end
-        end
-      end
-
-      // Send the W beat
-      axi_w_valid_o = 1'b1;
-      // Account for the beat we sent
-      len_d         = len_q + 1;
-      // We wrote all the beats for this AW burst
-      if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin
-        axi_w_o.last            = 1'b1;
-        // Ask for another burst by the address generator
-        axi_addrgen_req_ready_o = 1'b1;
-        // Reset AXI pointers
-        len_d                   = '0;
-      end
-
-      // We consumed a whole word from the lanes
-      if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin
-        // Reset the pointer in the VRF word
-        vrf_pnt_d         = '0;
-        // Acknowledge the operands with the lanes
-        stu_operand_ready = '1;
-        // Acknowledge the mask operand
-        mask_ready_o      = !vinsn_issue_q.vm;
-        // Account for the results that were issued
-        issue_cnt_d       = issue_cnt_q - NrLanes * 8;
-        if (issue_cnt_q < NrLanes * 8)
-          issue_cnt_d = '0;
-      end
-    end
+        end : stu_operand_to_axi_w
+
+        // Send the W beat
+        axi_w_valid_o = 1'b1;
+        // Account for the beat we sent
+        axi_len_d     = axi_len_q + 1;
+        // We wrote all the beats for this AW burst
+        if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : beats_complete
+          axi_w_o.last            = 1'b1;
+          // Ask for another burst by the address generator
+          axi_addrgen_req_ready_o = 1'b1;
+          // Reset AXI pointers
+          axi_len_d                   = '0;
+        end : beats_complete
+
+        // We consumed a whole word from the lanes
+        if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_bytes_q) begin : vrf_word_done
+          // Reset the pointer in the VRF word
+          vrf_pnt_d         = '0;
+          // Acknowledge the operands with the lanes
+          stu_operand_ready = '1;
+          // Acknowledge the mask operand
+          mask_ready_o      = !vinsn_issue_q.vm;
+          // Account for the results that were issued
+          issue_cnt_bytes_d       = issue_cnt_bytes_q - (NrLanes * DataWidthB);
+          if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow
+            issue_cnt_bytes_d = '0;
+          end : issue_cnt_bytes_overflow
+        end : vrf_word_done
+      end : operands_ready
+    end : issue_valid
 
     // Finished issuing W beats for this vector store
-    if (vinsn_issue_valid && issue_cnt_d == 0) begin
+    if (vinsn_issue_valid && issue_cnt_bytes_d == 0) begin : axi_w_beat_finish
       // Bump issue counters and pointers of the vector instruction queue
       vinsn_queue_d.issue_cnt -= 1;
-      if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1)
+      if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) begin : issue_pnt_overflow
         vinsn_queue_d.issue_pnt = 0;
-      else
+      end : issue_pnt_overflow
+      else begin : issue_pnt_increment
         vinsn_queue_d.issue_pnt += 1;
+      end : issue_pnt_increment
 
-      if (vinsn_queue_d.issue_cnt != 0)
-        issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl <<
-          int'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew);
-    end
+      if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update
+        issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - 
+                        vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart
+                      ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew);
+      end : issue_cnt_bytes_update
+    end : axi_w_beat_finish
 
     ////////////////////////////
     //  Handle the B channel  //
@@ -297,63 +355,66 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
 
     // TODO: We cannot handle errors on the B channel.
     // We just acknowledge any AXI requests that come on the B channel.
-    if (axi_b_valid_i) begin
+    if (axi_b_valid_i) begin : axi_b_valid
       // Acknowledge the B beat
       axi_b_ready_o = 1'b1;
 
       // Mark the vector instruction as being done
-      if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin
+      if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin : instr_done
         // Signal complete store
         store_complete_o = 1'b1;
 
-        pe_resp.vinsn_done[vinsn_commit.id] = 1'b1;
+        pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1;
 
         // Update the commit counters and pointers
         vinsn_queue_d.commit_cnt -= 1;
-        if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1)
+        if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow
           vinsn_queue_d.commit_pnt = '0;
-        else
+        end : commit_pnt_overflow
+        else begin : commit_pnt_increment
           vinsn_queue_d.commit_pnt += 1;
-      end
-    end
+        end : commit_pnt_increment
+      end : instr_done
+    end : axi_b_valid
 
     //////////////////////////////
     //  Accept new instruction  //
     //////////////////////////////
 
     if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] &&
-      pe_req_i.vfu == VFU_StoreUnit) begin
+      pe_req_i.vfu == VFU_StoreUnit) begin : issue_cnt_bytes_init
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i;
       vinsn_running_d[pe_req_i.id]                  = 1'b1;
 
       // Initialize counters
-      if (vinsn_queue_d.issue_cnt == '0)
-        issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew);
+      if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init
+        issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew);
+      end : issue_cnt_bytes_init
 
       // Bump pointers and counters of the vector instruction queue
       vinsn_queue_d.accept_pnt += 1;
       vinsn_queue_d.issue_cnt += 1;
       vinsn_queue_d.commit_cnt += 1;
-    end
+    end : issue_cnt_bytes_init
   end: p_vstu
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
       vinsn_running_q <= '0;
-      issue_cnt_q     <= '0;
+      issue_cnt_bytes_q     <= '0;
 
-      len_q     <= '0;
+      axi_len_q     <= '0;
       vrf_pnt_q <= '0;
 
       pe_resp_o <= '0;
     end else begin
       vinsn_running_q <= vinsn_running_d;
-      issue_cnt_q     <= issue_cnt_d;
+      issue_cnt_bytes_q     <= issue_cnt_bytes_d;
 
-      len_q     <= len_d;
+      axi_len_q     <= axi_len_d;
       vrf_pnt_q <= vrf_pnt_d;
 
-      pe_resp_o <= pe_resp;
+      pe_resp_o <= pe_resp_d;
     end
   end