From a6d240b1e9bf81292e4199398e5249647aa36569 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Tue, 24 Dec 2024 15:03:54 +0800 Subject: [PATCH] [rtl] reorder read for other unit in lsu. --- t1/src/T1.scala | 1 + t1/src/lsu/LSU.scala | 45 ++++++++++++++++++++++++------- t1/src/lsu/SimpleAccessUnit.scala | 8 ++++-- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index af0cbf3cf..fe93a1999 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -327,6 +327,7 @@ case class T1Parameter( transferSize = lsuTransposeSize, vrfReadLatency = vrfReadLatency, axi4BundleParameter = axi4BundleParameter, + lsuReadShifterSize = lsuReadShifterSize, name = "main" ) def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, vrfRamType) diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index cd29f90be..e3d3b1d2a 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -37,6 +37,7 @@ case class LSUParameter( // TODO: refactor to per lane parameter. 
vrfReadLatency: Int, axi4BundleParameter: AXI4BundleParameter, + lsuReadShifterSize: Seq[Int], name: String) { val sewMin: Int = 8 @@ -61,7 +62,16 @@ case class LSUParameter( val sourceQueueSize: Int = 32.min(vLen * 8 / (transferSize * 8)) def mshrParam: MSHRParam = - MSHRParam(chainingSize, datapathWidth, vLen, laneNumber, paWidth, transferSize, vrfReadLatency) + MSHRParam( + chainingSize, + datapathWidth, + vLen, + laneNumber, + paWidth, + transferSize, + lsuReadShifterSize.head, + vrfReadLatency + ) /** see [[VRFParam.regNumBits]] */ val regNumBits: Int = log2Ceil(32) @@ -245,7 +255,6 @@ class LSU(param: LSUParameter) extends Module { /** TileLink D Channel write to VRF queue: TL-D -CrossBar-> MSHR -proxy-> write queue -CrossBar-> VRF */ - @public val writeQueueVec: Seq[QueueIO[LSUWriteQueueBundle]] = Seq.fill(param.laneNumber)( Queue.io(new LSUWriteQueueBundle(param), param.toVRFWriteQueueSize, flow = true) ) @@ -253,20 +262,38 @@ class LSU(param: LSUParameter) extends Module { @public val lsuProbe = IO(Output(Probe(new LSUProbe(param), layers.Verification))) + // todo: require every entry of lsuReadShifterSize to equal its head, since only the head is used here + val readLatency: Int = param.vrfReadLatency + param.lsuReadShifterSize.head * 2 + val otherUnitTargetQueue: QueueIO[UInt] = Queue.io(UInt(param.laneNumber.W), 2 * readLatency, pipe = true) + val otherUnitDataQueueVec: Seq[QueueIO[UInt]] = Seq.fill(param.laneNumber)( + Queue.io(UInt(param.datapathWidth.W), readLatency, flow = true) + ) + val dataDeqFire: UInt = Wire(UInt(param.laneNumber.W)) // read vrf - val otherTryReadVrf: UInt = Mux(otherUnit.vrfReadDataPorts.valid, otherUnit.status.targetLane, 0.U) + val otherTryReadVrf: UInt = Mux(otherUnit.vrfReadDataPorts.valid, otherUnit.status.targetLane, 0.U) vrfReadDataPorts.zipWithIndex.foreach { case (read, index) => read.valid := otherTryReadVrf(index) || storeUnit.vrfReadDataPorts(index).valid read.bits := Mux(otherTryReadVrf(index), otherUnit.vrfReadDataPorts.bits, storeUnit.vrfReadDataPorts(index).bits) 
storeUnit.vrfReadDataPorts(index).ready := read.ready && !otherTryReadVrf(index) storeUnit.vrfReadResults(index) := vrfReadResults(index) + storeUnit.vrfReadResults(index).valid := vrfReadResults(index).valid && otherUnitTargetQueue.empty + + val otherUnitQueue: QueueIO[UInt] = otherUnitDataQueueVec(index) + otherUnitQueue.enq.valid := vrfReadResults(index).valid && !otherUnitTargetQueue.empty + otherUnitQueue.enq.bits := vrfReadResults(index).bits + otherUnitQueue.deq.ready := dataDeqFire(index) } - otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR - val pipeOtherRead: ValidIO[UInt] = - Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency) - // todo: read data reorder - otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits)) - otherUnit.vrfReadResults.valid := pipeOtherRead.valid + otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR && + otherUnitTargetQueue.enq.ready + otherUnitTargetQueue.enq.bits := otherUnit.status.targetLane + otherUnitTargetQueue.enq.valid := otherUnit.vrfReadDataPorts.fire + + // read data reorder + otherUnit.vrfReadResults.bits := Mux1H(otherUnitTargetQueue.deq.bits, otherUnitDataQueueVec.map(_.deq.bits)) + otherUnit.vrfReadResults.valid := otherUnitTargetQueue.deq.valid && + (otherUnitTargetQueue.deq.bits & VecInit(otherUnitDataQueueVec.map(_.deq.valid)).asUInt).orR + dataDeqFire := maskAnd(otherUnit.vrfReadResults.valid, otherUnitTargetQueue.deq.bits) + otherUnitTargetQueue.deq.ready := otherUnit.vrfReadResults.valid // write vrf val otherTryToWrite: UInt = Mux(otherUnit.vrfWritePort.valid, otherUnit.status.targetLane, 0.U) diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala index bc517d5eb..7ebd0b671 100644 --- a/t1/src/lsu/SimpleAccessUnit.scala +++ b/t1/src/lsu/SimpleAccessUnit.scala @@ -50,6 +50,7 @@ case class MSHRParam( 
laneNumber: Int, paWidth: Int, lsuTransposeSize: Int, + lsuReadShifter: Int, vrfReadLatency: Int) { /** see [[LaneParameter.lmulMax]] */ @@ -124,6 +125,9 @@ case class MSHRParam( // outstanding of StoreUnit.vrfReadDataPorts // todo: param from T1Param val storeUnitReadOutStanding: Int = 8 + + // One round trip is required + val lsuReadShifterLatency: Int = 2 * lsuReadShifter } /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it @@ -705,9 +709,9 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { // Reading vrf may take multiple cycles and requires additional information to be stored val s1EnqQueue: QueueIO[SimpleAccessStage1] = - Queue.io(new SimpleAccessStage1(param), param.vrfReadLatency + 2) + Queue.io(new SimpleAccessStage1(param), param.vrfReadLatency + param.lsuReadShifterLatency + 2) val s1EnqDataQueue: QueueIO[UInt] = - Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency + 2) + Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency + param.lsuReadShifterLatency + 2) /** which byte to access in VRF, e.g. VLEN=1024,datapath=32,laneNumber=8 XXXXXXXXXX <- 10 bits for element(32bits) * index XX <- 2 bits for SEW XXXXXXXXXX <- strip MSB for the constraint that sew*vlmax <= 8*VLEN <-