From 80befd42a26307f96592d2fca9196462f4e77b5d Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 5 Dec 2024 15:19:56 +0800 Subject: [PATCH 01/41] [rtl] merge csrInterface to laneRequest. --- t1/src/Bundles.scala | 6 ++---- t1/src/Lane.scala | 14 ++++---------- t1/src/T1.scala | 9 +++++---- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index 59b317b5b..492fc195d 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -138,6 +138,8 @@ class LaneRequest(param: LaneParameter) extends Bundle { /** data of rs1 */ val readFromScalar: UInt = UInt(param.datapathWidth.W) + val csrInterface: CSRInterface = new CSRInterface(param.vlMaxBits) + // vmacc 的vd需要跨lane读 TODO: move to [[V]] def ma: Bool = decodeResult(Decoder.multiplier) && decodeResult(Decoder.uop)(1, 0).xorR && !decodeResult(Decoder.vwmacc) @@ -220,10 +222,6 @@ class InstructionControlRecord(param: LaneParameter) extends Bundle { /** Store request from [[T1]]. */ val laneRequest: LaneRequest = new LaneRequest(param) - /** csr follows the instruction. TODO: move to [[laneRequest]] - */ - val csr: CSRInterface = new CSRInterface(param.vlMaxBits) - /** which group is the last group for instruction. */ val lastGroupForInstruction: UInt = UInt(param.groupNumberBits.W) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index ec52d2bb7..5864f41e6 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -233,11 +233,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val laneRequest: DecoupledIO[LaneRequest] = IO(Flipped(Decoupled(new LaneRequest(parameter)))) - /** CSR Interface. 
TODO: merge to [[laneRequest]] - */ - @public - val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits))) - @public val maskUnitRequest: ValidIO[MaskUnitExeReq] = IO(Valid(new MaskUnitExeReq(parameter))) @@ -324,6 +319,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // TODO: remove dontTouch(writeBusPort) + val csrInterface: CSRInterface = laneRequest.bits.csrInterface /** VRF instantces. */ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) @@ -555,7 +551,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ record.laneRequest.decodeResult(Decoder.maskLogic) /** onehot value of SEW. */ - val vSew1H: UInt = UIntToOH(record.csr.vSew)(2, 0) + val vSew1H: UInt = UIntToOH(record.laneRequest.csrInterface.vSew)(2, 0) /** if asserted, the element won't be executed. adc: vm = 0; madc: vm = 0 -> s0 + s1 + c, vm = 1 -> s0 + s1 */ @@ -611,7 +607,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ laneState.lastGroupForInstruction := record.lastGroupForInstruction laneState.isLastLaneForInstruction := record.isLastLaneForInstruction laneState.instructionFinished := record.instructionFinished - laneState.csr := record.csr + laneState.csr := record.laneRequest.csrInterface laneState.maskType := record.laneRequest.mask laneState.maskNotMaskedElement := !record.laneRequest.mask || record.laneRequest.decodeResult(Decoder.maskSource) || @@ -943,7 +939,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) maskSelectSew := Mux1H( maskSelectArbitrator, - csrInterface.vSew +: slotControl.map(_.csr.vSew) + csrInterface.vSew +: slotControl.map(_.laneRequest.csrInterface.vSew) ) } @@ -954,8 +950,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskLogicCompleted: Bool = laneRequest.bits.decodeResult(Decoder.maskLogic) && (laneIndex ## 0.U(parameter.datapathWidthBits.W) >= 
csrInterface.vl) - // latch CSR from V - entranceControl.csr := csrInterface entranceControl.laneRequest := laneRequest.bits // TODO: in scalar core, raise illegal instruction exception when vstart is nonzero. diff --git a/t1/src/T1.scala b/t1/src/T1.scala index f739c885c..aa0ec607b 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -724,11 +724,12 @@ class T1(val parameter: T1Parameter) lane.laneRequest.bits.mask := maskType laneReady(index) := lane.laneRequest.ready - lane.csrInterface := requestRegCSR + // connect csrInterface + lane.laneRequest.bits.csrInterface := requestRegCSR // index type EEW Decoded in the instruction - lane.csrInterface.vSew := vSewSelect - lane.csrInterface.vl := evlForLane - lane.laneIndex := index.U + lane.laneRequest.bits.csrInterface.vSew := vSewSelect + lane.laneRequest.bits.csrInterface.vl := evlForLane + lane.laneIndex := index.U // lsu 优先会有死锁: // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 From 5038cb5e49548c56e508c15a18397ceb3cf23ba9 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 5 Dec 2024 16:19:54 +0800 Subject: [PATCH 02/41] [rtl] connect laneRequest with shifter. --- t1/src/T1.scala | 80 ++++++++++++++++++++++++++------------------ t1/src/package.scala | 30 +++++++++++++++++ 2 files changed, 78 insertions(+), 32 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index aa0ec607b..e72f91eb4 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -252,6 +252,9 @@ case class T1Parameter( // and the values are their respective delays. val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) + val laneRequestTokenSize: Int = 4 + val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. */ @@ -624,9 +627,21 @@ class T1(val parameter: T1Parameter) control } - /** lane is ready to receive new instruction. 
*/ - val laneReady: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val allLaneReady: Bool = laneReady.asUInt.andR + // Close to top + val laneRequestSourceWire: Vec[DecoupledIO[LaneRequest]] = Wire( + Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam))) + ) + // Close to lane + val laneRequestSinkWire: Vec[DecoupledIO[LaneRequest]] = Wire( + Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam))) + ) + + laneRequestSourceWire.zipWithIndex.foreach { case (source, index) => + val sink = laneRequestSinkWire(index) + connectDecoupledWithShifter(parameter.laneRequestShifterSize(index), parameter.laneRequestTokenSize)(source, sink) + } + + val allLaneReady: Bool = VecInit(laneRequestSourceWire.map(_.ready)).asUInt.andR // TODO: review later // todo: 把scheduler的反馈也加上,lsu有更高的优先级 @@ -688,48 +703,50 @@ class T1(val parameter: T1Parameter) requestReg.bits.issue.vl ) - /** instantiate lanes. TODO: move instantiate to top of class. - */ - val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index => - val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam)) - // lane.laneRequest.valid -> requestRegDequeue.ready -> lane.laneRequest.ready -> lane.laneRequest.bits - // TODO: this is harmful for PnR design, since it broadcast ready singal to each lanes, which will significantly - // reduce the scalability for large number of lanes. 
- lane.laneRequest.valid := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction + laneRequestSourceWire.foreach { request => + request.valid := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction // hard wire - lane.laneRequest.bits.instructionIndex := requestReg.bits.instructionIndex - lane.laneRequest.bits.decodeResult := decodeResult - lane.laneRequest.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) - lane.laneRequest.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) - lane.laneRequest.bits.vd := requestRegDequeue.bits.instruction(11, 7) - lane.laneRequest.bits.segment := Mux( + request.bits.instructionIndex := requestReg.bits.instructionIndex + request.bits.decodeResult := decodeResult + request.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) + request.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) + request.bits.vd := requestRegDequeue.bits.instruction(11, 7) + request.bits.segment := Mux( decodeResult(Decoder.nr), requestRegDequeue.bits.instruction(17, 15), requestRegDequeue.bits.instruction(31, 29) ) - lane.laneRequest.bits.loadStoreEEW := requestRegDequeue.bits.instruction(13, 12) + request.bits.loadStoreEEW := requestRegDequeue.bits.instruction(13, 12) // if the instruction is vi and vx type of gather, gather from rs2 with mask VRF read channel from one lane, // and broadcast to all lanes. - lane.laneRequest.bits.readFromScalar := source1Select + request.bits.readFromScalar := source1Select - lane.laneRequest.bits.issueInst := requestRegDequeue.fire - lane.laneRequest.bits.loadStore := isLoadStoreType + request.bits.issueInst := requestRegDequeue.fire + request.bits.loadStore := isLoadStoreType // let record in VRF to know there is a store instruction. 
- lane.laneRequest.bits.store := isStoreType + request.bits.store := isStoreType // let lane know if this is a special instruction, which need group-level synchronization between lane and [[V]] - lane.laneRequest.bits.special := specialInstruction - lane.laneRequest.bits.lsWholeReg := lsWholeReg + request.bits.special := specialInstruction + request.bits.lsWholeReg := lsWholeReg // mask type instruction. - lane.laneRequest.bits.mask := maskType - laneReady(index) := lane.laneRequest.ready + request.bits.mask := maskType // connect csrInterface - lane.laneRequest.bits.csrInterface := requestRegCSR + request.bits.csrInterface := requestRegCSR // index type EEW Decoded in the instruction - lane.laneRequest.bits.csrInterface.vSew := vSewSelect - lane.laneRequest.bits.csrInterface.vl := evlForLane - lane.laneIndex := index.U + request.bits.csrInterface.vSew := vSewSelect + request.bits.csrInterface.vl := evlForLane + } + + /** instantiate lanes. TODO: move instantiate to top of class. + */ + val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index => + val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam)) + lane.laneRequest.valid := laneRequestSinkWire(index).valid && lane.vrfAllocateIssue + lane.laneRequest.bits := laneRequestSinkWire(index).bits + laneRequestSinkWire(index).ready := lane.laneRequest.ready && lane.vrfAllocateIssue + lane.laneIndex := index.U // lsu 优先会有死锁: // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 @@ -899,7 +916,6 @@ class T1(val parameter: T1Parameter) /** for lsu instruction lsu is ready, for normal instructions, lanes are ready. */ val executionReady: Bool = (!isLoadStoreType || lsu.request.ready) && (noOffsetReadLoadStore || allLaneReady) - val vrfAllocate: Bool = VecInit(laneVec.map(_.vrfAllocateIssue)).asUInt.andR // - ready to issue instruction // - for vi and vx type of gather, it need to access vs2 for one time, we read vs2 firstly in `gatherReadFinish` // and convert it to mv instruction. 
@@ -908,7 +924,7 @@ class T1(val parameter: T1Parameter) // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || maskUnit.io.gatherData.valid) && - tokenManager.issueAllow && instructionIndexFree && vrfAllocate + tokenManager.issueAllow && instructionIndexFree instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) diff --git a/t1/src/package.scala b/t1/src/package.scala index 13bde15a9..4bb5835da 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -7,6 +7,7 @@ import chisel3._ import chisel3.experimental.hierarchy.{Instance, Instantiate} import chisel3.util._ import chisel3.util.experimental.decode.DecodeBundle +import org.chipsalliance.dwbb.stdlib.queue.Queue import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator} import org.chipsalliance.t1.rtl.lane.Distributor @@ -221,6 +222,35 @@ package object rtl { id.map(f => (shifterReg :+ source).map(p => Mux(p.valid, indexToOH(f(p.bits), 4), 0.U)).reduce(_ | _)) } + def connectDecoupledWithShifter[T <: Data](latency: Int, tokenSize: Int)(source: DecoupledIO[T], sink: DecoupledIO[T]) + : Unit = { + val queue = Queue.io(chiselTypeOf(source.bits), tokenSize, flow = true) + // Reverse pipe release + val releasePipe = Pipe( + sink.fire, + 0.U.asTypeOf(new EmptyBundle), + latency + ).valid + val tokenCheck: Bool = pipeToken(tokenSize)(source.fire, releasePipe) + source.ready := tokenCheck + + // Complete the handshake at the source end and convert the result of the handshake into a data stream + val validSource: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits))) + validSource.valid := source.fire + validSource.bits := source.bits + + val validSink: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits))) + + // Shift Data + connectWithShifter(latency)(validSource, validSink) + // Throw the moved data into the queue + // todo: 
assert(queue.enq.ready || !queue.enq.valid) + queue.enq.valid := validSink.valid + queue.enq.bits := validSink.bits + // Finally, send the data to the sink + sink <> queue.deq + } + def instantiateVFU( parameter: VFUInstantiateParameter )(requestVec: Vec[SlotRequestToVFU], From ce40876c4263ca3ef3805b720f7afb1fe0f20060 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Fri, 6 Dec 2024 16:43:47 +0800 Subject: [PATCH 03/41] [rtl] connect vrf read with shifter. --- t1/src/T1.scala | 38 +++++++++----------- t1/src/lsu/LSU.scala | 5 +-- t1/src/lsu/StoreUnit.scala | 12 +++---- t1/src/mask/BitLevelMaskWrite.scala | 8 ++--- t1/src/mask/MaskUnit.scala | 4 +-- t1/src/package.scala | 54 +++++++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 37 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index e72f91eb4..dfbfc1db5 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -255,6 +255,12 @@ case class T1Parameter( val laneRequestTokenSize: Int = 4 val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val maskUnitReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val maskUnitReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + + val lsuReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val lsuReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. 
*/ @@ -748,29 +754,17 @@ class T1(val parameter: T1Parameter) laneRequestSinkWire(index).ready := lane.laneRequest.ready && lane.vrfAllocateIssue lane.laneIndex := index.U - // lsu 优先会有死锁: - // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 - // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read - - // Mask priority will also be - // vse32.v v19, (a0) - // vfslide1down.vf v19, v10, x1 - val maskUnitFirst = RegInit(false.B) - val tryToRead = lsu.vrfReadDataPorts(index).valid || maskUnit.io.readChannel(index).valid - when(tryToRead && !lane.vrfReadAddressChannel.fire) { - maskUnitFirst := !maskUnitFirst - } - lane.vrfReadAddressChannel.valid := Mux( - maskUnitFirst, - maskUnit.io.readChannel(index).valid, - lsu.vrfReadDataPorts(index).valid + connectVrfAccess( + Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), + Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)), + Some(parameter.vrfReadLatency) + )( + VecInit(Seq(maskUnit.io.readChannel(index), lsu.vrfReadDataPorts(index))), + lane.vrfReadAddressChannel, + 0, + Some(lane.vrfReadDataChannel), + Some(Seq(maskUnit.io.readResult(index), lsu.vrfReadResults(index))) ) - lane.vrfReadAddressChannel.bits := - Mux(maskUnitFirst, maskUnit.io.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitFirst - maskUnit.io.readChannel(index).ready := lane.vrfReadAddressChannel.ready && maskUnitFirst - maskUnit.io.readResult(index) := lane.vrfReadDataChannel - lsu.vrfReadResults(index) := lane.vrfReadDataChannel val maskTryToWrite = maskUnit.io.exeResp(index) // lsu & mask unit write lane diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 7c93e09ed..b7bd81202 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -150,7 +150,7 @@ class LSU(param: LSUParameter) extends Module { /** hard wire form Top. 
TODO: merge to [[vrfReadDataPorts]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Vec(param.laneNumber, Flipped(Valid(UInt(param.datapathWidth.W))))) /** write channel to [[V]], which will redirect it to [[Lane.vrf]]. */ @public @@ -250,7 +250,8 @@ class LSU(param: LSUParameter) extends Module { otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR val pipeOtherRead: ValidIO[UInt] = Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency) - otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults) + // todo: read data reorder + otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits)) otherUnit.vrfReadResults.valid := pipeOtherRead.valid // write vrf diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index 1fb72096b..a4e125296 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -39,9 +39,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { /** hard wire form Top. 
see [[LSU.vrfReadResults]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Input(Vec(param.laneNumber, Valid(UInt(param.datapathWidth.W))))) @public - val vrfReadyToStore: Bool = IO(Input(Bool())) + val vrfReadyToStore: Bool = IO(Input(Bool())) @public val storeResponse = IO(Input(Bool())) @@ -73,6 +73,7 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { val readCount: UInt = RegInit(0.U(dataGroupBits.W)) val stageValid = RegInit(false.B) // queue for read latency + // todo: param.vrfReadLatency => param.vrfReadLatency + shifterLatency val queue: QueueIO[UInt] = Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency, flow = true) @@ -114,12 +115,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readPort.bits.offset := readCount readPort.bits.instructionIndex := lsuRequestReg.instructionIndex - // pipe read fire - val readResultFire = Pipe(readPort.fire, 0.U.asTypeOf(new EmptyBundle), param.vrfReadLatency).valid - // latency queue enq - queue.enq.valid := readResultFire - queue.enq.bits := vrfReadResults(laneIndex) + queue.enq.valid := vrfReadResults(laneIndex).valid + queue.enq.bits := vrfReadResults(laneIndex).bits AssertProperty(BoolSequence(!queue.enq.valid || queue.enq.ready)) vrfReadQueueVec(laneIndex).enq <> queue.deq stageValid || RegNext(readPort.fire) diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala index cefe31868..5bc73491e 100644 --- a/t1/src/mask/BitLevelMaskWrite.scala +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -43,8 +43,8 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { ) } - val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => - IO(Input(UInt(parameter.datapathWidth.W))) + val readResult: Seq[ValidIO[UInt]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(UInt(parameter.datapathWidth.W)))) } 
val stageClear: Bool = IO(Output(Bool())) @@ -52,7 +52,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) => val reqQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), 4) val readPort = readChannel(index) - val readData = readResult(index) + val readData = readResult(index).bits val res = out(index) val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency) @@ -68,7 +68,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) - val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid + val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid && readResult(index).valid val readResultValid = !needWAR || readValidPipe val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 932e3a6ba..ca0c1b36e 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -67,7 +67,7 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) - val readResult: Vec[UInt] = Flipped(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) + val readResult: Vec[ValidIO[UInt]] = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W)))) val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) val lsuMaskInput: Vec[UInt] = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W))) @@ -795,7 +795,7 @@ class MaskUnit(val parameter: T1Parameter) val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) readTokenRelease(index) := readDataQueue.deq.fire readDataQueue.enq.valid 
:= readResultSelect.orR - readDataQueue.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.enq.bits := Mux1H(readResultSelect, readResult.map(_.bits)) >> (dataOffset ## 0.U(3.W)) readDataQueue.deq } diff --git a/t1/src/package.scala b/t1/src/package.scala index 4bb5835da..5eca78e39 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -251,6 +251,60 @@ package object rtl { sink <> queue.deq } + def maskUnitReadArbitrate[T <: Data](source: Vec[DecoupledIO[T]]): DecoupledIO[T] = { + require(source.size == 2) + val maskRead = source.head + val lsuRead = source.last + val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(maskRead.bits))) + val maskUnitFirst = RegInit(false.B) + val tryToRead = maskRead.valid || lsuRead.valid + when(tryToRead && !sinkWire.fire) { + maskUnitFirst := !maskUnitFirst + } + + sinkWire.valid := Mux( + maskUnitFirst, + maskRead.valid, + lsuRead.valid + ) + sinkWire.bits := + Mux(maskUnitFirst, maskRead.bits, lsuRead.bits) + lsuRead.ready := sinkWire.ready && !maskUnitFirst + maskRead.ready := sinkWire.ready && maskUnitFirst + sinkWire + } + + def connectVrfAccess[T <: Data]( + latencyVec: Seq[Int], + tokenSizeVec: Seq[Int], + vrfReadLatency: Option[Int] + )(sourceVec: Vec[DecoupledIO[T]], + sink: DecoupledIO[T], + arb: Int, + dataAck: Option[UInt] = None, + dataToSource: Option[Seq[ValidIO[UInt]]] = None + ): Unit = { + val sinkVec: Vec[DecoupledIO[T]] = VecInit(sourceVec.zipWithIndex.map { case (source, index) => + val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(source.bits))) + connectDecoupledWithShifter(latencyVec(index), tokenSizeVec(index))(source, sinkWire) + sinkWire + }) + if (arb == 0) { + sink <> maskUnitReadArbitrate(sinkVec) + } + dataToSource.foreach { sourceDataVec => + require(dataAck.isDefined) + sourceDataVec.zipWithIndex.foreach { case (sourceData, index) => + val sinkRequest = sinkVec(index) + val accessDataValid = Pipe(sinkRequest.fire, 0.U.asTypeOf(new 
EmptyBundle), vrfReadLatency.get).valid + val accessDataSource = Wire(Valid(chiselTypeOf(dataAck.get))) + accessDataSource.valid := accessDataValid + accessDataSource.bits := dataAck.get /* forward the VRF read data; the valid strobe is carried separately in .valid */ + connectWithShifter(latencyVec(index))(accessDataSource, sourceData) + } + } + } + def instantiateVFU( parameter: VFUInstantiateParameter )(requestVec: Vec[SlotRequestToVFU], From 9fc4d81f59e262e1bc5e2f9f047064d6d12b1f71 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Sun, 8 Dec 2024 19:01:27 +0800 Subject: [PATCH 04/41] [rtl] Pipe result in float adder. --- t1/src/FloatModule.scala | 6 +++--- t1/src/mask/MaskReduce.scala | 27 +++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/t1/src/FloatModule.scala b/t1/src/FloatModule.scala index 8862937c2..cff306c9c 100644 --- a/t1/src/FloatModule.scala +++ b/t1/src/FloatModule.scala @@ -17,7 +17,7 @@ object FloatAdderParameter { implicit def rwP = upickle.default.macroRW[FloatAdderParameter] } -case class FloatAdderParameter(expWidth: Int, sigWidth: Int) extends SerializableModuleParameter +case class FloatAdderParameter(expWidth: Int, sigWidth: Int, latency: Int) extends SerializableModuleParameter class FloatAdderInterface(val parameter: FloatAdderParameter) extends Bundle { val expWidth = parameter.expWidth @@ -61,8 +61,8 @@ class FloatAdder(val parameter: FloatAdderParameter) addRecFN.io.roundingMode := io.roundingMode addRecFN.io.detectTininess := false.B - io.out := fNFromRecFN(8, 24, addRecFN.io.out) - io.exceptionFlags := addRecFN.io.exceptionFlags + io.out := Pipe(true.B, fNFromRecFN(8, 24, addRecFN.io.out), parameter.latency).bits + io.exceptionFlags := Pipe(true.B, addRecFN.io.exceptionFlags, parameter.latency).bits } object FloatCompareParameter { diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala index adba8a813..d479b064b 100644 --- a/t1/src/mask/MaskReduce.scala +++ b/t1/src/mask/MaskReduce.scala @@ -75,7 +75,8 @@ class MaskReduce(val parameter: 
MaskReduceParameter) val omInstance: Instance[MaskReduceOM] = Instantiate(new MaskReduceOM(parameter)) io.om := omInstance.getPropertyReference - val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 + val floatAdderLatency: Int = 1 + val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 // todo: uop decode val order: Bool = in.bits.uop === "b101".U @@ -89,7 +90,7 @@ class MaskReduce(val parameter: MaskReduceParameter) val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(LaneLogicParameter(parameter.datapathWidth))) // option unit for flot reduce val floatAdder: Option[Instance[FloatAdder]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24)))) + Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24, floatAdderLatency)))) omInstance.floatAdderIn.zip(floatAdder).foreach { case (l, r) => l := r.io.om.asAnyClassType } val flotCompare: Option[Instance[FloatCompare]] = Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(FloatCompareParameter(8, 24)))) @@ -114,6 +115,7 @@ class MaskReduce(val parameter: MaskReduceParameter) val floatType: Bool = reqReg.uop(2) || reqReg.uop(1, 0).andR val NotAdd: Bool = reqReg.uop(1) val widen: Bool = reqReg.uop === "b001".U || reqReg.uop(2, 1) === "b11".U + val floatAdd: Bool = floatType && !NotAdd // eew1HReg(0) || (eew1HReg(1) && !widen) val needFold: Bool = false.B val writeEEW: UInt = Mux(pop, 2.U, reqReg.eew + widen) @@ -123,16 +125,21 @@ class MaskReduce(val parameter: MaskReduceParameter) // crossFold: reduce between lane // lastFold: reduce in data path // orderRed: order reduce - val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4) + val idle :: crossFold :: lastFold :: orderRed :: waitRes :: Nil = Enum(5) val state: UInt = RegInit(idle) val stateIdle: Bool = state === idle val stateCross: Bool = state === crossFold val stateLast: Bool = state === lastFold val stateOrder: Bool = state === 
orderRed + val stateWait: Bool = state === waitRes + // wait float + val waitCount: UInt = RegInit(0.U(log2Ceil(floatAdderLatency.max(2)).W)) + when(stateWait) { waitCount := waitCount + 1.U } + val resFire: Bool = stateWait && waitCount === (floatAdderLatency - 1).U updateResult := - stateLast || ((stateCross || stateOrder) && sourceValid) + stateLast || ((stateCross || stateOrder) && sourceValid && !floatAdd) || resFire // state update in.ready := stateIdle @@ -143,9 +150,21 @@ class MaskReduce(val parameter: MaskReduceParameter) } when(stateCross) { + when(floatAdd) { + state := waitRes + waitCount := 0.U + }.elsewhen(groupLastReduce) { + state := Mux(reqReg.lastGroup && needFold, lastFold, idle) + outValid := reqReg.lastGroup && !needFold + } + } + + when(stateWait && resFire) { when(groupLastReduce) { state := Mux(reqReg.lastGroup && needFold, lastFold, idle) outValid := reqReg.lastGroup && !needFold + }.otherwise { + state := crossFold } } From 73905a9dcc48b22301761dcaf614e793021b1bbb Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 9 Dec 2024 15:38:29 +0800 Subject: [PATCH 05/41] [rtl] connect vrf write with shifter. 
--- t1/src/Bundles.scala | 3 +- t1/src/Lane.scala | 47 ++++++---------------- t1/src/T1.scala | 35 +++++----------- t1/src/laneStage/MaskExchangeUnit.scala | 7 ++-- t1/src/laneStage/SlotTokenManager.scala | 29 +++----------- t1/src/mask/MaskUnit.scala | 53 +++++++++++-------------- t1/src/package.scala | 2 +- t1/src/vrf/VRF.scala | 4 +- 8 files changed, 58 insertions(+), 122 deletions(-) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index 492fc195d..5dd255384 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -792,6 +792,5 @@ class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle { } class LaneTokenBundle extends Bundle { - val maskResponseRelease: Bool = Output(Bool()) - val maskRequestRelease: Bool = Input(Bool()) + val maskRequestRelease: Bool = Input(Bool()) } diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 5864f41e6..d6e68acf6 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -349,9 +349,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskIndexVec: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(log2Ceil(parameter.maskGroupWidth).W)))) - /** the find first one index register in this lane. */ - val ffoIndexReg: UInt = RegInit(0.U(log2Ceil(parameter.vLen / 8).W)) - /** result of reduce instruction. 
*/ val reduceResult: UInt = RegInit(0.U(parameter.datapathWidth.W)) @@ -359,7 +356,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ val vrfWriteArbiter: Vec[DecoupledIO[VRFWriteRequest]] = Wire( Vec( - parameter.chainingSize + 2, + parameter.chainingSize + 1, Decoupled( new VRFWriteRequest( parameter.vrfParam.regNumBits, @@ -371,30 +368,15 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) - val lsuWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, 1, flow = true) - // connect lsuWriteQueue.enq - lsuWriteQueue.enq.valid := vrfWriteChannel.valid && !writeFromMask - lsuWriteQueue.enq.bits := vrfWriteChannel.bits - vrfWriteChannel.ready := writeFromMask || lsuWriteQueue.enq.ready - - val maskWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, parameter.maskUnitVefWriteQueueSize) - // connect maskWriteQueue.enq - maskWriteQueue.enq.valid := vrfWriteChannel.valid && writeFromMask - maskWriteQueue.enq.bits := vrfWriteChannel.bits - - vrfWriteArbiter(parameter.chainingSize).valid := lsuWriteQueue.deq.valid - vrfWriteArbiter(parameter.chainingSize).bits := lsuWriteQueue.deq.bits - lsuWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize).ready + vrfWriteArbiter(parameter.chainingSize).valid := vrfWriteChannel.valid + vrfWriteArbiter(parameter.chainingSize).bits := vrfWriteChannel.bits + vrfWriteChannel.ready := vrfWriteArbiter(parameter.chainingSize).ready - vrfWriteArbiter(parameter.chainingSize + 1).valid := maskWriteQueue.deq.valid - vrfWriteArbiter(parameter.chainingSize + 1).bits := maskWriteQueue.deq.bits - maskWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize + 1).ready - - val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 4) { i => + val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) } - val afterCheckValid: 
Seq[Bool] = Seq.tabulate(parameter.chainingSize + 4) { _ => RegInit(false.B) } - val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 4, Bool())) + val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) } + val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } /** for each slot, assert when it is asking [[T1]] to change mask */ @@ -594,7 +576,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskUnitRequest <> mask.maskReq maskRequestToLSU <> mask.maskRequestToLSU tokenIO <> mask.tokenIO - tokenIO.maskResponseRelease := maskWriteQueue.deq.fire mask.dequeue }.getOrElse(stage3EnqWire) stage3.enqueue <> stage3EnqSelect @@ -849,7 +830,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // It’s been a long time since I selected it. 
Need pipe val queueBeforeMaskWrite: QueueIO[VRFWriteRequest] = Queue.io(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true) - val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 4).W)) + val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W)) val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt // 处理 rf @@ -1156,8 +1137,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ writeReadyForLsu := vrf.writeReadyForLsu vrfReadyToStore := vrf.vrfReadyToStore tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) => - rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 2 + rptIndex) - rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 2 + rptIndex).instructionIndex + rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex) + rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } // todo: add mask unit write token tokenManager.responseReport.valid := maskUnitRequest.valid @@ -1193,13 +1174,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteEnq.valid := vrfWriteChannel.fire tokenManager.topWriteEnq.bits := vrfWriteChannel.bits.instructionIndex - tokenManager.fromMask := writeFromMask - - tokenManager.lsuWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) - tokenManager.lsuWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex - tokenManager.maskWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize + 1) - tokenManager.maskWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1).instructionIndex + tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) + tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex tokenManager.maskUnitLastReport := lsuLastReport diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 
dfbfc1db5..dc510ffea 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -656,20 +656,6 @@ class T1(val parameter: T1Parameter) val completeIndexInstruction: Bool = ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle - val vrfWrite: Vec[DecoupledIO[VRFWriteRequest]] = Wire( - Vec( - parameter.laneNumber, - Decoupled( - new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfParam.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - ) - ) - ) - val freeOR: Bool = VecInit(slots.map(_.state.idle)).asUInt.orR /** slot is ready to accept new instructions. */ @@ -766,13 +752,15 @@ class T1(val parameter: T1Parameter) Some(Seq(maskUnit.io.readResult(index), lsu.vrfReadResults(index))) ) - val maskTryToWrite = maskUnit.io.exeResp(index) - // lsu & mask unit write lane - // Mask write has absolute priority because it has a token - lane.vrfWriteChannel.valid := vrfWrite(index).valid || maskTryToWrite.valid - lane.vrfWriteChannel.bits := Mux(maskTryToWrite.valid, maskTryToWrite.bits, vrfWrite(index).bits) - vrfWrite(index).ready := lane.vrfWriteChannel.ready && !maskTryToWrite.valid - lane.writeFromMask := maskTryToWrite.valid + connectVrfAccess( + Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), + Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)) + )( + VecInit(Seq(maskUnit.io.exeResp(index), lsu.vrfWritePort(index))), + lane.vrfWriteChannel, + 0 + ) + lane.writeFromMask := maskUnit.exeResp(index).fire lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 @@ -850,7 +838,6 @@ class T1(val parameter: T1Parameter) } maskUnit.io.tokenIO.zip(laneVec).zipWithIndex.foreach { case ((token, lane), index) => - token.maskResponseRelease := lane.tokenIO.maskResponseRelease lane.tokenIO.maskRequestRelease := 
token.maskRequestRelease || lsu.tokenIO.offsetGroupRelease(index) } @@ -887,8 +874,6 @@ class T1(val parameter: T1Parameter) io.highBandwidthLoadStorePort <> lsu.axi4Port io.indexedLoadStorePort <> lsu.simpleAccessPorts - // 暂时直接连lsu的写,后续需要处理scheduler的写 - vrfWrite.zip(lsu.vrfWritePort).foreach { case (sink, source) => sink <> source } /** Slot has free entries. */ val free = VecInit(slots.map(_.state.idle)).asUInt @@ -974,7 +959,7 @@ class T1(val parameter: T1Parameter) probeWire.requestRegReady := requestRegDequeue.ready // maskUnitWrite maskUnitWriteReady probeWire.writeQueueEnqVec.zip(maskUnit.io.exeResp).foreach { case (probe, write) => - probe.valid := write.valid && write.bits.mask.orR + probe.valid := write.fire && write.bits.mask.orR probe.bits := write.bits.instructionIndex } probeWire.instructionValid := maskAnd( diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala index cb8257f83..49faff45f 100644 --- a/t1/src/laneStage/MaskExchangeUnit.scala +++ b/t1/src/laneStage/MaskExchangeUnit.scala @@ -54,8 +54,7 @@ class MaskExchangeUnit(parameter: LaneParameter) extends Module { val maskRequestEnqReady: Bool = !enqIsMaskRequest || maskRequestAllow - dequeue.valid := enqueue.valid && enqSendToDeq - dequeue.bits := enqueue.bits - enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady) - tokenIO.maskResponseRelease := DontCare + dequeue.valid := enqueue.valid && enqSendToDeq + dequeue.bits := enqueue.bits + enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady) } diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index 22eef760b..dadea0fd5 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -94,13 +94,7 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val topWriteEnq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public - val fromMask: Bool = 
IO(Input(Bool())) - - @public - val lsuWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) - - @public - val maskWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + val topWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public val instructionValid: UInt = IO(Output(UInt((2 * parameter.chainingSize).W))) @@ -212,27 +206,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val instructionInWritePipe: UInt = tokenUpdate(writePipeToken, writePipeEnq, writePipeDeq) // lsu & mask write token - val lsuWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val maskWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val topWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) val topWriteDoEnq: UInt = maskAnd(topWriteEnq.valid, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - val lsuWriteDoEnq: UInt = - maskAnd(topWriteEnq.valid && !fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - - val maskWriteDoEnq: UInt = - maskAnd(topWriteEnq.valid && fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - - val lsuWriteDoDeq: UInt = - maskAnd(lsuWriteDeq.valid, indexToOH(lsuWriteDeq.bits, parameter.chainingSize)).asUInt - - val maskWriteDoDeq: UInt = - maskAnd(maskWriteDeq.valid, indexToOH(maskWriteDeq.bits, parameter.chainingSize)).asUInt + val topWriteDoDeq: UInt = + maskAnd(topWriteDeq.valid, indexToOH(topWriteDeq.bits, parameter.chainingSize)).asUInt - val lsuInTopWrite = tokenUpdate(lsuWriteToken, lsuWriteDoEnq, lsuWriteDoDeq) - val maskInTopWrite = tokenUpdate(maskWriteToken, maskWriteDoEnq, maskWriteDoDeq) + val topWrite: UInt = tokenUpdate(topWriteToken, topWriteDoEnq, topWriteDoDeq) - dataInWritePipe := instructionInWritePipe | lsuInTopWrite | maskInTopWrite + 
dataInWritePipe := instructionInWritePipe | topWrite instructionValid := dataInWritePipe | instructionInSlot } diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index ca0c1b36e..ce1ab81ae 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -39,15 +39,15 @@ import org.chipsalliance.t1.rtl.decoder.Decoder // 11 11 1 -> maskdestination class MaskUnitInterface(parameter: T1Parameter) extends Bundle { - val clock: Clock = Input(Clock()) - val reset: Reset = Input(Reset()) - val instReq: ValidIO[MaskUnitInstReq] = Flipped(Valid(new MaskUnitInstReq(parameter))) - val exeReq: Vec[ValidIO[MaskUnitExeReq]] = Flipped( + val clock: Clock = Input(Clock()) + val reset: Reset = Input(Reset()) + val instReq: ValidIO[MaskUnitInstReq] = Flipped(Valid(new MaskUnitInstReq(parameter))) + val exeReq: Vec[ValidIO[MaskUnitExeReq]] = Flipped( Vec(parameter.laneNumber, Valid(new MaskUnitExeReq(parameter.laneParam))) ) - val exeResp: Vec[ValidIO[VRFWriteRequest]] = Vec( + val exeResp: Vec[DecoupledIO[VRFWriteRequest]] = Vec( parameter.laneNumber, - Valid( + Decoupled( new VRFWriteRequest( parameter.vrfParam.regNumBits, parameter.laneParam.vrfOffsetBits, @@ -56,8 +56,8 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) - val tokenIO: Vec[LaneTokenBundle] = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle)) - val readChannel: Vec[DecoupledIO[VRFReadRequest]] = Vec( + val tokenIO: Vec[LaneTokenBundle] = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle)) + val readChannel: Vec[DecoupledIO[VRFReadRequest]] = Vec( parameter.laneNumber, Decoupled( new VRFReadRequest( @@ -67,19 +67,19 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) - val readResult: Vec[ValidIO[UInt]] = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W)))) - val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) - val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) - val 
lsuMaskInput: Vec[UInt] = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W))) - val lsuMaskSelect: Vec[UInt] = Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W))) - val laneMaskInput: Vec[UInt] = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val laneMaskSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W))) - val laneMaskSewSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(2.W))) - val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped(Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam)))) - val writeRDData: UInt = Output(UInt(parameter.xLen.W)) - val gatherData: DecoupledIO[UInt] = Decoupled(UInt(parameter.xLen.W)) - val gatherRead: Bool = Input(Bool()) - val om: Property[ClassType] = Output(Property[AnyClassType]()) + val readResult: Vec[ValidIO[UInt]] = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W)))) + val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) + val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) + val lsuMaskInput: Vec[UInt] = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W))) + val lsuMaskSelect: Vec[UInt] = Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W))) + val laneMaskInput: Vec[UInt] = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) + val laneMaskSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W))) + val laneMaskSewSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(2.W))) + val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped(Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam)))) + val writeRDData: UInt = Output(UInt(parameter.xLen.W)) + val gatherData: DecoupledIO[UInt] = Decoupled(UInt(parameter.xLen.W)) + val gatherRead: Bool = Input(Bool()) + val om: Property[ClassType] = Output(Property[AnyClassType]()) } @instantiable 
@@ -1091,17 +1091,10 @@ class MaskUnit(val parameter: T1Parameter) } queue.enq.bits.index := instReg.instructionIndex - // write token - val tokenCounter = RegInit(0.U(log2Ceil(parameter.maskUnitVefWriteQueueSize + 1).W)) - val tokenAllow: Bool = queue.deq.fire - val counterChange: UInt = Mux(tokenAllow, 1.U, -1.S(tokenCounter.getWidth.W).asUInt) - when(tokenAllow ^ tokenIO(index).maskResponseRelease) { - tokenCounter := tokenCounter + counterChange - } // write vrf val writePort = exeResp(index) - queue.deq.ready := !tokenCounter.asBools.last - writePort.valid := tokenAllow + queue.deq.ready := writePort.ready + writePort.valid := queue.deq.valid writePort.bits.last := DontCare writePort.bits.instructionIndex := instReg.instructionIndex writePort.bits.data := Mux(queue.deq.bits.ffoByOther, queue.deq.bits.pipeData, queue.deq.bits.writeData.data) diff --git a/t1/src/package.scala b/t1/src/package.scala index 5eca78e39..412d62ddf 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -277,7 +277,7 @@ package object rtl { def connectVrfAccess[T <: Data]( latencyVec: Seq[Int], tokenSizeVec: Seq[Int], - vrfReadLatency: Option[Int] + vrfReadLatency: Option[Int] = None )(sourceVec: Vec[DecoupledIO[T]], sink: DecoupledIO[T], arb: Int, diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 721c888f4..6e094a47b 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -199,7 +199,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val writeCheck: Vec[LSUWriteCheck] = IO( Vec( - parameter.chainingSize + 4, + parameter.chainingSize + 3, Input( new LSUWriteCheck( parameter.regNumBits, @@ -211,7 +211,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar ) @public - val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 4, Output(Bool()))) + val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 3, Output(Bool()))) /** when instruction is fired, record it in the VRF for 
chaining. */ @public From ddb8e39e0e6ec41e5372cabc1bc707ee7d285b9d Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Wed, 11 Dec 2024 16:50:12 +0800 Subject: [PATCH 06/41] [rtl] add mask control. --- t1/src/Lane.scala | 113 ++++++++++++++++++++++++++++++++++------------ t1/src/T1.scala | 11 +++-- 2 files changed, 91 insertions(+), 33 deletions(-) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index d6e68acf6..0f241c091 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -98,6 +98,7 @@ case class LaneParameter( crossLaneVRFWriteEscapeQueueSize: Int, fpuEnable: Boolean, portFactor: Int, + maskRequestLatency: Int, vrfRamType: RamType, decoderParam: DecoderParam, vfuInstantiateParameter: VFUInstantiateParameter) @@ -325,17 +326,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) omInstance.vrfIn := Property(vrf.om.asAnyClassType) - /** TODO: review later - */ - val maskGroupedOrR: UInt = VecInit( - maskInput.asBools - .grouped(parameter.dataPathByteWidth) - .toSeq - .map( - VecInit(_).asUInt.orR - ) - ).asUInt - val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt /** the slot is occupied by instruction */ @@ -379,6 +369,66 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } + // todo: mv to bundle.scala + class MaskControl(parameter: LaneParameter) extends Bundle { + val index: UInt = UInt(parameter.instructionIndexBits.W) + val sew: UInt = UInt(2.W) + val maskData: UInt = UInt(parameter.datapathWidth.W) + val group: UInt = UInt(parameter.maskGroupSizeBits.W) + val dataValid: Bool = Bool() + val waiteResponse: Bool = Bool() + val controlValid: Bool = Bool() + } + + val maskControlRelease: Vec[ValidIO[UInt]] = + 
Wire(Vec(parameter.chainingSize, Valid(UInt(parameter.instructionIndexBits.W)))) + + val maskControlEnq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlDataDeq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlReq: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + val maskControlReqSelect: UInt = ffo(maskControlReq.asUInt) + // mask request & response handle + val maskControlVec: Seq[MaskControl] = Seq.tabulate(parameter.chainingSize) { index => + val state = RegInit(0.U.asTypeOf(new MaskControl(parameter))) + val releaseHit: Bool = maskControlRelease.map(r => r.valid && (r.bits === state.index)).reduce(_ || _) + val responseFire = + Pipe(maskControlReqSelect(index), 0.U.asTypeOf(new EmptyBundle), parameter.maskRequestLatency).valid + + when(maskControlEnq(index)) { + state := 0.U.asTypeOf(state) + state.index := laneRequest.bits.instructionIndex + state.sew := laneRequest.bits.csrInterface.vSew + state.controlValid := true.B + } + + when(state.controlValid) { + when(releaseHit) { + state.controlValid := false.B + } + } + + maskControlReq(index) := state.controlValid && !state.dataValid && !state.waiteResponse + when(maskControlReqSelect(index)) { + state.waiteResponse := true.B + state.group := state.group + 1.U + } + + when(responseFire) { + state.dataValid := true.B + state.waiteResponse := false.B + state.maskData := maskInput + } + + when(maskControlDataDeq(index)) { + state.dataValid := false.B + } + + state + } + val maskControlFree: Seq[Bool] = maskControlVec.map(s => !s.controlValid && !s.waiteResponse) + val freeSelect: UInt = ffo(VecInit(maskControlFree).asUInt) + maskControlEnq := maskAnd(laneRequest.fire && laneRequest.bits.mask, freeSelect) + /** for each slot, assert when it is asking [[T1]] to change mask */ val slotMaskRequestVec: Vec[ValidIO[UInt]] = Wire( Vec( @@ -388,7 +438,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) /** which slot wins the arbitration for requesting 
mask. */ - val maskRequestFireOH: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskRequestFireOH: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + val maskDataVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.maskGroupWidth.W))) /** FSM control for each slot. if index == 0, * - slot can support write v0 in mask type, see [[Decoder.maskDestination]] [[Decoder.maskSource]] @@ -611,6 +662,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ laneState.elements.get(k).foreach(stateData => d := stateData) } + maskControlRelease(index).valid := false.B + maskControlRelease(index).bits := record.laneRequest.instructionIndex // update lane state when(stage0.enqueue.fire) { maskGroupCountVec(index) := stage0.updateLaneState.maskGroupCount @@ -618,6 +671,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskIndexVec(index) := stage0.updateLaneState.maskIndex when(stage0.updateLaneState.outOfExecutionRange) { slotOccupied(index) := false.B + maskControlRelease(index).valid := true.B } } @@ -632,7 +686,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskFailure: Bool = stage0.updateLaneState.maskExhausted && stage0.enqueue.fire // update mask register when(maskUpdateFire) { - record.mask.bits := maskInput + record.mask.bits := DontCare } when(maskUpdateFire ^ maskFailure) { record.mask.valid := maskUpdateFire @@ -908,20 +962,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } { - // 处理mask的请求 - val maskSelectArbitrator = ffo( - VecInit(slotMaskRequestVec.map(_.valid)).asUInt ## - (laneRequest.valid && (laneRequest.bits.mask || laneRequest.bits.decodeResult(Decoder.maskSource))) - ) - maskRequestFireOH := maskSelectArbitrator(parameter.chainingSize, 1) - maskSelect := Mux1H( - maskSelectArbitrator, - 0.U.asTypeOf(slotMaskRequestVec.head.bits) +: slotMaskRequestVec.map(_.bits) - ) - maskSelectSew := Mux1H( - 
maskSelectArbitrator, - csrInterface.vSew +: slotControl.map(_.laneRequest.csrInterface.vSew) - ) + maskSelect := Mux1H(maskControlReqSelect, maskControlVec.map(_.group)) + maskSelectSew := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew)) + maskControlDataDeq := slotMaskRequestVec.zipWithIndex.map { case (req, index) => + val slotIndex = slotControl(index).laneRequest.instructionIndex + val hitMaskControl = VecInit(maskControlVec.map(_.index === slotIndex)).asUInt + val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) + val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) + val group = Mux1H(hitMaskControl, maskControlVec.map(_.group)) + val sameGroup = group === req.bits + dontTouch(sameGroup) + val maskRequestFire = req.valid && dataValid + maskRequestFireOH(index) := maskRequestFire + maskDataVec(index) := data + maskAnd(maskRequestFire, hitMaskControl).asUInt + }.reduce(_ | _) } // package a control logic for incoming instruction. @@ -944,9 +999,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // for 'nr' type instructions, they will need another complete signal. !(laneRequest.bits.decodeResult(Decoder.nr) || laneRequest.bits.lsWholeReg) // indicate if this is the mask type. - entranceControl.mask.valid := laneRequest.bits.mask + entranceControl.mask.valid := false.B // assign mask from [[V]] - entranceControl.mask.bits := maskInput + entranceControl.mask.bits := DontCare // mask used for VRF write in this group. entranceControl.vrfWriteMask := 0.U diff --git a/t1/src/T1.scala b/t1/src/T1.scala index dc510ffea..0418d056a 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -261,6 +261,8 @@ case class T1Parameter( val lsuReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) val lsuReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val maskRequestLatency = 2 + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. 
*/ @@ -299,6 +301,7 @@ case class T1Parameter( crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, fpuEnable = fpuEnable, portFactor = vrfBankSize, + maskRequestLatency = 2 * maskRequestLatency, vrfRamType = vrfRamType, decoderParam = decoderParam, vfuInstantiateParameter = vfuInstantiateParameter @@ -760,7 +763,7 @@ class T1(val parameter: T1Parameter) lane.vrfWriteChannel, 0 ) - lane.writeFromMask := maskUnit.exeResp(index).fire + lane.writeFromMask := maskUnit.io.exeResp(index).fire lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 @@ -770,9 +773,9 @@ class T1(val parameter: T1Parameter) d := ohCheck(lane.instructionFinished, f, parameter.chainingSize) } vxsatReportVec(index) := lane.vxsatReport - lane.maskInput := maskUnit.io.laneMaskInput(index) - maskUnit.io.laneMaskSelect(index) := lane.maskSelect - maskUnit.io.laneMaskSewSelect(index) := lane.maskSelectSew + lane.maskInput := Pipe(true.B, maskUnit.io.laneMaskInput(index), parameter.maskRequestLatency).bits + maskUnit.io.laneMaskSelect(index) := Pipe(true.B, lane.maskSelect, parameter.maskRequestLatency).bits + maskUnit.io.laneMaskSewSelect(index) := Pipe(true.B, lane.maskSelectSew, parameter.maskRequestLatency).bits maskUnit.io.v0UpdateVec(index) <> lane.v0Update lane.lsuLastReport := lsu.lastReport | maskUnit.io.lastReport From 5a60637b413ffc5ae1c83de09f4b45569f380b1b Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Wed, 11 Dec 2024 18:09:37 +0800 Subject: [PATCH 07/41] [rtl] duplicate v0 in lsu. 
--- t1/src/Bundles.scala | 6 ++-- t1/src/Lane.scala | 64 +++++++++++++++++++------------------- t1/src/T1.scala | 11 +++---- t1/src/lsu/LSU.scala | 34 ++++++++++++++------ t1/src/mask/MaskUnit.scala | 13 ++------ 5 files changed, 67 insertions(+), 61 deletions(-) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index 5dd255384..d59a97d75 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -360,9 +360,9 @@ class LaneResponseFeedback(param: LaneParameter) extends Bundle { val complete: Bool = Bool() } -class V0Update(param: LaneParameter) extends Bundle { - val data: UInt = UInt(param.datapathWidth.W) - val offset: UInt = UInt(param.vrfOffsetBits.W) +class V0Update(datapathWidth: Int, vrfOffsetBits: Int) extends Bundle { + val data: UInt = UInt(datapathWidth.W) + val offset: UInt = UInt(vrfOffsetBits.W) // mask/ld类型的有可能不会写完整的32bit val mask: UInt = UInt(4.W) } diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 0f241c091..5d5e5f08f 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -276,7 +276,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** V0 update in the lane should also update [[T1.v0]] */ @public - val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter))) + val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter.datapathWidth, parameter.vrfOffsetBits))) /** input of mask data */ @public @@ -371,33 +371,33 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // todo: mv to bundle.scala class MaskControl(parameter: LaneParameter) extends Bundle { - val index: UInt = UInt(parameter.instructionIndexBits.W) - val sew: UInt = UInt(2.W) - val maskData: UInt = UInt(parameter.datapathWidth.W) - val group: UInt = UInt(parameter.maskGroupSizeBits.W) - val dataValid: Bool = Bool() + val index: UInt = UInt(parameter.instructionIndexBits.W) + val sew: UInt = UInt(2.W) + val maskData: UInt = UInt(parameter.datapathWidth.W) + val group: UInt = 
UInt(parameter.maskGroupSizeBits.W) + val dataValid: Bool = Bool() val waiteResponse: Bool = Bool() - val controlValid: Bool = Bool() + val controlValid: Bool = Bool() } val maskControlRelease: Vec[ValidIO[UInt]] = Wire(Vec(parameter.chainingSize, Valid(UInt(parameter.instructionIndexBits.W)))) - val maskControlEnq: UInt = Wire(UInt(parameter.chainingSize.W)) - val maskControlDataDeq: UInt = Wire(UInt(parameter.chainingSize.W)) - val maskControlReq: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) - val maskControlReqSelect: UInt = ffo(maskControlReq.asUInt) + val maskControlEnq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlDataDeq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlReq: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + val maskControlReqSelect: UInt = ffo(maskControlReq.asUInt) // mask request & response handle - val maskControlVec: Seq[MaskControl] = Seq.tabulate(parameter.chainingSize) { index => + val maskControlVec: Seq[MaskControl] = Seq.tabulate(parameter.chainingSize) { index => val state = RegInit(0.U.asTypeOf(new MaskControl(parameter))) val releaseHit: Bool = maskControlRelease.map(r => r.valid && (r.bits === state.index)).reduce(_ || _) val responseFire = Pipe(maskControlReqSelect(index), 0.U.asTypeOf(new EmptyBundle), parameter.maskRequestLatency).valid when(maskControlEnq(index)) { - state := 0.U.asTypeOf(state) - state.index := laneRequest.bits.instructionIndex - state.sew := laneRequest.bits.csrInterface.vSew + state := 0.U.asTypeOf(state) + state.index := laneRequest.bits.instructionIndex + state.sew := laneRequest.bits.csrInterface.vSew state.controlValid := true.B } @@ -410,13 +410,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskControlReq(index) := state.controlValid && !state.dataValid && !state.waiteResponse when(maskControlReqSelect(index)) { state.waiteResponse := true.B - state.group := state.group + 1.U + state.group := state.group + 1.U } 
when(responseFire) { - state.dataValid := true.B + state.dataValid := true.B state.waiteResponse := false.B - state.maskData := maskInput + state.maskData := maskInput } when(maskControlDataDeq(index)) { @@ -425,8 +425,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ state } - val maskControlFree: Seq[Bool] = maskControlVec.map(s => !s.controlValid && !s.waiteResponse) - val freeSelect: UInt = ffo(VecInit(maskControlFree).asUInt) + val maskControlFree: Seq[Bool] = maskControlVec.map(s => !s.controlValid && !s.waiteResponse) + val freeSelect: UInt = ffo(VecInit(maskControlFree).asUInt) maskControlEnq := maskAnd(laneRequest.fire && laneRequest.bits.mask, freeSelect) /** for each slot, assert when it is asking [[T1]] to change mask */ @@ -439,7 +439,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** which slot wins the arbitration for requesting mask. */ val maskRequestFireOH: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) - val maskDataVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.maskGroupWidth.W))) + val maskDataVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.maskGroupWidth.W))) /** FSM control for each slot. 
if index == 0, * - slot can support write v0 in mask type, see [[Decoder.maskDestination]] [[Decoder.maskSource]] @@ -663,14 +663,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } maskControlRelease(index).valid := false.B - maskControlRelease(index).bits := record.laneRequest.instructionIndex + maskControlRelease(index).bits := record.laneRequest.instructionIndex // update lane state when(stage0.enqueue.fire) { maskGroupCountVec(index) := stage0.updateLaneState.maskGroupCount // todo: handle all elements in first group are masked maskIndexVec(index) := stage0.updateLaneState.maskIndex when(stage0.updateLaneState.outOfExecutionRange) { - slotOccupied(index) := false.B + slotOccupied(index) := false.B maskControlRelease(index).valid := true.B } } @@ -962,19 +962,19 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } { - maskSelect := Mux1H(maskControlReqSelect, maskControlVec.map(_.group)) - maskSelectSew := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew)) + maskSelect := Mux1H(maskControlReqSelect, maskControlVec.map(_.group)) + maskSelectSew := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew)) maskControlDataDeq := slotMaskRequestVec.zipWithIndex.map { case (req, index) => - val slotIndex = slotControl(index).laneRequest.instructionIndex - val hitMaskControl = VecInit(maskControlVec.map(_.index === slotIndex)).asUInt - val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) - val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) - val group = Mux1H(hitMaskControl, maskControlVec.map(_.group)) - val sameGroup = group === req.bits + val slotIndex = slotControl(index).laneRequest.instructionIndex + val hitMaskControl = VecInit(maskControlVec.map(_.index === slotIndex)).asUInt + val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) + val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) + val group = Mux1H(hitMaskControl, 
maskControlVec.map(_.group)) + val sameGroup = group === req.bits dontTouch(sameGroup) val maskRequestFire = req.valid && dataValid maskRequestFireOH(index) := maskRequestFire - maskDataVec(index) := data + maskDataVec(index) := data maskAnd(maskRequestFire, hitMaskControl).asUInt }.reduce(_ | _) } diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 0418d056a..0d71bf6a5 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -777,6 +777,7 @@ class T1(val parameter: T1Parameter) maskUnit.io.laneMaskSelect(index) := Pipe(true.B, lane.maskSelect, parameter.maskRequestLatency).bits maskUnit.io.laneMaskSewSelect(index) := Pipe(true.B, lane.maskSelectSew, parameter.maskRequestLatency).bits maskUnit.io.v0UpdateVec(index) <> lane.v0Update + lsu.v0UpdateVec(index) <> lane.v0Update lane.lsuLastReport := lsu.lastReport | maskUnit.io.lastReport @@ -811,12 +812,10 @@ class T1(val parameter: T1Parameter) lsu.request.bits.instructionInformation.isStore := isStoreType lsu.request.bits.instructionInformation.maskedLoadStore := maskType - maskUnit.io.lsuMaskSelect := lsu.maskSelect - lsu.maskInput := maskUnit.io.lsuMaskInput - lsu.csrInterface := requestRegCSR - lsu.csrInterface.vl := evlForLsu - lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR - lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR + lsu.csrInterface := requestRegCSR + lsu.csrInterface.vl := evlForLsu + lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR + lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR // connect mask unit maskUnit.io.instReq.valid := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit) diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index b7bd81202..cd29f90be 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -118,15 +118,10 @@ class LSU(param: LSUParameter) extends Module { @public val request: DecoupledIO[LSURequest] = IO(Flipped(Decoupled(new 
LSURequest(param.datapathWidth)))) - /** mask from [[V]] TODO: since mask is one-cycle information for a mask group, we should latch it in the LSU, and - * reduce the IO width. this needs PnR information. - */ - @public - val maskInput: Vec[UInt] = IO(Input(Vec(param.lsuMSHRSize, UInt(param.maskGroupWidth.W)))) - - /** the address of the mask group in the [[V]]. */ @public - val maskSelect: Vec[UInt] = IO(Output(Vec(param.lsuMSHRSize, UInt(param.maskGroupSizeBits.W)))) + val v0UpdateVec: Vec[ValidIO[V0Update]] = IO( + Flipped(Vec(param.laneNumber, Valid(new V0Update(param.datapathWidth, param.vrfOffsetBits)))) + ) @public val axi4Port: AXI4RWIrrevocable = IO(new AXI4RWIrrevocable(param.axi4BundleParameter)) @@ -197,6 +192,25 @@ class LSU(param: LSUParameter) extends Module { val storeUnit: StoreUnit = Module(new StoreUnit(param.mshrParam)) val otherUnit: SimpleAccessUnit = Module(new SimpleAccessUnit(param.mshrParam)) + /** duplicate v0 in lsu */ + val v0: Vec[UInt] = RegInit( + VecInit(Seq.fill(param.vLen / param.datapathWidth)(0.U(param.datapathWidth.W))) + ) + + // write v0(mask) + v0.zipWithIndex.foreach { case (data, index) => + // 属于哪个lane + val laneIndex: Int = index % param.laneNumber + // 取出写的端口 + val v0Write = v0UpdateVec(laneIndex) + // offset + val offset: Int = index / param.laneNumber + val maskExt = FillInterleaved(8, v0Write.bits.mask) + when(v0Write.valid && v0Write.bits.offset === offset.U) { + data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) + } + } + val unitVec = Seq(loadUnit, storeUnit, otherUnit) /** Always merge into cache line */ @@ -222,8 +236,8 @@ class LSU(param: LSUParameter) extends Module { mshr.lsuRequest.valid := reqEnq(index) mshr.lsuRequest.bits := request.bits - maskSelect(index) := Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U) - mshr.maskInput := maskInput(index) + val maskSelect = Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U) + mshr.maskInput := cutUInt(v0.asUInt, 
param.maskGroupWidth)(maskSelect) // broadcast CSR mshr.csrInterface := csrInterface diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index ce1ab81ae..76cd556d2 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -70,12 +70,12 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { val readResult: Vec[ValidIO[UInt]] = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W)))) val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) - val lsuMaskInput: Vec[UInt] = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W))) - val lsuMaskSelect: Vec[UInt] = Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W))) val laneMaskInput: Vec[UInt] = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) val laneMaskSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W))) val laneMaskSewSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(2.W))) - val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped(Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam)))) + val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped( + Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam.datapathWidth, parameter.laneParam.vrfOffsetBits))) + ) val writeRDData: UInt = Output(UInt(parameter.xLen.W)) val gatherData: DecoupledIO[UInt] = Decoupled(UInt(parameter.xLen.W)) val gatherRead: Bool = Input(Bool()) @@ -116,8 +116,6 @@ class MaskUnit(val parameter: T1Parameter) val readResult = io.readResult val writeRD = io.writeRD val lastReport = io.lastReport - val lsuMaskInput = io.lsuMaskInput - val lsuMaskSelect = io.lsuMaskSelect val laneMaskInput = io.laneMaskInput val laneMaskSelect = io.laneMaskSelect val laneMaskSewSelect = io.laneMaskSewSelect @@ -168,11 +166,6 @@ class MaskUnit(val parameter: T1Parameter) input := cutUInt(v0SelectBySew, 
parameter.datapathWidth)(laneMaskSelect(index)) } - // lsu - lsuMaskInput.zip(lsuMaskSelect).foreach { case (data, index) => - data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) - } - val maskedWrite: BitLevelMaskWrite = Module(new BitLevelMaskWrite(parameter)) def gatherIndex(elementIndex: UInt, vlmul: UInt, sew: UInt): (UInt, UInt, UInt, UInt, Bool) = { From 41a2e4dc767a715ee13c88bc8f7bfd38bb212ff6 Mon Sep 17 00:00:00 2001 From: Jiuyang Liu Date: Fri, 13 Dec 2024 10:59:25 +0800 Subject: [PATCH 08/41] [om] bug fix on the ReferenceTarget pattern match --- omreaderlib/src/T1OMReaderAPI.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omreaderlib/src/T1OMReaderAPI.scala b/omreaderlib/src/T1OMReaderAPI.scala index 754d07b75..cda1fa2f5 100644 --- a/omreaderlib/src/T1OMReaderAPI.scala +++ b/omreaderlib/src/T1OMReaderAPI.scala @@ -54,7 +54,7 @@ object Path { implicit val rw: ReadWriter[Instruction] = macroRW def parse(str: String): Path = str match { - case s"OMInstanceTarget:~${top}|${hier}>${local}" => + case s"OMReferenceTarget:~${top}|${hier}>${local}" => Path( top, hier From 81cc226c0dfb3cd36377b2cd9cd47f32b612fccf Mon Sep 17 00:00:00 2001 From: Jiuyang Liu Date: Fri, 13 Dec 2024 10:59:45 +0800 Subject: [PATCH 09/41] [om] retime FloatAdder in Permutation --- omreaderlib/src/t1/T1.scala | 11 ++++++++++- omreaderlib/src/t1rocketv/T1RocketTile.scala | 8 +++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/omreaderlib/src/t1/T1.scala b/omreaderlib/src/t1/T1.scala index 5e040886e..2c7ee4666 100644 --- a/omreaderlib/src/t1/T1.scala +++ b/omreaderlib/src/t1/T1.scala @@ -18,6 +18,15 @@ class T1(val mlirbc: Array[Byte]) extends T1OMReaderAPI { def instructions: Seq[Instruction] = t1("decoder").obj("instructions").list.elements().map(_.obj).map(getInstruction) def sram: Seq[SRAM] = t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) - def retime: Seq[Retime] = + + def floatAdder = { + val reduceUnit = 
t1("permutatuon").obj("reduceUnit").obj + // TODO: need fieldOpt(name: String) + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj).flatMap(getRetime) + } + + def vfus: Seq[Retime] = t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) + + def retime = (vfus ++ floatAdder).distinct } diff --git a/omreaderlib/src/t1rocketv/T1RocketTile.scala b/omreaderlib/src/t1rocketv/T1RocketTile.scala index 1224726e4..3187adec6 100644 --- a/omreaderlib/src/t1rocketv/T1RocketTile.scala +++ b/omreaderlib/src/t1rocketv/T1RocketTile.scala @@ -22,6 +22,12 @@ class T1RocketTile(val mlirbc: Array[Byte]) extends T1OMReaderAPI { Seq(tile("frontend").obj("icache").obj, tile("hellaCache").obj).flatMap(getSRAM) def vfu: Seq[Retime] = t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) - def retime = vfu + def floatAdder = { + val reduceUnit = t1("permutatuon").obj("reduceUnit").obj + // TODO: need fieldOpt(name: String) + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj).flatMap(getRetime) + } + + def retime = (vfu ++ floatAdder).distinct def sram = vrf ++ cache } From 36a271093f18b9f6f3aaf73b1164316168df99ad Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 16 Dec 2024 16:09:40 +0800 Subject: [PATCH 10/41] [rtl] fix mask update in lane. 
--- t1/src/Lane.scala | 2 +- t1/src/T1.scala | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 5d5e5f08f..abecc3220 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -686,7 +686,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskFailure: Bool = stage0.updateLaneState.maskExhausted && stage0.enqueue.fire // update mask register when(maskUpdateFire) { - record.mask.bits := DontCare + record.mask.bits := maskDataVec(index) } when(maskUpdateFire ^ maskFailure) { record.mask.valid := maskUpdateFire diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 0d71bf6a5..4c395f147 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -964,10 +964,9 @@ class T1(val parameter: T1Parameter) probe.valid := write.fire && write.bits.mask.orR probe.bits := write.bits.instructionIndex } - probeWire.instructionValid := maskAnd( - !slots.last.state.wMaskUnitLast && !slots.last.state.idle, - indexToOH(slots.last.record.instructionIndex, parameter.chainingSize) - ).asUInt + probeWire.instructionValid := slots + .map(s => maskAnd(!s.state.idle, indexToOH(s.record.instructionIndex, parameter.chainingSize)).asUInt) + .reduce(_ | _) probeWire.responseCounter := responseCounter probeWire.laneProbes.zip(laneVec).foreach { case (p, l) => p := probe.read(l.laneProbe) } probeWire.lsuProbe := probe.read(lsu.lsuProbe) From 0ff54831bce0f2f8171a162aeddbbd946b5789e9 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 16 Dec 2024 16:10:07 +0800 Subject: [PATCH 11/41] [code] format. 
--- omreaderlib/src/T1OMReaderAPI.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omreaderlib/src/T1OMReaderAPI.scala b/omreaderlib/src/T1OMReaderAPI.scala index cda1fa2f5..22005a8e4 100644 --- a/omreaderlib/src/T1OMReaderAPI.scala +++ b/omreaderlib/src/T1OMReaderAPI.scala @@ -65,7 +65,7 @@ object Path { }), Some(local) ) - case s"OMInstanceTarget:~${top}|${hier}" => + case s"OMInstanceTarget:~${top}|${hier}" => Path( top, hier From 334b8165b4375524ff95583b95e4952a522f6a1f Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Tue, 17 Dec 2024 15:07:28 +0800 Subject: [PATCH 12/41] [rtl] fix vrf read result. --- t1/src/package.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/package.scala b/t1/src/package.scala index 412d62ddf..7b648bc75 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -299,7 +299,7 @@ package object rtl { val accessDataValid = Pipe(sinkRequest.fire, 0.U.asTypeOf(new EmptyBundle), vrfReadLatency.get).valid val accessDataSource = Wire(Valid(chiselTypeOf(dataAck.get))) accessDataSource.valid := accessDataValid - accessDataSource.bits := accessDataValid + accessDataSource.bits := dataAck.get connectWithShifter(latencyVec(index))(accessDataSource, sourceData) } } From e04edb39b01193f31d81ce08dbe3329bf87bb8f4 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 19 Dec 2024 11:06:34 +0800 Subject: [PATCH 13/41] [rtl] Check whether the order of instruction index is clear in the sequencer. 
--- t1/src/Lane.scala | 18 +++++++++--------- t1/src/T1.scala | 21 +++++++++++++++------ t1/src/vrf/VRF.scala | 33 +++++++++++++++++---------------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index abecc3220..934e9af27 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -315,9 +315,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val laneProbe = IO(Output(Probe(new LaneProbe(parameter), layers.Verification))) - @public - val vrfAllocateIssue: Bool = IO(Output(Bool())) - // TODO: remove dontTouch(writeBusPort) val csrInterface: CSRInterface = laneRequest.bits.csrInterface @@ -544,8 +541,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // Overflow occurs val vxsatEnq: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt((2 * parameter.chainingSize).W))) + + val instructionFinishInSlot: UInt = Wire(UInt((2 * parameter.chainingSize).W)) // vxsatEnq and instructionFinished cannot happen at the same time - vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinished).asUInt + vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinishInSlot).asUInt /** assert when a instruction will not use mask unit */ val instructionUnrelatedMaskUnitVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W))) @@ -1087,7 +1086,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // enqueue from lane request if (slotIndex == parameter.chainingSize - 1) { enqueueValid := laneRequest.valid - enqueueReady := slotShiftValid(slotIndex) && vrf.instructionWriteReport.ready + enqueueReady := slotShiftValid(slotIndex) when(enqueueFire) { slotControl(slotIndex) := entranceControl maskGroupCountVec(slotIndex) := 0.U(parameter.maskGroupSizeBits.W) @@ -1117,7 +1116,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // handshake // @todo 
@Clo91eaf lane can take request from Sequencer - laneRequest.ready := slotFree && vrf.instructionWriteReport.ready + laneRequest.ready := slotFree val instructionFinishAndNotReportByTop: Bool = entranceControl.instructionFinished && !laneRequest.bits.decodeResult(Decoder.readOnly) && (writeCount === 0.U) @@ -1149,7 +1148,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.state.wLaneLastReport := !laneRequest.valid vrf.instructionWriteReport.bits.state.wTopLastReport := !laneRequest.bits.decodeResult(Decoder.maskUnit) vrf.instructionWriteReport.bits.state.wLaneClear := false.B - vrfAllocateIssue := vrf.vrfAllocateIssue val elementSizeForOneRegister: Int = parameter.vLen / parameter.datapathWidth / parameter.laneNumber val nrMask: UInt = VecInit(Seq.tabulate(8) { i => @@ -1183,12 +1181,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.elementMask := selectMask + instructionFinishInSlot := (~instructionValid).asUInt & instructionValidNext + // clear record by instructionFinished - vrf.instructionLastReport := instructionFinished + vrf.instructionLastReport := instructionFinishInSlot vrf.lsuLastReport := lsuLastReport vrf.loadDataInLSUWriteQueue := loadDataInLSUWriteQueue vrf.dataInLane := instructionValid - instructionFinished := (~instructionValid).asUInt & instructionValidNext + instructionFinished := vrf.vrfSlotRelease writeReadyForLsu := vrf.writeReadyForLsu vrfReadyToStore := vrf.vrfReadyToStore tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) => diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 4c395f147..6813c9b05 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -664,6 +664,13 @@ class T1(val parameter: T1Parameter) /** slot is ready to accept new instructions. 
*/ val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR) + val olderCheck: Bool = slots.map { re => + // The same lsb will make it difficult to distinguish between the new and the old + val notSameLSB: Bool = re.record.instructionIndex(parameter.instructionIndexBits - 2, 0) =/= + requestReg.bits.instructionIndex(parameter.instructionIndexBits - 2, 0) + re.state.idle || (instIndexL(re.record.instructionIndex, requestReg.bits.instructionIndex) && notSameLSB) + }.reduce(_ && _) + val source1Select: UInt = Mux( decodeResult(Decoder.gather), @@ -699,7 +706,7 @@ class T1(val parameter: T1Parameter) ) laneRequestSourceWire.foreach { request => - request.valid := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction + request.valid := requestRegDequeue.fire // hard wire request.bits.instructionIndex := requestReg.bits.instructionIndex request.bits.decodeResult := decodeResult @@ -717,7 +724,7 @@ class T1(val parameter: T1Parameter) // and broadcast to all lanes. request.bits.readFromScalar := source1Select - request.bits.issueInst := requestRegDequeue.fire + request.bits.issueInst := !noOffsetReadLoadStore && !maskUnitInstruction request.bits.loadStore := isLoadStoreType // let record in VRF to know there is a store instruction. 
request.bits.store := isStoreType @@ -738,10 +745,12 @@ class T1(val parameter: T1Parameter) */ val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index => val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam)) - lane.laneRequest.valid := laneRequestSinkWire(index).valid && lane.vrfAllocateIssue + lane.laneRequest.valid := laneRequestSinkWire(index).valid && laneRequestSinkWire(index).bits.issueInst lane.laneRequest.bits := laneRequestSinkWire(index).bits - laneRequestSinkWire(index).ready := lane.laneRequest.ready && lane.vrfAllocateIssue - lane.laneIndex := index.U + lane.laneRequest.bits.issueInst := laneRequestSinkWire(index).fire + laneRequestSinkWire(index).ready := !laneRequestSinkWire(index).bits.issueInst || lane.laneRequest.ready + + lane.laneIndex := index.U connectVrfAccess( Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), @@ -905,7 +914,7 @@ class T1(val parameter: T1Parameter) // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || maskUnit.io.gatherData.valid) && - tokenManager.issueAllow && instructionIndexFree + tokenManager.issueAllow && instructionIndexFree && olderCheck instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 6e094a47b..0292d8409 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -15,6 +15,7 @@ import chisel3.properties.{AnyClassType, Class, ClassType, Path, Property} import org.chipsalliance.stdlib.GeneralOM import org.chipsalliance.t1.rtl.{ ffo, + indexToOH, instIndexL, instIndexLE, ohCheck, @@ -215,7 +216,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar /** when instruction is fired, record it in the VRF for chaining. 
*/ @public - val instructionWriteReport: DecoupledIO[VRFWriteReport] = IO(Flipped(Decoupled(new VRFWriteReport(parameter)))) + val instructionWriteReport: ValidIO[VRFWriteReport] = IO(Flipped(Valid(new VRFWriteReport(parameter)))) /** similar to [[flush]]. */ @public @@ -224,6 +225,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val lsuLastReport: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) + @public + val vrfSlotRelease: UInt = IO(Output(UInt((2 * parameter.chainingSize).W))) + @public val dataInLane: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) @@ -232,9 +236,6 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val vrfReadyToStore: Bool = IO(Output(Bool())) - @public - val vrfAllocateIssue: Bool = IO(Output(Bool())) - /** we can only chain LSU instructions, after [[LSU.writeQueueVec]] is cleared. */ @public val loadDataInLSUWriteQueue: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) @@ -275,6 +276,13 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val chainingRecordCopy: Vec[ValidIO[VRFWriteReport]] = RegInit( VecInit(Seq.fill(parameter.chainingSize + 1)(0.U.asTypeOf(Valid(new VRFWriteReport(parameter))))) ) + val recordRelease: Vec[UInt] = WireDefault( + VecInit( + Seq.fill(parameter.chainingSize + 1)( + 0.U.asTypeOf(UInt((parameter.chainingSize * 2).W)) + ) + ) + ) val recordValidVec: Seq[Bool] = chainingRecord.map(r => !r.bits.elementMask.andR && r.valid) // first read @@ -500,23 +508,12 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar // @todo @Clo91eaf VRF ready signal for performance. 
val freeRecord: UInt = VecInit(chainingRecord.map(!_.valid)).asUInt val recordFFO: UInt = ffo(freeRecord) - val recordEnq: UInt = Wire(UInt((parameter.chainingSize + 1).W)) - val olderCheck = chainingRecord.map { re => - // The same lsb will make it difficult to distinguish between the new and the old - val notSameLSB: Bool = re.bits.instIndex(parameter.instructionIndexBits - 2, 0) =/= - instructionWriteReport.bits.instIndex(parameter.instructionIndexBits - 2, 0) - !re.valid || (instIndexL(re.bits.instIndex, instructionWriteReport.bits.instIndex) && notSameLSB) - }.reduce(_ && _) - // handle VRF hazard - // @todo @Clo91eaf VRF ready signal for performance. - instructionWriteReport.ready := freeRecord.orR && olderCheck - recordEnq := Mux( + val recordEnq: UInt = Mux( // 纯粹的lsu指令的记录不需要ready instructionWriteReport.valid, recordFFO, 0.U((parameter.chainingSize + 1).W) ) - vrfAllocateIssue := freeRecord.orR && olderCheck val writePort: Seq[ValidIO[VRFWriteRequest]] = Seq(writePipe) val loadUnitReadPorts: Seq[DecoupledIO[VRFReadRequest]] = Seq(readRequests.last) @@ -570,6 +567,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar when(stateClear) { record.valid := false.B + when(record.valid) { + recordRelease(i) := indexToOH(record.bits.instIndex, parameter.chainingSize) + } } when(recordEnq(i)) { @@ -617,6 +617,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar } writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _) vrfReadyToStore := !hazardVec.map(_.map(_._2).reduce(_ || _)).reduce(_ || _) + vrfSlotRelease := recordRelease.reduce(_ | _) writeCheck.zip(writeAllow).foreach { case (check, allow) => allow := chainingRecordCopy From 36705e4bbd3673f731d0eac6a2d452e51d4d32f4 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 19 Dec 2024 11:16:06 +0800 Subject: [PATCH 14/41] [rtl] Pipe result in MaskCompress. 
--- t1/src/mask/MaskCompress.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 24f597259..aef7421f5 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -44,6 +44,8 @@ class MaskCompress(parameter: T1Parameter) extends Module { val writeRD = in.bits.uop === BitPat("b?11") val ffoType = in.bits.uop === BitPat("b11?") + val outWire: CompressOutput = Wire(new CompressOutput(parameter)) + val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) @@ -142,11 +144,11 @@ class MaskCompress(parameter: T1Parameter) extends Module { } val compressResult: UInt = Mux1H(eew1H, compressDataVec) val lastCompressEnq: Bool = in.fire && in.bits.lastCompress - when(newInstruction || lastCompressEnq || out.compressValid) { + when(newInstruction || lastCompressEnq || outWire.compressValid) { compressTailValid := lastCompressEnq && compress } - when(newInstruction || out.compressValid) { + when(newInstruction || outWire.compressValid) { compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U) } @@ -177,7 +179,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, in.bits.validInput) - out.data := Mux1H( + outWire.data := Mux1H( Seq( compress -> compressResult, viota -> viotaResult, @@ -187,7 +189,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { ) // todo: compressMask - out.mask := Mux1H( + outWire.mask := Mux1H( Seq( compress -> compressMask, viota -> viotaMask, @@ -197,8 +199,8 @@ class MaskCompress(parameter: T1Parameter) extends Module { ) // todo - out.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD - out.groupCounter := Mux(compress, compressWriteGroupCount, 
in.bits.groupCounter) + outWire.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD + outWire.groupCounter := Mux(compress, compressWriteGroupCount, in.bits.groupCounter) when(newInstruction && ffoInstruction) { ffoIndex := -1.S(parameter.datapathWidth.W).asUInt @@ -235,5 +237,6 @@ class MaskCompress(parameter: T1Parameter) extends Module { }.elsewhen(mvRd) { ffoIndex := source1SigExtend } - out.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) + outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) + out := RegNext(outWire, 0.U.asTypeOf(out)) } From ecaf3776f3019220f1dd91a4369660ffef6358af Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 19 Dec 2024 17:44:21 +0800 Subject: [PATCH 15/41] [rtl] Add read token in store unit. --- t1/src/lsu/SimpleAccessUnit.scala | 4 ++++ t1/src/lsu/StoreUnit.scala | 26 ++++++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala index d7409500b..bc517d5eb 100644 --- a/t1/src/lsu/SimpleAccessUnit.scala +++ b/t1/src/lsu/SimpleAccessUnit.scala @@ -120,6 +120,10 @@ case class MSHRParam( // outstanding of MaskExchangeUnit.maskReq // todo: param from T1Param val maskRequestQueueSize: Int = 8 + + // outstanding of StoreUnit.vrfReadDataPorts + // todo: param from T1Param + val storeUnitReadOutStanding: Int = 8 } /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index a4e125296..00f261a4b 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -62,8 +62,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { // todo: need hazardCheck? 
val hazardCheck: Bool = RegEnable(vrfReadyToStore && !lsuRequest.valid, false.B, lsuRequest.valid || vrfReadyToStore) // read stage dequeue ready need all source valid, Or add a queue to coordinate - val vrfReadQueueVec: Seq[QueueIO[UInt]] = - Seq.tabulate(param.laneNumber)(_ => Queue.io(UInt(param.datapathWidth.W), 2, flow = true, pipe = true)) + val vrfReadQueueVec: Seq[QueueIO[UInt]] = Seq.tabulate(param.laneNumber)(_ => + Queue.io(UInt(param.datapathWidth.W), param.storeUnitReadOutStanding, flow = true, pipe = true) + ) // 从vrf里面读数据 val readStageValid: Bool = Seq @@ -72,10 +73,6 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { val segPtr: UInt = RegInit(0.U(3.W)) val readCount: UInt = RegInit(0.U(dataGroupBits.W)) val stageValid = RegInit(false.B) - // queue for read latency - // todo: param.vrfReadLatency => param.vrfReadLatency + shifterLatency - val queue: QueueIO[UInt] = - Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency, flow = true) val lastReadPtr: Bool = segPtr === 0.U @@ -105,8 +102,14 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readCount := nextReadCount } + val readCounter = RegInit(0.U(log2Ceil(param.storeUnitReadOutStanding + 1).W)) + val counterChange: UInt = Mux(readPort.fire, 1.U, -1.S(readCounter.getWidth.W).asUInt) + when(readPort.fire ^ vrfReadQueueVec(laneIndex).deq.fire) { + readCounter := readCounter + counterChange + } + // vrf read request - readPort.valid := stageValid && vrfReadQueueVec(laneIndex).enq.ready + readPort.valid := stageValid && !readCounter.asBools.last readPort.bits.vs := lsuRequestReg.instructionInformation.vs3 + segPtr * segmentInstructionIndexInterval + @@ -116,11 +119,10 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readPort.bits.instructionIndex := lsuRequestReg.instructionIndex // latency queue enq - queue.enq.valid := vrfReadResults(laneIndex).valid - queue.enq.bits := vrfReadResults(laneIndex).bits - 
AssertProperty(BoolSequence(!queue.enq.valid || queue.enq.ready)) - vrfReadQueueVec(laneIndex).enq <> queue.deq - stageValid || RegNext(readPort.fire) + AssertProperty(BoolSequence(!vrfReadQueueVec(laneIndex).enq.valid || vrfReadQueueVec(laneIndex).enq.ready)) + vrfReadQueueVec(laneIndex).enq.valid := vrfReadResults(laneIndex).valid + vrfReadQueueVec(laneIndex).enq.bits := vrfReadResults(laneIndex).bits + stageValid || readCounter.orR } .reduce(_ || _) From 56131d27c00b6dfa20c52e441a57a5ce7c20af93 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Fri, 20 Dec 2024 11:55:53 +0800 Subject: [PATCH 16/41] [rtl] retime compress unit. --- omreaderlib/src/t1/T1.scala | 13 ++-- omreaderlib/src/t1rocketv/T1RocketTile.scala | 19 ++++-- t1/src/T1.scala | 8 +-- t1/src/mask/MaskCompress.scala | 70 +++++++++++++++---- t1/src/mask/MaskUnit.scala | 72 ++++++++++++-------- 5 files changed, 128 insertions(+), 54 deletions(-) diff --git a/omreaderlib/src/t1/T1.scala b/omreaderlib/src/t1/T1.scala index 2c7ee4666..a935163dd 100644 --- a/omreaderlib/src/t1/T1.scala +++ b/omreaderlib/src/t1/T1.scala @@ -19,14 +19,19 @@ class T1(val mlirbc: Array[Byte]) extends T1OMReaderAPI { def sram: Seq[SRAM] = t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) - def floatAdder = { - val reduceUnit = t1("permutatuon").obj("reduceUnit").obj + def permutation: Seq[Retime] = { + val permutation = t1("permutation") + val reduceUnit = permutation.obj("reduceUnit").obj + val compressUnit = permutation.obj("compress").obj // TODO: need fieldOpt(name: String) - Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj).flatMap(getRetime) + val floatAdder = + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj) + + (Seq(compressUnit) ++ floatAdder).flatMap(getRetime) } def vfus: Seq[Retime] = t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) - def retime = (vfus ++ 
floatAdder).distinct + def retime = (vfus ++ permutation).distinct } diff --git a/omreaderlib/src/t1rocketv/T1RocketTile.scala b/omreaderlib/src/t1rocketv/T1RocketTile.scala index 3187adec6..0c33bd3f6 100644 --- a/omreaderlib/src/t1rocketv/T1RocketTile.scala +++ b/omreaderlib/src/t1rocketv/T1RocketTile.scala @@ -20,14 +20,21 @@ class T1RocketTile(val mlirbc: Array[Byte]) extends T1OMReaderAPI { t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) def cache: Seq[SRAM] = Seq(tile("frontend").obj("icache").obj, tile("hellaCache").obj).flatMap(getSRAM) - def vfu: Seq[Retime] = - t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) - def floatAdder = { - val reduceUnit = t1("permutatuon").obj("reduceUnit").obj + + def permutation: Seq[Retime] = { + val permutation = t1("permutation") + val reduceUnit = permutation.obj("reduceUnit").obj + val compressUnit = permutation.obj("compress").obj // TODO: need fieldOpt(name: String) - Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj).flatMap(getRetime) + val floatAdder = + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj) + + (Seq(compressUnit) ++ floatAdder).flatMap(getRetime) } - def retime = (vfu ++ floatAdder).distinct + def vfus: Seq[Retime] = + t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) + + def retime = (vfus ++ permutation).distinct def sram = vrf ++ cache } diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 6813c9b05..8466cd7ba 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -66,10 +66,10 @@ class T1OM(parameter: T1Parameter) extends GeneralOM[T1Parameter, T1](parameter) val decoderIn = IO(Input(Property[AnyClassType]())) decoder := decoderIn - val permutatuon = IO(Output(Property[AnyClassType]())) + val permutation = IO(Output(Property[AnyClassType]())) @public - val permutatuonIn = 
IO(Input(Property[AnyClassType]())) - permutatuon := permutatuonIn + val permutationIn = IO(Input(Property[AnyClassType]())) + permutation := permutationIn } object T1Parameter { @@ -403,7 +403,7 @@ class T1(val parameter: T1Parameter) val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter)) maskUnit.io.clock := implicitClock maskUnit.io.reset := implicitReset - omInstance.permutatuonIn := Property(maskUnit.io.om.asAnyClassType) + omInstance.permutationIn := Property(maskUnit.io.om.asAnyClassType) val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter)) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index aef7421f5..2ece87ca9 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -4,9 +4,26 @@ package org.chipsalliance.t1.rtl import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, Instance, Instantiate} +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.properties.{AnyClassType, Path, Property} import chisel3.util._ +import org.chipsalliance.stdlib.GeneralOM -class CompressInput(parameter: T1Parameter) extends Bundle { +case class CompressParam( + datapathWidth: Int, + xLen: Int, + vLen: Int, + laneNumber: Int, + groupNumberBits: Int, + latency: Int) + extends SerializableModuleParameter + +object CompressParam { + implicit def rwP = upickle.default.macroRW[CompressParam] +} + +class CompressInput(parameter: CompressParam) extends Bundle { val maskType: Bool = Bool() val eew: UInt = UInt(2.W) val uop: UInt = UInt(3.W) @@ -14,26 +31,55 @@ class CompressInput(parameter: T1Parameter) extends Bundle { val source1: UInt = UInt(parameter.datapathWidth.W) val mask: UInt = UInt(parameter.datapathWidth.W) val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) - val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) 
val ffoInput: UInt = UInt(parameter.laneNumber.W) val validInput: UInt = UInt(parameter.laneNumber.W) val lastCompress: Bool = Bool() } -class CompressOutput(parameter: T1Parameter) extends Bundle { +class CompressOutput(parameter: CompressParam) extends Bundle { val data: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) val mask: UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) - val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) val ffoOutput: UInt = UInt(parameter.laneNumber.W) val compressValid: Bool = Bool() } -class MaskCompress(parameter: T1Parameter) extends Module { - val in: ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter)))) - val out: CompressOutput = IO(Output(new CompressOutput(parameter))) - val newInstruction: Bool = IO(Input(Bool())) - val ffoInstruction: Bool = IO(Input(Bool())) - val writeData: UInt = IO(Output(UInt(parameter.xLen.W))) +class MaskCompressInterFace(parameter: CompressParam) extends Bundle { + val clock = Input(Clock()) + val reset = Input(Reset()) + + val in: ValidIO[CompressInput] = Flipped(Valid(new CompressInput(parameter))) + val out: CompressOutput = Output(new CompressOutput(parameter)) + val newInstruction: Bool = Input(Bool()) + val ffoInstruction: Bool = Input(Bool()) + val writeData: UInt = Output(UInt(parameter.xLen.W)) + val om = Output(Property[AnyClassType]()) +} + +@instantiable +class MaskCompressOM(parameter: CompressParam) extends GeneralOM[CompressParam, MaskCompress](parameter) { + override def hasRetime: Boolean = true +} + +class MaskCompress(val parameter: CompressParam) + extends FixedIORawModule(new MaskCompressInterFace(parameter)) + with SerializableModule[CompressParam] + with ImplicitClock + with ImplicitReset { + + protected def implicitClock = io.clock + protected def implicitReset = io.reset + + val omInstance: Instance[MaskCompressOM] = Instantiate(new 
MaskCompressOM(parameter)) + io.om := omInstance.getPropertyReference + omInstance.retimeIn.foreach(_ := Property(Path(io.clock))) + + val in = io.in + val out = io.out + val newInstruction = io.newInstruction + val ffoInstruction = io.ffoInstruction + val writeData = io.writeData val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 @@ -122,7 +168,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) val compressTailValid: Bool = RegInit(false.B) - val compressWriteGroupCount: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W)) + val compressWriteGroupCount: UInt = RegInit(0.U(parameter.groupNumberBits.W)) val compressDataVec = Seq(1, 2, 4).map { dataByte => val dataBit = dataByte * 8 val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte @@ -238,5 +284,5 @@ class MaskCompress(parameter: T1Parameter) extends Module { ffoIndex := source1SigExtend } outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) - out := RegNext(outWire, 0.U.asTypeOf(out)) + out := Pipe(true.B, outWire, parameter.latency).bits } diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 76cd556d2..af21abaec 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -84,11 +84,15 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { @instantiable class MaskUnitOM(parameter: T1Parameter) extends GeneralOM[T1Parameter, MaskUnit](parameter) { - @public val reduceUnit = IO(Output(Property[AnyClassType]())) @public val reduceUnitIn = IO(Input(Property[AnyClassType]())) reduceUnit := reduceUnitIn + + val compress = IO(Output(Property[AnyClassType]())) + @public + val compressIn = IO(Input(Property[AnyClassType]())) + compress := compressIn } // TODO: no T1Parameter here. 
@@ -898,14 +902,24 @@ class MaskUnit(val parameter: T1Parameter) // Determine whether the data is ready val executeEnqValid: Bool = otherTypeRequestDeq && !readType + val compressParam: CompressParam = CompressParam( + parameter.datapathWidth, + parameter.xLen, + parameter.vLen, + parameter.laneNumber, + parameter.laneParam.groupNumberBits, + 1 + ) // start execute - val compressUnit: MaskCompress = Module(new MaskCompress(parameter)) - val reduceUnit = Instantiate( + val compressUnit = Instantiate(new MaskCompress(compressParam)) + val reduceUnit = Instantiate( new MaskReduce( MaskReduceParameter(parameter.datapathWidth, parameter.laneNumber, parameter.fpuEnable) ) ) omInstance.reduceUnitIn := reduceUnit.io.om.asAnyClassType + omInstance.compressIn := compressUnit.io.om.asAnyClassType + val extendUnit: MaskExtend = Module(new MaskExtend(parameter)) // todo @@ -935,28 +949,30 @@ class MaskUnit(val parameter: T1Parameter) val compressSource1: UInt = Mux1H(sew1H, vs1Split.map(_._1)) val source1Select: UInt = Mux(mv, readVS1Reg.data, compressSource1) val source1Change: Bool = Mux1H(sew1H, vs1Split.map(_._2)) - when(source1Change && compressUnit.in.fire) { + when(source1Change && compressUnit.io.in.fire) { readVS1Reg.dataValid := false.B readVS1Reg.requestSend := false.B readVS1Reg.readIndex := readVS1Reg.readIndex + 1.U } - viotaCounterAdd := compressUnit.in.fire - - compressUnit.in.valid := executeEnqValid && unitType(1) - compressUnit.in.bits.maskType := instReg.maskType - compressUnit.in.bits.eew := instReg.sew - compressUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) - compressUnit.in.bits.readFromScalar := instReg.readFromScala - compressUnit.in.bits.source1 := source1Select - compressUnit.in.bits.mask := executeElementMask - compressUnit.in.bits.source2 := source2 - compressUnit.in.bits.groupCounter := requestCounter - compressUnit.in.bits.lastCompress := lastGroup - compressUnit.in.bits.ffoInput := VecInit(exeReqReg.map(_.bits.ffo)).asUInt - 
compressUnit.in.bits.validInput := VecInit(exeReqReg.map(_.valid)).asUInt - compressUnit.newInstruction := instReq.valid - compressUnit.ffoInstruction := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?") + viotaCounterAdd := compressUnit.io.in.fire + + compressUnit.io.clock := implicitClock + compressUnit.io.reset := implicitReset + compressUnit.io.in.valid := executeEnqValid && unitType(1) + compressUnit.io.in.bits.maskType := instReg.maskType + compressUnit.io.in.bits.eew := instReg.sew + compressUnit.io.in.bits.uop := instReg.decodeResult(Decoder.topUop) + compressUnit.io.in.bits.readFromScalar := instReg.readFromScala + compressUnit.io.in.bits.source1 := source1Select + compressUnit.io.in.bits.mask := executeElementMask + compressUnit.io.in.bits.source2 := source2 + compressUnit.io.in.bits.groupCounter := requestCounter + compressUnit.io.in.bits.lastCompress := lastGroup + compressUnit.io.in.bits.ffoInput := VecInit(exeReqReg.map(_.bits.ffo)).asUInt + compressUnit.io.in.bits.validInput := VecInit(exeReqReg.map(_.valid)).asUInt + compressUnit.io.newInstruction := instReq.valid + compressUnit.io.ffoInstruction := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?") reduceUnit.io.clock := implicitClock reduceUnit.io.reset := implicitReset @@ -980,7 +996,7 @@ class MaskUnit(val parameter: T1Parameter) sink := VecInit(exeReqReg.map(_.bits.fpReduceValid.get)).asUInt } - when(reduceUnit.io.in.fire || compressUnit.in.fire) { + when(reduceUnit.io.in.fire || compressUnit.io.in.fire) { readVS1Reg.sendToExecution := true.B } @@ -1001,7 +1017,7 @@ class MaskUnit(val parameter: T1Parameter) val executeResult: UInt = Mux1H( unitType(3, 1), Seq( - compressUnit.out.data, + compressUnit.io.out.data, reduceUnit.io.out.bits.data, extendUnit.out ) @@ -1021,7 +1037,7 @@ class MaskUnit(val parameter: T1Parameter) val executeValid: Bool = Mux1H( unitType(3, 1), Seq( - compressUnit.out.compressValid, + compressUnit.io.out.compressValid, false.B, 
executeEnqValid ) @@ -1039,13 +1055,13 @@ class MaskUnit(val parameter: T1Parameter) val executeDeqGroupCounter: UInt = Mux1H( unitType(3, 1), Seq( - compressUnit.out.groupCounter, + compressUnit.io.out.groupCounter, requestCounter, extendGroupCount ) ) - val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.out.mask, executeByteMask) + val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.io.out.mask, executeByteMask) maskedWrite.needWAR := maskDestinationType maskedWrite.vd := instReg.vd maskedWrite.in.zipWithIndex.foreach { case (req, index) => @@ -1057,7 +1073,7 @@ class MaskUnit(val parameter: T1Parameter) req.bits.pipeData := exeReqReg(index).bits.source1 req.bits.bitMask := bitMask req.bits.groupCounter := executeDeqGroupCounter - req.bits.ffoByOther := compressUnit.out.ffoOutput(index) && ffo + req.bits.ffoByOther := compressUnit.io.out.ffoOutput(index) && ffo if (index == 0) { // reduce result when(unitType(2)) { @@ -1117,7 +1133,7 @@ class MaskUnit(val parameter: T1Parameter) val executeStageInvalid: Bool = Mux1H( unitType(3, 1), Seq( - !compressUnit.out.compressValid, + !compressUnit.io.out.compressValid, reduceUnit.io.in.ready, true.B ) @@ -1136,7 +1152,7 @@ class MaskUnit(val parameter: T1Parameter) lastReportValid, indexToOH(instReg.instructionIndex, parameter.chainingSize) ) - writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.writeData) + writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.io.writeData) // gather read state when(gatherRequestFire) { From fafcb31a1efbf26a4a5aade85f50eacc41d7017e Mon Sep 17 00:00:00 2001 From: Qinjun Li <44799832+qinjun-li@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:28:02 +0800 Subject: [PATCH 17/41] [rtl] retime compress unit. 
--- t1/src/mask/MaskUnit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index af21abaec..641ea9bd4 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -908,7 +908,7 @@ class MaskUnit(val parameter: T1Parameter) parameter.vLen, parameter.laneNumber, parameter.laneParam.groupNumberBits, - 1 + 2 ) // start execute val compressUnit = Instantiate(new MaskCompress(compressParam)) From 1ddf0dd335f3ed3b0679030ffba27791e18c0932 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Sat, 21 Dec 2024 17:56:50 +0800 Subject: [PATCH 18/41] [rtl] cut pipe in compress unit. --- t1/src/mask/MaskCompress.scala | 57 ++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 2ece87ca9..c4e025e0b 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -58,9 +58,7 @@ class MaskCompressInterFace(parameter: CompressParam) extends Bundle { } @instantiable -class MaskCompressOM(parameter: CompressParam) extends GeneralOM[CompressParam, MaskCompress](parameter) { - override def hasRetime: Boolean = true -} +class MaskCompressOM(parameter: CompressParam) extends GeneralOM[CompressParam, MaskCompress](parameter) {} class MaskCompress(val parameter: CompressParam) extends FixedIORawModule(new MaskCompressInterFace(parameter)) @@ -95,7 +93,7 @@ class MaskCompress(val parameter: CompressParam) val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) - val compressMaskVec: Seq[Bool] = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools + val compressMaskVec: Vec[Bool] = VecInit(changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools) val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { 
case (pre, (mask, index)) => compressVec(index) := pre pre + mask @@ -106,6 +104,7 @@ class MaskCompress(val parameter: CompressParam) val ffoValid: Bool = RegInit(false.B) writeData := ffoIndex + // compress & viota stage 1: update compressInit when(newInstruction) { compressInit := 0.U } @@ -131,28 +130,43 @@ class MaskCompress(val parameter: CompressParam) } } - val viotaResult: UInt = Mux1H( + // compress & viota stage 2: get result + // pipe stage1 result + def initRegEnable[T <: Data](data: T, enable: Bool) = { + RegEnable(data, 0.U.asTypeOf(data), enable) + } + val compressVecPipe: Vec[UInt] = initRegEnable(compressVec, in.fire) + val compressMaskVecPipe: Vec[Bool] = initRegEnable(compressMaskVec, in.fire) + val maskPipe: UInt = initRegEnable(in.bits.mask, in.fire) + val source2Pipe: UInt = initRegEnable(in.bits.source2, in.fire) + val lastCompressPipe: Bool = initRegEnable(in.bits.lastCompress, in.fire) + val stage2Valid: Bool = RegNext(in.fire, false.B) + val newInstructionPipe: Bool = RegNext(newInstruction, false.B) + val compressInitPipe: UInt = initRegEnable(compressInit, in.fire) + val compressDeqValidPipe: Bool = initRegEnable(compressDeqValid, in.fire) + val groupCounterPipe: UInt = initRegEnable(in.bits.groupCounter, in.fire) + val viotaResult: UInt = Mux1H( eew1H, Seq(1, 2, 4).map { eew => VecInit(Seq.tabulate(parameter.laneNumber) { index => // data width: eew * 8, data path 32, need [4 / eew] element val dataSize = 4 / eew val res: Seq[UInt] = Seq.tabulate(dataSize) { i => - changeUIntSize(compressVec(dataSize * index + i), eew * 8) + changeUIntSize(compressVecPipe(dataSize * index + i), eew * 8) } // each data path VecInit(res).asUInt }).asUInt } ) - val viotaMask: UInt = Mux1H( + val viotaMask: UInt = Mux1H( eew1H, Seq(1, 2, 4).map { eew => VecInit(Seq.tabulate(parameter.laneNumber) { index => val dataSize = 4 / eew val res: Seq[UInt] = Seq.tabulate(dataSize) { i => val maskIndex: Int = (parameter.datapathWidth - 1).min(dataSize * index + i) - 
Fill(eew, in.bits.mask(maskIndex)) + Fill(eew, maskPipe(maskIndex)) } // 4 bit mask VecInit(res).asUInt @@ -163,7 +177,7 @@ class MaskCompress(val parameter: CompressParam) val tailCount: UInt = { val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 val maxCountWidth = log2Ceil(minElementSizePerSet) - changeUIntSize(compressInit, maxCountWidth) + changeUIntSize(compressInitPipe, maxCountWidth) } val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) @@ -174,10 +188,12 @@ class MaskCompress(val parameter: CompressParam) val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte VecInit(Seq.tabulate(elementSizePerSet * 2) { index => val hitReq = - Seq.tabulate(elementSizePerSet)(maskIndex => compressMaskVec(maskIndex) && compressVec(maskIndex) === index.U) + Seq.tabulate(elementSizePerSet)(maskIndex => + compressMaskVecPipe(maskIndex) && compressVecPipe(maskIndex) === index.U + ) val selectReqData = Mux1H( hitReq, - cutUInt(in.bits.source2, dataBit) + cutUInt(source2Pipe, dataBit) ) if (index < elementSizePerSet) { val useTail = index.U < tailCount @@ -189,18 +205,18 @@ class MaskCompress(val parameter: CompressParam) }).asUInt } val compressResult: UInt = Mux1H(eew1H, compressDataVec) - val lastCompressEnq: Bool = in.fire && in.bits.lastCompress - when(newInstruction || lastCompressEnq || outWire.compressValid) { + val lastCompressEnq: Bool = stage2Valid && lastCompressPipe + when(newInstructionPipe || lastCompressEnq || outWire.compressValid) { compressTailValid := lastCompressEnq && compress } - when(newInstruction || outWire.compressValid) { - compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U) + when(newInstructionPipe || outWire.compressValid) { + compressWriteGroupCount := Mux(newInstructionPipe, 0.U, compressWriteGroupCount + 1.U) } val splitCompressResult: Vec[UInt] = cutUIntBySize(compressResult, 2) - when(in.fire) { - compressDataReg := 
Mux(compressDeqValid, splitCompressResult(1), splitCompressResult(0)) + when(stage2Valid) { + compressDataReg := Mux(compressDeqValidPipe, splitCompressResult(1), splitCompressResult(0)) } // todo: connect & update compressInit @@ -245,9 +261,10 @@ class MaskCompress(val parameter: CompressParam) ) // todo - outWire.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD - outWire.groupCounter := Mux(compress, compressWriteGroupCount, in.bits.groupCounter) + outWire.compressValid := (compressTailValid || (compressDeqValidPipe && stage2Valid)) && !writeRD + outWire.groupCounter := Mux(compress, compressWriteGroupCount, groupCounterPipe) + // ffo type execute when(newInstruction && ffoInstruction) { ffoIndex := -1.S(parameter.datapathWidth.W).asUInt ffoValid := false.B @@ -284,5 +301,5 @@ class MaskCompress(val parameter: CompressParam) ffoIndex := source1SigExtend } outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) - out := Pipe(true.B, outWire, parameter.latency).bits + out := RegNext(outWire, 0.U.asTypeOf(outWire)) } From 460cd6d871bbcb599c88172d768e2a74ba2550e7 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Sun, 22 Dec 2024 10:29:07 +0800 Subject: [PATCH 19/41] [rtl] pipe instructionFinished in sequencer. --- t1/src/T1.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 8466cd7ba..dc76eb784 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -263,6 +263,8 @@ case class T1Parameter( val maskRequestLatency = 2 + val releaseShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. 
*/ @@ -778,10 +780,11 @@ class T1(val parameter: T1Parameter) lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 lsu.offsetReadIndex(index) := lane.maskUnitRequest.bits.index + val instructionFinishedPipe = Pipe(true.B, lane.instructionFinished, parameter.releaseShifterSize(index)).bits instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => - d := ohCheck(lane.instructionFinished, f, parameter.chainingSize) + d := ohCheck(instructionFinishedPipe, f, parameter.chainingSize) } - vxsatReportVec(index) := lane.vxsatReport + vxsatReportVec(index) := lane.vxsatReport lane.maskInput := Pipe(true.B, maskUnit.io.laneMaskInput(index), parameter.maskRequestLatency).bits maskUnit.io.laneMaskSelect(index) := Pipe(true.B, lane.maskSelect, parameter.maskRequestLatency).bits maskUnit.io.laneMaskSewSelect(index) := Pipe(true.B, lane.maskSelectSew, parameter.maskRequestLatency).bits @@ -798,7 +801,7 @@ class T1(val parameter: T1Parameter) (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) // token manager - tokenManager.instructionFinish(index) := lane.instructionFinished + tokenManager.instructionFinish(index) := instructionFinishedPipe lane } From a3887e36d69d326e89504eb649985b680ecf5bc9 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 23 Dec 2024 13:29:26 +0800 Subject: [PATCH 20/41] [rtl] Calculate compressCount using tree structure. 
--- t1/src/mask/MaskCompress.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index c4e025e0b..03520db7a 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -93,8 +93,11 @@ class MaskCompress(val parameter: CompressParam) val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) - val compressMaskVec: Vec[Bool] = VecInit(changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools) - val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => + val maskInput: UInt = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize) + val compressMaskVec: Vec[Bool] = VecInit(maskInput.asBools) + val compressCount: UInt = compressInit + PopCount(maskInput) + + compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => compressVec(index) := pre pre + mask } @@ -113,7 +116,7 @@ class MaskCompress(val parameter: CompressParam) val dataByte = 1 << sewInt val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte val countWidth = log2Ceil(elementSizePerSet) - val compressDeqValid = (compressCount >> countWidth).asUInt.orR + val compressDeqValid = (compressCount >> countWidth).asUInt(0) val compressUpdate = changeUIntSize(compressCount, countWidth) (compressDeqValid, compressUpdate) } From 4e95ea3f7e36944af956db08b95baa718e7ea3c7 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 23 Dec 2024 13:29:46 +0800 Subject: [PATCH 21/41] [code] format. 
--- omreaderlib/src/t1/T1.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/omreaderlib/src/t1/T1.scala b/omreaderlib/src/t1/T1.scala index a935163dd..dc304d860 100644 --- a/omreaderlib/src/t1/T1.scala +++ b/omreaderlib/src/t1/T1.scala @@ -20,11 +20,11 @@ class T1(val mlirbc: Array[Byte]) extends T1OMReaderAPI { t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) def permutation: Seq[Retime] = { - val permutation = t1("permutation") - val reduceUnit = permutation.obj("reduceUnit").obj + val permutation = t1("permutation") + val reduceUnit = permutation.obj("reduceUnit").obj val compressUnit = permutation.obj("compress").obj // TODO: need fieldOpt(name: String) - val floatAdder = + val floatAdder = Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj) (Seq(compressUnit) ++ floatAdder).flatMap(getRetime) From 7f67533ea497b7682106eaca7b7b6767ada6a7d0 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 23 Dec 2024 16:29:25 +0800 Subject: [PATCH 22/41] [rtl] add last report for empty instruction. 
--- t1/src/Lane.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 934e9af27..1c8893634 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -1183,12 +1183,16 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ instructionFinishInSlot := (~instructionValid).asUInt & instructionValidNext + val emptyInstValid: Bool = RegNext(laneRequest.bits.issueInst && !vrf.instructionWriteReport.valid, false.B) + val emptyInstCount: UInt = RegNext(indexToOH(laneRequest.bits.instructionIndex, parameter.chainingSize)) + val emptyReport: UInt = maskAnd(emptyInstValid, emptyInstCount).asUInt + // clear record by instructionFinished vrf.instructionLastReport := instructionFinishInSlot vrf.lsuLastReport := lsuLastReport vrf.loadDataInLSUWriteQueue := loadDataInLSUWriteQueue vrf.dataInLane := instructionValid - instructionFinished := vrf.vrfSlotRelease + instructionFinished := vrf.vrfSlotRelease | emptyReport writeReadyForLsu := vrf.writeReadyForLsu vrfReadyToStore := vrf.vrfReadyToStore tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) => From 399ece05c1a21473e7772bdc567c67675422888e Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Tue, 24 Dec 2024 11:29:58 +0800 Subject: [PATCH 23/41] [rtl] Fix the arrival time of lsu's lastreport. 
--- t1/src/T1.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index dc76eb784..af0cbf3cf 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -791,7 +791,10 @@ class T1(val parameter: T1Parameter) maskUnit.io.v0UpdateVec(index) <> lane.v0Update lsu.v0UpdateVec(index) <> lane.v0Update - lane.lsuLastReport := lsu.lastReport | maskUnit.io.lastReport + // Must arrive after the instruction request + val lsuLastPipe: UInt = Pipe(true.B, lsu.lastReport, parameter.laneRequestShifterSize(index)).bits + val maskLastPipe: UInt = Pipe(true.B, maskUnit.io.lastReport, parameter.laneRequestShifterSize(index)).bits + lane.lsuLastReport := lsuLastPipe | maskLastPipe lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index) // 2 + 3 = 5 From 29f06d90edddef30265f6243d0f2904567e4f377 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Tue, 24 Dec 2024 15:03:54 +0800 Subject: [PATCH 24/41] [rtl] reorder read for other unit in lsu. --- t1/src/T1.scala | 1 + t1/src/lsu/LSU.scala | 45 ++++++++++++++++++++++++------- t1/src/lsu/SimpleAccessUnit.scala | 8 ++++-- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index af0cbf3cf..fe93a1999 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -327,6 +327,7 @@ case class T1Parameter( transferSize = lsuTransposeSize, vrfReadLatency = vrfReadLatency, axi4BundleParameter = axi4BundleParameter, + lsuReadShifterSize = lsuReadShifterSize, name = "main" ) def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, vrfRamType) diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index cd29f90be..e3d3b1d2a 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -37,6 +37,7 @@ case class LSUParameter( // TODO: refactor to per lane parameter. 
vrfReadLatency: Int, axi4BundleParameter: AXI4BundleParameter, + lsuReadShifterSize: Seq[Int], name: String) { val sewMin: Int = 8 @@ -61,7 +62,16 @@ case class LSUParameter( val sourceQueueSize: Int = 32.min(vLen * 8 / (transferSize * 8)) def mshrParam: MSHRParam = - MSHRParam(chainingSize, datapathWidth, vLen, laneNumber, paWidth, transferSize, vrfReadLatency) + MSHRParam( + chainingSize, + datapathWidth, + vLen, + laneNumber, + paWidth, + transferSize, + lsuReadShifterSize.head, + vrfReadLatency + ) /** see [[VRFParam.regNumBits]] */ val regNumBits: Int = log2Ceil(32) @@ -245,7 +255,6 @@ class LSU(param: LSUParameter) extends Module { /** TileLink D Channel write to VRF queue: TL-D -CrossBar-> MSHR -proxy-> write queue -CrossBar-> VRF */ - @public val writeQueueVec: Seq[QueueIO[LSUWriteQueueBundle]] = Seq.fill(param.laneNumber)( Queue.io(new LSUWriteQueueBundle(param), param.toVRFWriteQueueSize, flow = true) ) @@ -253,20 +262,38 @@ class LSU(param: LSUParameter) extends Module { @public val lsuProbe = IO(Output(Probe(new LSUProbe(param), layers.Verification))) + // todo: require all shifter same as head + val readLatency: Int = param.vrfReadLatency + param.lsuReadShifterSize.head * 2 + val otherUnitTargetQueue: QueueIO[UInt] = Queue.io(UInt(param.laneNumber.W), 2 * readLatency, pipe = true) + val otherUnitDataQueueVec: Seq[QueueIO[UInt]] = Seq.fill(param.laneNumber)( + Queue.io(UInt(param.datapathWidth.W), readLatency, flow = true) + ) + val dataDeqFire: UInt = Wire(UInt(param.laneNumber.W)) // read vrf - val otherTryReadVrf: UInt = Mux(otherUnit.vrfReadDataPorts.valid, otherUnit.status.targetLane, 0.U) + val otherTryReadVrf: UInt = Mux(otherUnit.vrfReadDataPorts.valid, otherUnit.status.targetLane, 0.U) vrfReadDataPorts.zipWithIndex.foreach { case (read, index) => read.valid := otherTryReadVrf(index) || storeUnit.vrfReadDataPorts(index).valid read.bits := Mux(otherTryReadVrf(index), otherUnit.vrfReadDataPorts.bits, storeUnit.vrfReadDataPorts(index).bits) 
storeUnit.vrfReadDataPorts(index).ready := read.ready && !otherTryReadVrf(index) storeUnit.vrfReadResults(index) := vrfReadResults(index) + storeUnit.vrfReadResults(index).valid := vrfReadResults(index).valid && otherUnitTargetQueue.empty + + val otherUnitQueue: QueueIO[UInt] = otherUnitDataQueueVec(index) + otherUnitQueue.enq.valid := vrfReadResults(index).valid && !otherUnitTargetQueue.empty + otherUnitQueue.enq.bits := vrfReadResults(index).bits + otherUnitQueue.deq.ready := dataDeqFire(index) } - otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR - val pipeOtherRead: ValidIO[UInt] = - Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency) - // todo: read data reorder - otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits)) - otherUnit.vrfReadResults.valid := pipeOtherRead.valid + otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR && + otherUnitTargetQueue.enq.ready + otherUnitTargetQueue.enq.bits := otherUnit.status.targetLane + otherUnitTargetQueue.enq.valid := otherUnit.vrfReadDataPorts.fire + + // read data reorder + otherUnit.vrfReadResults.bits := Mux1H(otherUnitTargetQueue.deq.bits, otherUnitDataQueueVec.map(_.deq.bits)) + otherUnit.vrfReadResults.valid := otherUnitTargetQueue.deq.valid && + (otherUnitTargetQueue.deq.bits & VecInit(otherUnitDataQueueVec.map(_.deq.valid)).asUInt).orR + dataDeqFire := maskAnd(otherUnit.vrfReadResults.valid, otherUnitTargetQueue.deq.bits) + otherUnitTargetQueue.deq.ready := otherUnit.vrfReadResults.valid // write vrf val otherTryToWrite: UInt = Mux(otherUnit.vrfWritePort.valid, otherUnit.status.targetLane, 0.U) diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala index bc517d5eb..7ebd0b671 100644 --- a/t1/src/lsu/SimpleAccessUnit.scala +++ b/t1/src/lsu/SimpleAccessUnit.scala @@ -50,6 +50,7 @@ case class MSHRParam( 
laneNumber: Int, paWidth: Int, lsuTransposeSize: Int, + lsuReadShifter: Int, vrfReadLatency: Int) { /** see [[LaneParameter.lmulMax]] */ @@ -124,6 +125,9 @@ case class MSHRParam( // outstanding of StoreUnit.vrfReadDataPorts // todo: param from T1Param val storeUnitReadOutStanding: Int = 8 + + // One round trip is required + val lsuReadShifterLatency: Int = 2 * lsuReadShifter } /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it @@ -705,9 +709,9 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { // Reading vrf may take multiple cycles and requires additional information to be stored val s1EnqQueue: QueueIO[SimpleAccessStage1] = - Queue.io(new SimpleAccessStage1(param), param.vrfReadLatency + 2) + Queue.io(new SimpleAccessStage1(param), param.vrfReadLatency + param.lsuReadShifterLatency + 2) val s1EnqDataQueue: QueueIO[UInt] = - Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency + 2) + Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency + param.lsuReadShifterLatency + 2) /** which byte to access in VRF, e.g. VLEN=1024,datapath=32,laneNumber=8 XXXXXXXXXX <- 10 bits for element(32bits) * index XX <- 2 bits for SEW XXXXXXXXXX <- strip MSB for the constraint that sew*vlmax <= 8*VLEN <- From 08bed56aaba6c6cafe493e76b0f256c0a4323d0f Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Wed, 25 Dec 2024 14:06:39 +0800 Subject: [PATCH 25/41] [rtl] fix read in mask unit. 
--- t1/src/Bundles.scala | 5 +++++ t1/src/mask/BitLevelMaskWrite.scala | 4 ++-- t1/src/mask/MaskUnit.scala | 26 +++++++++++++++----------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index d59a97d75..9b8985ec4 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -794,3 +794,8 @@ class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle { class LaneTokenBundle extends Bundle { val maskRequestRelease: Bool = Input(Bool()) } + +class MaskUnitReadPipe(parameter: T1Parameter) extends Bundle { + val readSource: UInt = UInt(parameter.laneNumber.W) + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala index 5bc73491e..5e30675d1 100644 --- a/t1/src/mask/BitLevelMaskWrite.scala +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -18,7 +18,7 @@ class BitLevelWriteRequest(parameter: T1Parameter) extends Bundle { class BitLevelMaskWrite(parameter: T1Parameter) extends Module { // todo - val readVRFLatency: Int = 2 + val readVRFLatency: Int = 4 val needWAR: Bool = IO(Input(Bool())) val vd: UInt = IO(Input(UInt(5.W))) @@ -68,7 +68,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) - val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid && readResult(index).valid + val readValidPipe = readResult(index).valid val readResultValid = !needWAR || readValidPipe val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 641ea9bd4..cacc18c08 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -130,7 +130,7 @@ class MaskUnit(val parameter: T1Parameter) // todo: param 
val readQueueSize: Int = 4 - val readVRFLatency: Int = 2 + val readVRFLatency: Int = 3 val maskUnitWriteQueueSize: Int = 8 /** duplicate v0 for mask */ @@ -758,13 +758,15 @@ class MaskUnit(val parameter: T1Parameter) val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W))) readCrossBar.output.zipWithIndex.foreach { case (request, index) => + val readMessageQueue: QueueIO[MaskUnitReadPipe] = + Queue.io(new MaskUnitReadPipe(parameter), readVRFLatency + 4) val sourceLane = UIntToOH(request.bits.writeIndex) - readChannel(index).valid := request.valid + readChannel(index).valid := request.valid && readMessageQueue.enq.ready readChannel(index).bits.readSource := 2.U readChannel(index).bits.vs := request.bits.vs readChannel(index).bits.offset := request.bits.offset readChannel(index).bits.instructionIndex := instReg.instructionIndex - request.ready := readChannel(index).ready + request.ready := readChannel(index).ready && readMessageQueue.enq.ready maskedWrite.readChannel(index).ready := readChannel(index).ready maskedWrite.readResult(index) := readResult(index) @@ -774,15 +776,17 @@ class MaskUnit(val parameter: T1Parameter) readChannel(index).bits.offset := maskedWrite.readChannel(index).bits.offset } - // pipe read fire - val pipeRead = Pipe( - readChannel(index).fire && !maskDestinationType, - sourceLane, - readVRFLatency + readMessageQueue.enq.valid := readChannel(index).fire && !maskDestinationType + readMessageQueue.enq.bits.readSource := sourceLane + readMessageQueue.enq.bits.dataOffset := request.bits.dataOffset + readMessageQueue.deq.ready := readResult(index).valid + + write1HPipe(index) := Mux( + readMessageQueue.deq.valid && readResult(index).valid, + readMessageQueue.deq.bits.readSource, + 0.U(parameter.laneNumber.W) ) - val pipeOffset = Pipe(readChannel(index).fire, request.bits.dataOffset, readVRFLatency) - write1HPipe(index) := Mux(pipeRead.valid, pipeRead.bits, 0.U(parameter.laneNumber.W)) - 
pipeDataOffset(index) := pipeOffset.bits + pipeDataOffset(index) := readMessageQueue.deq.bits.dataOffset } // Processing read results From f35fe1a2d9ac09524b36f218be0b7a4a2aef3a57 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Wed, 25 Dec 2024 16:23:56 +0800 Subject: [PATCH 26/41] [rtl] add write release for mask unit. --- t1/src/T1.scala | 3 ++- t1/src/lsu/LSU.scala | 3 +++ t1/src/mask/MaskUnit.scala | 17 ++++++++++++++--- t1/src/package.scala | 9 ++++++++- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index fe93a1999..5d06b4c53 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -773,7 +773,8 @@ class T1(val parameter: T1Parameter) )( VecInit(Seq(maskUnit.io.exeResp(index), lsu.vrfWritePort(index))), lane.vrfWriteChannel, - 0 + 0, + releaseSource = Some(Seq(maskUnit.io.writeRelease(index), lsu.writeRelease(index))) ) lane.writeFromMask := maskUnit.io.exeResp(index).fire diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index e3d3b1d2a..9df2e0213 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -168,6 +168,9 @@ class LSU(param: LSUParameter) extends Module { ) ) + @public + val writeRelease: Vec[Bool] = IO(Vec(param.laneNumber, Input(Bool()))) + @public val dataInWriteQueue: Vec[UInt] = IO(Output(Vec(param.laneNumber, UInt((2 * param.chainingSize).W)))) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index cacc18c08..a1905a8b8 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -56,6 +56,7 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) + val writeRelease: Vec[Bool] = Vec(parameter.laneNumber, Input(Bool())) val tokenIO: Vec[LaneTokenBundle] = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle)) val readChannel: Vec[DecoupledIO[VRFReadRequest]] = Vec( parameter.laneNumber, @@ -1094,7 +1095,7 @@ class MaskUnit(val parameter: T1Parameter) Queue.io(new MaskUnitExeResponse(parameter.laneParam), maskUnitWriteQueueSize) } - 
writeQueue.zipWithIndex.foreach { case (queue, index) => + val dataNotInShifter: Bool = writeQueue.zipWithIndex.map { case (queue, index) => val readTypeWriteVrf: Bool = waiteStageDeqFire && WillWriteLane(index) queue.enq.valid := maskedWrite.out(index).valid || readTypeWriteVrf maskedWrite.out(index).ready := queue.enq.ready @@ -1117,7 +1118,17 @@ class MaskUnit(val parameter: T1Parameter) parameter.laneParam.vrfOffsetBits ) writePort.bits.offset := queue.deq.bits.writeData.groupCounter - } + + val writeTokenSize = 8 + val writeTokenWidth = log2Ceil(writeTokenSize) + val writeTokenCounter = RegInit(0.U(writeTokenWidth.W)) + + val writeTokenChange = Mux(writePort.fire, 1.U(writeTokenWidth.W), -1.S(writeTokenWidth.W).asUInt) + when(writePort.fire ^ io.writeRelease(index)) { + writeTokenCounter := writeTokenCounter + writeTokenChange + } + writeTokenCounter === 0.U + }.reduce(_ && _) waiteStageDeqReady := writeQueue.zipWithIndex.map { case (queue, index) => !WillWriteLane(index) || queue.enq.ready }.reduce(_ && _) @@ -1126,7 +1137,7 @@ class MaskUnit(val parameter: T1Parameter) // todo: token val waiteLastRequest: Bool = RegInit(false.B) val waitQueueClear: Bool = RegInit(false.B) - val lastReportValid = waitQueueClear && !writeQueue.map(_.deq.valid).reduce(_ || _) + val lastReportValid = waitQueueClear && !writeQueue.map(_.deq.valid).reduce(_ || _) && dataNotInShifter when(lastReportValid) { waitQueueClear := false.B waiteLastRequest := false.B diff --git a/t1/src/package.scala b/t1/src/package.scala index 7b648bc75..06cfba148 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -282,7 +282,8 @@ package object rtl { sink: DecoupledIO[T], arb: Int, dataAck: Option[UInt] = None, - dataToSource: Option[Seq[ValidIO[UInt]]] = None + dataToSource: Option[Seq[ValidIO[UInt]]] = None, + releaseSource: Option[Seq[Bool]] = None ): Unit = { val sinkVec: Vec[DecoupledIO[T]] = VecInit(sourceVec.zipWithIndex.map { case (source, index) => val sinkWire: DecoupledIO[T] = 
Wire(Decoupled(chiselTypeOf(source.bits))) @@ -303,6 +304,12 @@ package object rtl { connectWithShifter(latencyVec(index))(accessDataSource, sourceData) } } + releaseSource.foreach { sourceVec => + sourceVec.zipWithIndex.foreach { case (release, index) => + val sinkRequest = sinkVec(index) + release := Pipe(sinkRequest.fire, 0.U.asTypeOf(new EmptyBundle), latencyVec(index)).valid + } + } } def instantiateVFU( From 66e76ffd37938f0a79b81b8552a9714c8a2a3563 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Wed, 25 Dec 2024 16:59:06 +0800 Subject: [PATCH 27/41] [rtl] Release write queue counter after write release. --- t1/src/lsu/LSU.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 9df2e0213..4d8f03f44 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -358,13 +358,20 @@ class LSU(param: LSUParameter) extends Module { ) // Record whether there is data for the corresponding instruction in the queue - writeQueueVec.zip(dataInWriteQueue).foreach { case (q, p) => + writeQueueVec.zip(dataInWriteQueue).zipWithIndex.foreach { case ((q, p), i) => val queueCount: Seq[UInt] = Seq.tabulate(2 * param.chainingSize) { _ => RegInit(0.U(log2Ceil(param.toVRFWriteQueueSize).W)) } val enqOH: UInt = indexToOH(q.enq.bits.data.instructionIndex, param.chainingSize) val queueEnq: UInt = Mux(q.enq.fire, enqOH, 0.U) - val queueDeq = Mux(q.deq.fire, indexToOH(q.deq.bits.data.instructionIndex, param.chainingSize), 0.U) + + val writeTokenSize = 8 + val writeIndexQueue = Queue.io(UInt(param.instructionIndexBits.W), writeTokenSize) + writeIndexQueue.enq.valid := q.deq.fire + writeIndexQueue.enq.bits := q.deq.bits.data.instructionIndex + writeIndexQueue.deq.ready := writeRelease(i) + + val queueDeq = Mux(writeIndexQueue.deq.fire, indexToOH(writeIndexQueue.deq.bits, param.chainingSize), 0.U) queueCount.zipWithIndex.foreach { case (count, index) => val counterUpdate: UInt = Mux(queueEnq(index), 1.U, 
-1.S(log2Ceil(param.toVRFWriteQueueSize).W).asUInt) when(queueEnq(index) ^ queueDeq(index)) { From 5fd93cee95450c81fd2294f2a9e9c64538b97d49 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 26 Dec 2024 11:43:56 +0800 Subject: [PATCH 28/41] [rtl] The commit of the load instruction needs to wait for the confirmation of the lane. --- t1/src/T1.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 5d06b4c53..2320fa37c 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -486,7 +486,7 @@ class T1(val parameter: T1Parameter) val readOnlyInstruction: Bool = decodeResult(Decoder.readOnly) // 只进mask unit的指令 val maskUnitInstruction: Bool = (decodeResult(Decoder.slid) || decodeResult(Decoder.mv)) - val skipLastFromLane: Bool = isLoadStoreType || maskUnitInstruction || readOnlyInstruction + val skipLastFromLane: Bool = isStoreType || maskUnitInstruction || readOnlyInstruction val instructionValid: Bool = requestReg.bits.issue.vl > requestReg.bits.issue.vstart // TODO: these should be decoding results From a6ca15f3d5e6e728433de63a8f4979b28396800f Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 26 Dec 2024 13:06:39 +0800 Subject: [PATCH 29/41] [rtl] pipe writeCount. 
--- t1/src/T1.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 2320fa37c..1fa8c62c8 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -800,10 +800,10 @@ class T1(val parameter: T1Parameter) lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index) // 2 + 3 = 5 - val rowWith: Int = log2Ceil(parameter.datapathWidth / 8) + log2Ceil(parameter.laneNumber) - lane.writeCount := - (requestReg.bits.writeByte >> rowWith).asUInt + - (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) + val rowWith: Int = log2Ceil(parameter.datapathWidth / 8) + log2Ceil(parameter.laneNumber) + val writeCounter: UInt = (requestReg.bits.writeByte >> rowWith).asUInt + + (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) + lane.writeCount := Pipe(true.B, writeCounter, parameter.laneRequestShifterSize(index)).bits // token manager tokenManager.instructionFinish(index) := instructionFinishedPipe From 1f642358b4132ce5f59615c4561a2542e9ec4ee7 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 26 Dec 2024 14:27:52 +0800 Subject: [PATCH 30/41] [rtl] fix mask update. 
--- t1/src/Lane.scala | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 1c8893634..77efe5c8c 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -676,7 +676,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // update mask todo: handle maskRequestFireOH slotMaskRequestVec(index).valid := - record.laneRequest.mask && + record.laneRequest.mask && slotOccupied(index) && ((stage0.enqueue.fire && stage0.updateLaneState.maskExhausted) || !record.mask.valid) slotMaskRequestVec(index).bits := stage0.updateLaneState.maskGroupCount // There are new masks @@ -885,6 +885,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ Queue.io(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true) val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W)) val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt + val slotEnqueueFire: Seq[Bool] = Seq.tabulate(parameter.chainingSize)(_ => Wire(Bool())) // 处理 rf { @@ -964,14 +965,15 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskSelect := Mux1H(maskControlReqSelect, maskControlVec.map(_.group)) maskSelectSew := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew)) maskControlDataDeq := slotMaskRequestVec.zipWithIndex.map { case (req, index) => - val slotIndex = slotControl(index).laneRequest.instructionIndex - val hitMaskControl = VecInit(maskControlVec.map(_.index === slotIndex)).asUInt - val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) - val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) - val group = Mux1H(hitMaskControl, maskControlVec.map(_.group)) - val sameGroup = group === req.bits + val slotIndex = slotControl(index).laneRequest.instructionIndex + val hitMaskControl = VecInit(maskControlVec.map(c => c.index === slotIndex && c.controlValid)).asUInt 
+ val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) + val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) + val group = Mux1H(hitMaskControl, maskControlVec.map(_.group)) + val sameGroup = group === req.bits dontTouch(sameGroup) - val maskRequestFire = req.valid && dataValid + val hitShifter: Bool = if (index == 0) false.B else slotEnqueueFire(index - 1) + val maskRequestFire = req.valid && dataValid && !hitShifter maskRequestFireOH(index) := maskRequestFire maskDataVec(index) := data maskAnd(maskRequestFire, hitMaskControl).asUInt @@ -1079,7 +1081,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ pre || !current } - val slotEnqueueFire: Seq[Bool] = Seq.tabulate(parameter.chainingSize) { slotIndex => + Seq.tabulate(parameter.chainingSize) { slotIndex => val enqueueReady: Bool = Wire(Bool()) val enqueueValid: Bool = Wire(Bool()) val enqueueFire: Bool = enqueueReady && enqueueValid @@ -1092,7 +1094,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskGroupCountVec(slotIndex) := 0.U(parameter.maskGroupSizeBits.W) maskIndexVec(slotIndex) := 0.U(log2Ceil(parameter.maskGroupWidth).W) } - enqueueFire } else { // shifter for slot enqueueValid := slotCanShift(slotIndex + 1) && slotOccupied(slotIndex + 1) @@ -1102,8 +1103,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskGroupCountVec(slotIndex) := maskGroupCountVec(slotIndex + 1) maskIndexVec(slotIndex) := maskIndexVec(slotIndex + 1) } - enqueueFire } + slotEnqueueFire(slotIndex) := enqueueFire } val slotDequeueFire: Seq[Bool] = (slotCanShift.head && slotOccupied.head) +: slotEnqueueFire From d44603c4696da82426903eb7b1c55810ca52d985 Mon Sep 17 00:00:00 2001 From: Qinjun Li <44799832+qinjun-li@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:00:17 +0800 Subject: [PATCH 31/41] [rtl] Pipe input in compress unit. 
--- t1/src/mask/MaskCompress.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 03520db7a..8287f3cad 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -73,7 +73,7 @@ class MaskCompress(val parameter: CompressParam) io.om := omInstance.getPropertyReference omInstance.retimeIn.foreach(_ := Property(Path(io.clock))) - val in = io.in + val in = RegNext(io.in, 0.U.asTypeOf(io.in)) val out = io.out val newInstruction = io.newInstruction val ffoInstruction = io.ffoInstruction From b1462ca929a08e47b3e64dc0c58de980660a5c26 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 26 Dec 2024 18:37:21 +0800 Subject: [PATCH 32/41] [rtl] fix ffo pipe. --- t1/src/mask/MaskCompress.scala | 14 ++++++++++---- t1/src/mask/MaskUnit.scala | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 8287f3cad..4f7f7dd56 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -54,6 +54,7 @@ class MaskCompressInterFace(parameter: CompressParam) extends Bundle { val newInstruction: Bool = Input(Bool()) val ffoInstruction: Bool = Input(Bool()) val writeData: UInt = Output(UInt(parameter.xLen.W)) + val stageValid: Bool = Bool() val om = Output(Property[AnyClassType]()) } @@ -239,10 +240,13 @@ class MaskCompress(val parameter: CompressParam) ) compressMask := Mux(compressTailValid, compressTailMask, (-1.S(out.mask.getWidth.W)).asUInt) + val validInputPipe = initRegEnable(in.bits.validInput, in.fire) + val readFromScalarPipe = initRegEnable(in.bits.readFromScalar, in.fire) + val mvMask = Mux1H(eew1H, Seq(1.U, 3.U, 15.U)) - val mvData = in.bits.readFromScalar + val mvData = readFromScalarPipe - val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, in.bits.validInput) + val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, validInputPipe) 
outWire.data := Mux1H( Seq( @@ -303,6 +307,8 @@ class MaskCompress(val parameter: CompressParam) }.elsewhen(mvRd) { ffoIndex := source1SigExtend } - outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) - out := RegNext(outWire, 0.U.asTypeOf(outWire)) + val ffoOutPipe: UInt = initRegEnable(completedLeftOr | Fill(parameter.laneNumber, ffoValid), in.fire) + outWire.ffoOutput := ffoOutPipe + out := RegNext(outWire, 0.U.asTypeOf(outWire)) + io.stageValid := stage2Valid || in.valid } diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index a1905a8b8..21bec0485 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -1148,7 +1148,7 @@ class MaskUnit(val parameter: T1Parameter) val executeStageInvalid: Bool = Mux1H( unitType(3, 1), Seq( - !compressUnit.io.out.compressValid, + !compressUnit.io.out.compressValid && !compressUnit.io.stageValid, reduceUnit.io.in.ready, true.B ) From 04c0d957cfd23ec6751f046b93f80a9ae8d1bc50 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Sun, 29 Dec 2024 15:27:59 +0800 Subject: [PATCH 33/41] [rtl] reorder read in mask unit. 
--- t1/src/Bundles.scala | 5 ++ t1/src/T1.scala | 2 +- t1/src/mask/MaskUnit.scala | 102 ++++++++++++++++++++++++++++++------- 3 files changed, 91 insertions(+), 18 deletions(-) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index 9b8985ec4..5f25278d6 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -713,6 +713,11 @@ class MaskUnitReadState(parameter: T1Parameter) extends Bundle { val last: Bool = Bool() } +class MaskReadReorderQueue(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val write1H: UInt = UInt(parameter.laneNumber.W) +} + class MaskUnitInstReq(parameter: T1Parameter) extends Bundle { val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 1fa8c62c8..11e3e9b26 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -547,7 +547,7 @@ class T1(val parameter: T1Parameter) parameter.instructionIndexBits ) - val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) + val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && !decodeResult(Decoder.vtype) /** state machine register for each instruction. 
*/ val slots: Seq[InstructionControl] = Seq.tabulate(parameter.chainingSize) { index => diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 21bec0485..fbcf13553 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -458,6 +458,11 @@ class MaskUnit(val parameter: T1Parameter) val readIssueStageState: MaskUnitReadState = RegInit(0.U.asTypeOf(new MaskUnitReadState(parameter))) val readIssueStageValid: Bool = RegInit(false.B) + val accessCountType: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W)) + val accessCountEnq = Wire(accessCountType) + // todo: param 16 + val accessCountQueue = Queue.io(accessCountType, 8) + def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Option[Bool] = None): Seq[UInt] = { val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt val positionSize = parameter.laneParam.vlMaxBits - 1 @@ -622,6 +627,8 @@ class MaskUnit(val parameter: T1Parameter) // todo: param val readDataQueueSize: Int = 8 + // todo: param + val reorderQueueSize: Int = 16 // The queue waiting to read data. This queue contains other information about this group. // 64: todo: max or token? 
@@ -703,18 +710,20 @@ class MaskUnit(val parameter: T1Parameter) val readTypeRequestDeq: Bool = (anyReadFire && groupReadFinish) || (readIssueStageValid && readIssueStageState.needRead === 0.U) - val noSourceValid: Bool = noSource && counterValid && + val noSourceValid: Bool = noSource && counterValid && (instReg.vl.orR || (mvRd && !readVS1Reg.sendToExecution)) - val vs1DataValid: Bool = readVS1Reg.dataValid || !(unitType(2) || compress || mvRd) - val executeReady: Bool = Wire(Bool()) - val executeDeqReady: Bool = VecInit(maskedWrite.in.map(_.ready)).asUInt.andR - val otherTypeRequestDeq: Bool = + val vs1DataValid: Bool = readVS1Reg.dataValid || !(unitType(2) || compress || mvRd) + val executeReady: Bool = Wire(Bool()) + val executeDeqReady: Bool = VecInit(maskedWrite.in.map(_.ready)).asUInt.andR + val otherTypeRequestDeq: Bool = Mux(noSource, noSourceValid, allDataValid) && vs1DataValid && instVlValid && executeDeqReady - val readIssueStageEnq: Bool = + val reorderQueueAllocate: Bool = Wire(Bool()) + val readIssueStageEnq: Bool = (allDataValid || slideAddressGen.indexDeq.valid) && - (readTypeRequestDeq || !readIssueStageValid) && instVlValid && readType - val requestStageDeq: Bool = Mux(readType, readIssueStageEnq, otherTypeRequestDeq && executeReady) + (readTypeRequestDeq || !readIssueStageValid) && instVlValid && readType && + accessCountQueue.enq.ready && reorderQueueAllocate + val requestStageDeq: Bool = Mux(readType, readIssueStageEnq, otherTypeRequestDeq && executeReady) slideAddressGen.indexDeq.ready := readTypeRequestDeq || !readIssueStageValid when(anyReadFire) { readIssueStageState.groupReadState := readStateUpdate @@ -728,6 +737,12 @@ class MaskUnit(val parameter: T1Parameter) when(requestStageDeq && anyDataValid) { executeIndex := executeIndex + executeIndexGrowth } + accessCountEnq.zipWithIndex.foreach { case (d, i) => + d := PopCount(cutUIntBySize(accessLaneSelect, parameter.laneNumber).zipWithIndex.map { + case (accessIndex, sourceIndex) => + 
(accessIndex === i.U) && !notReadSelect(sourceIndex) + }) + } when(readIssueStageEnq) { readIssueStageState.groupReadState := 0.U readIssueStageState.needRead := (~notReadSelect).asUInt @@ -741,8 +756,15 @@ class MaskUnit(val parameter: T1Parameter) readIssueStageState.last := isVlBoundary when(slideAddressGen.indexDeq.fire) { readIssueStageState := slideAddressGen.indexDeq.bits + accessCountEnq.zipWithIndex.foreach { case (d, i) => + d := PopCount(slideAddressGen.indexDeq.bits.accessLane.zipWithIndex.map { case (accessIndex, sourceIndex) => + (accessIndex === i.U) && slideAddressGen.indexDeq.bits.needRead(sourceIndex) + }) + } } } + accessCountQueue.enq.valid := readIssueStageEnq + accessCountQueue.enq.bits := accessCountEnq readWaitQueue.enq.valid := readTypeRequestDeq readWaitQueue.enq.bits.executeGroup := readIssueStageState.executeGroup @@ -754,14 +776,52 @@ class MaskUnit(val parameter: T1Parameter) // last execute group in this request group dequeue lastExecuteGroupDeq := requestStageDeq && isLastExecuteGroup + // handle reorder + val reorderQueueVec: Seq[QueueIO[MaskReadReorderQueue]] = Seq.tabulate(parameter.laneNumber) { _ => + Queue.io(new MaskReadReorderQueue(parameter), reorderQueueSize) + } + + // reorderQueue token + reorderQueueAllocate := Seq + .tabulate(parameter.laneNumber) { i => + val tokenSize = log2Ceil(reorderQueueSize + 1) + val counter = RegInit(0.U(tokenSize.W)) + val release = reorderQueueVec(i).deq.fire + val allocate = Mux(readIssueStageEnq, accessCountEnq(i), 0.U) + when(release || readIssueStageEnq) { + counter := counter + allocate - release + } + // counter if allocate all + val counterWillUpdate = counter + parameter.laneNumber.U(tokenSize.W) + !counterWillUpdate(tokenSize - 1) + } + .reduce(_ && _) + + val reorderStageValid: Bool = RegInit(false.B) + val reorderStageState: Vec[UInt] = RegInit(0.U.asTypeOf(accessCountType)) + val reorderStageNeed: Vec[UInt] = RegInit(0.U.asTypeOf(accessCountType)) + + val stateCheck: Bool = 
reorderStageState === reorderStageNeed + + accessCountQueue.deq.ready := !reorderStageValid || stateCheck + val reorderStageEnqFire: Bool = accessCountQueue.deq.fire + val reorderStageDeqFire: Bool = stateCheck && reorderStageValid + when(reorderStageEnqFire ^ reorderStageDeqFire) { reorderStageValid := reorderStageEnqFire } + when(reorderStageEnqFire) { + reorderStageState := 0.U.asTypeOf(reorderStageState) + reorderStageNeed := accessCountQueue.deq.bits + } + // s1 read vrf - val write1HPipe: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) - val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W))) + val write1HPipe: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + val dataAfterReorderCheck: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) readCrossBar.output.zipWithIndex.foreach { case (request, index) => val readMessageQueue: QueueIO[MaskUnitReadPipe] = Queue.io(new MaskUnitReadPipe(parameter), readVRFLatency + 4) - val sourceLane = UIntToOH(request.bits.writeIndex) + val reorderQueue = reorderQueueVec(index) + val deqAllocate = !readType || reorderStageValid && (reorderStageState(index) =/= reorderStageNeed(index)) + val sourceLane = UIntToOH(request.bits.writeIndex) readChannel(index).valid := request.valid && readMessageQueue.enq.ready readChannel(index).bits.readSource := 2.U readChannel(index).bits.vs := request.bits.vs @@ -782,22 +842,30 @@ class MaskUnit(val parameter: T1Parameter) readMessageQueue.enq.bits.dataOffset := request.bits.dataOffset readMessageQueue.deq.ready := readResult(index).valid - write1HPipe(index) := Mux( - readMessageQueue.deq.valid && readResult(index).valid, - readMessageQueue.deq.bits.readSource, + reorderQueue.enq.valid := readResult(index).valid + reorderQueue.enq.bits.data := readResult(index).bits >> (readMessageQueue.deq.bits.dataOffset ## 0.U(3.W)) + reorderQueue.enq.bits.write1H := 
readMessageQueue.deq.bits.readSource + + reorderQueue.deq.ready := deqAllocate + write1HPipe(index) := Mux( + reorderQueue.deq.fire, + reorderQueue.deq.bits.write1H, 0.U(parameter.laneNumber.W) ) - pipeDataOffset(index) := readMessageQueue.deq.bits.dataOffset + dataAfterReorderCheck(index) := reorderQueue.deq.bits.data + when(reorderQueue.deq.fire && readType) { + reorderStageState(index) := reorderStageState(index) + 1.U + } } // Processing read results val readData: Seq[DecoupledIO[UInt]] = Seq.tabulate(parameter.laneNumber) { index => val readDataQueue = Queue.io(UInt(parameter.datapathWidth.W), readDataQueueSize, flow = true) val readResultSelect = VecInit(write1HPipe.map(_(index))).asUInt - val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) + val data: UInt = Mux1H(readResultSelect, dataAfterReorderCheck) readTokenRelease(index) := readDataQueue.deq.fire readDataQueue.enq.valid := readResultSelect.orR - readDataQueue.enq.bits := Mux1H(readResultSelect, readResult.map(_.bits)) >> (dataOffset ## 0.U(3.W)) + readDataQueue.enq.bits := data readDataQueue.deq } From 0f24aaf9b6ad0a760bbfeb425d97fcd77b3bf59a Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 10:27:02 +0800 Subject: [PATCH 34/41] [rtl] fix mask destination type. --- t1/src/mask/MaskUnit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index fbcf13553..ec4d1c22f 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -848,7 +848,7 @@ class MaskUnit(val parameter: T1Parameter) reorderQueue.deq.ready := deqAllocate write1HPipe(index) := Mux( - reorderQueue.deq.fire, + reorderQueue.deq.fire && !maskDestinationType, reorderQueue.deq.bits.write1H, 0.U(parameter.laneNumber.W) ) From 24b4334f90cb896e84ace37f5f33e5dfab4e25f6 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 11:24:01 +0800 Subject: [PATCH 35/41] [rtl] fix reorderQueueAllocate timing. 
--- t1/src/mask/MaskUnit.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index ec4d1c22f..9f881f54b 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -786,13 +786,15 @@ class MaskUnit(val parameter: T1Parameter) .tabulate(parameter.laneNumber) { i => val tokenSize = log2Ceil(reorderQueueSize + 1) val counter = RegInit(0.U(tokenSize.W)) + val counterWillUpdate = RegInit(0.U(tokenSize.W)) val release = reorderQueueVec(i).deq.fire val allocate = Mux(readIssueStageEnq, accessCountEnq(i), 0.U) + val counterUpdate = counter + allocate - release when(release || readIssueStageEnq) { - counter := counter + allocate - release + counter := counterUpdate + // counter if allocate all + counterWillUpdate := counterUpdate + parameter.laneNumber.U(tokenSize.W) } - // counter if allocate all - val counterWillUpdate = counter + parameter.laneNumber.U(tokenSize.W) !counterWillUpdate(tokenSize - 1) } .reduce(_ && _) From 217d62efdedd571c92b02b7e0e295b1b35787848 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 13:39:17 +0800 Subject: [PATCH 36/41] [rtl] fix slide. 
--- t1/src/mask/MaskUnit.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 9f881f54b..e04726d40 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -724,7 +724,8 @@ class MaskUnit(val parameter: T1Parameter) (readTypeRequestDeq || !readIssueStageValid) && instVlValid && readType && accessCountQueue.enq.ready && reorderQueueAllocate val requestStageDeq: Bool = Mux(readType, readIssueStageEnq, otherTypeRequestDeq && executeReady) - slideAddressGen.indexDeq.ready := readTypeRequestDeq || !readIssueStageValid + slideAddressGen.indexDeq.ready := (readTypeRequestDeq || !readIssueStageValid) && + accessCountQueue.enq.ready && reorderQueueAllocate when(anyReadFire) { readIssueStageState.groupReadState := readStateUpdate } From 2aa1386254a37c66315a51d837f9c1778af36186 Mon Sep 17 00:00:00 2001 From: Qinjun Li <44799832+qinjun-li@users.noreply.github.com> Date: Sun, 29 Dec 2024 15:46:38 +0800 Subject: [PATCH 37/41] [rtl] fix reorder queue size. --- t1/src/mask/MaskUnit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index e04726d40..1b614cd8a 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -628,7 +628,7 @@ class MaskUnit(val parameter: T1Parameter) // todo: param val readDataQueueSize: Int = 8 // todo: param - val reorderQueueSize: Int = 16 + val reorderQueueSize: Int = 2 * parameter.laneNumber // The queue waiting to read data. This queue contains other information about this group. // 64: todo: max or token? From 7d813ed3a750468f605bfd12b1b43f1382ea080c Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 15:54:28 +0800 Subject: [PATCH 38/41] [rtl] fix release for reorder counter. 
--- t1/src/mask/MaskUnit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 1b614cd8a..b46ca9c35 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -788,7 +788,7 @@ class MaskUnit(val parameter: T1Parameter) val tokenSize = log2Ceil(reorderQueueSize + 1) val counter = RegInit(0.U(tokenSize.W)) val counterWillUpdate = RegInit(0.U(tokenSize.W)) - val release = reorderQueueVec(i).deq.fire + val release = reorderQueueVec(i).deq.fire && readType val allocate = Mux(readIssueStageEnq, accessCountEnq(i), 0.U) val counterUpdate = counter + allocate - release when(release || readIssueStageEnq) { From 3a4b475356669e7e2563653ae88a203c22c146fb Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 17:47:58 +0800 Subject: [PATCH 39/41] [rtl] fix mask read. --- t1/src/mask/MaskUnit.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index b46ca9c35..7415dc4d6 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -256,6 +256,7 @@ class MaskUnit(val parameter: T1Parameter) val ffo: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b0111?") val extendType: Bool = unitType(3) && (subType(2) || subType(1)) val pop: Bool = instReg.decodeResult(Decoder.popCount) + val readValid: Bool = readType && instVlValid // Instructions for writing vd without source val noSource: Bool = mv || viota @@ -458,7 +459,7 @@ class MaskUnit(val parameter: T1Parameter) val readIssueStageState: MaskUnitReadState = RegInit(0.U.asTypeOf(new MaskUnitReadState(parameter))) val readIssueStageValid: Bool = RegInit(false.B) - val accessCountType: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W)) + val accessCountType: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber + 1).W)) val accessCountEnq = Wire(accessCountType) // todo: param 16 val 
accessCountQueue = Queue.io(accessCountType, 8) @@ -788,7 +789,7 @@ class MaskUnit(val parameter: T1Parameter) val tokenSize = log2Ceil(reorderQueueSize + 1) val counter = RegInit(0.U(tokenSize.W)) val counterWillUpdate = RegInit(0.U(tokenSize.W)) - val release = reorderQueueVec(i).deq.fire && readType + val release = reorderQueueVec(i).deq.fire && readValid val allocate = Mux(readIssueStageEnq, accessCountEnq(i), 0.U) val counterUpdate = counter + allocate - release when(release || readIssueStageEnq) { @@ -823,7 +824,7 @@ class MaskUnit(val parameter: T1Parameter) val readMessageQueue: QueueIO[MaskUnitReadPipe] = Queue.io(new MaskUnitReadPipe(parameter), readVRFLatency + 4) val reorderQueue = reorderQueueVec(index) - val deqAllocate = !readType || reorderStageValid && (reorderStageState(index) =/= reorderStageNeed(index)) + val deqAllocate = !readValid || reorderStageValid && (reorderStageState(index) =/= reorderStageNeed(index)) val sourceLane = UIntToOH(request.bits.writeIndex) readChannel(index).valid := request.valid && readMessageQueue.enq.ready readChannel(index).bits.readSource := 2.U From 9fe9a7abf4f65a8dcfa146798178b6df486f8855 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 18:44:51 +0800 Subject: [PATCH 40/41] [rtl] fix float reduce. 
--- t1/src/mask/MaskReduce.scala | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala index d479b064b..d0abe2598 100644 --- a/t1/src/mask/MaskReduce.scala +++ b/t1/src/mask/MaskReduce.scala @@ -139,7 +139,8 @@ class MaskReduce(val parameter: MaskReduceParameter) when(stateWait) { waitCount := waitCount + 1.U } val resFire: Bool = stateWait && waitCount === (floatAdderLatency - 1).U updateResult := - stateLast || ((stateCross || stateOrder) && sourceValid && !floatAdd) || resFire + stateLast || (stateCross && sourceValid && !floatAdd) || (resFire && sourceValid) + val waiteDeq: Bool = stateWait && resFire // state update in.ready := stateIdle @@ -159,20 +160,18 @@ class MaskReduce(val parameter: MaskReduceParameter) } } - when(stateWait && resFire) { + when(waiteDeq) { when(groupLastReduce) { state := Mux(reqReg.lastGroup && needFold, lastFold, idle) outValid := reqReg.lastGroup && !needFold }.otherwise { - state := crossFold + state := Mux(order, orderRed, crossFold) } } when(stateOrder) { - when(groupLastReduce) { - state := idle - outValid := reqReg.lastGroup - } + state := waitRes + waitCount := 0.U } when(stateLast) { @@ -195,7 +194,7 @@ class MaskReduce(val parameter: MaskReduceParameter) // count update // todo: stateCross <=> stateOrder ?? - when(stateCross || stateOrder || in.fire) { + when((stateCross && !floatType) || waiteDeq || in.fire) { crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U) } From cf0eaf1220bc6fafffd9ee694750a6ce8aa153d7 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Mon, 30 Dec 2024 19:59:29 +0800 Subject: [PATCH 41/41] [rtl] fix float reduce & compress. 
--- t1/src/mask/MaskCompress.scala | 10 ++++++++-- t1/src/mask/MaskReduce.scala | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 4f7f7dd56..cb381958f 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -184,6 +184,12 @@ class MaskCompress(val parameter: CompressParam) changeUIntSize(compressInitPipe, maxCountWidth) } + val tailCountForMask: UInt = { + val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 + val maxCountWidth = log2Ceil(minElementSizePerSet) + changeUIntSize(compressInit, maxCountWidth) + } + val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) val compressTailValid: Bool = RegInit(false.B) val compressWriteGroupCount: UInt = RegInit(0.U(parameter.groupNumberBits.W)) @@ -232,7 +238,7 @@ class MaskCompress(val parameter: CompressParam) val dataByte = 1 << sewInt val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte VecInit(Seq.tabulate(elementSizePerSet) { elementIndex => - val elementValid = elementIndex.U < tailCount + val elementValid = elementIndex.U < tailCountForMask val elementMask = Fill(dataByte, elementValid) elementMask }).asUInt @@ -310,5 +316,5 @@ class MaskCompress(val parameter: CompressParam) val ffoOutPipe: UInt = initRegEnable(completedLeftOr | Fill(parameter.laneNumber, ffoValid), in.fire) outWire.ffoOutput := ffoOutPipe out := RegNext(outWire, 0.U.asTypeOf(outWire)) - io.stageValid := stage2Valid || in.valid + io.stageValid := stage2Valid || in.valid || compressTailValid } diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala index d0abe2598..ff48579d0 100644 --- a/t1/src/mask/MaskReduce.scala +++ b/t1/src/mask/MaskReduce.scala @@ -194,7 +194,7 @@ class MaskReduce(val parameter: MaskReduceParameter) // count update // todo: stateCross <=> stateOrder ?? 
- when((stateCross && !floatType) || waiteDeq || in.fire) { + when((stateCross && !floatAdd) || waiteDeq || in.fire) { crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U) }